In [1]:
import datetime
import warnings
from collections import Counter
from io import StringIO
import pandas as pd
import numpy as np
import statsmodels.api as sm
from joblib import dump
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, confusion_matrix, roc_curve, auc, \
    roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import datetime
import logging
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

SCORING = {'accuracy': 'accuracy', 'precision': make_scorer(precision_score), 'recall': make_scorer(recall_score),
           'f1': make_scorer(f1_score),
           'AUC': make_scorer(roc_auc_score)}

In [2]:
dfOriginal = pd.read_excel("C:/Users/chels/Desktop/sampledMessages.xlsx")


In [3]:
label_column = 'label'
message_column = 'new_message1'

def mylog():
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    logfile = "./log/" + str(datetime.datetime.now().month) + "-" + str(datetime.datetime.now().day) + "-" + str(
        datetime.datetime.now().hour) + "-" + str(datetime.datetime.now().minute) + \
              os.path.split(__file__)[-1].split(".")[0] + '.log'
    fileHandler = logging.FileHandler(logfile, mode='w', encoding='UTF-8')
    fileHandler.setLevel(logging.NOTSET)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fileHandler.setFormatter(formatter)
    logger.addHandler(fileHandler)
    return logger

# Define the preprocess_text function to perform text preprocessing steps
def preprocess_text(text):
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]

    # Perform stemming
    stemmer = PorterStemmer()
    stemmed_text = [stemmer.stem(word) for word in filtered_text]

    # Join the processed words back into a single string
    preprocessed_text = ' '.join(stemmed_text)
    
    return preprocessed_text

dfOriginal['new_message1'] = dfOriginal['new_message1'].apply(preprocess_text)
originalTrain, originalTest = train_test_split(dfOriginal, test_size=0.15, random_state=42)
vectorizer = TfidfVectorizer()
X_trainOriginal = vectorizer.fit_transform(originalTrain['new_message1'])
X_testOriginal = vectorizer.transform(originalTest['new_message1'])
y_trainOriginal = originalTrain[label_column]
y_testOriginal = originalTest[label_column]

In [4]:
def get_score_by_grid(grid: GridSearchCV):
    print("GridSearchCV is complate!")
    accuRank = grid.cv_results_['rank_test_accuracy']
    preMean = grid.cv_results_['mean_test_precision']
    bestParam = grid.cv_results_['params']
    bestIndex = grid.best_index_
    i = bestIndex
    rank = 1

    while preMean[i] < 0.5:
        rank += 1
        indx = 0
        if rank > 20:
            break
        for num in accuRank:
            if num == rank:
                i = indx
                break
            indx += 1
    bestIndex = i

    res = "refit by:" + str(grid.refit) + " Parameters: " + str(bestParam[bestIndex])
    #logger.info(res)
   # print(res)
    return bestParam[bestIndex]

def KNNClassifier(trainFeatures, trainLabels):
    model = KNeighborsClassifier()
    fold = KFold(n_splits=10, random_state=5, shuffle=True)
    parameter = {'n_neighbors': np.arange(1, 10, 1),
                 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
                 }
    grid = GridSearchCV(estimator=model, param_grid=parameter, cv=fold,
                        scoring=SCORING, refit="accuracy", n_jobs=25)
    grid.fit(trainFeatures, trainLabels)
    bestParameter = get_score_by_grid(grid)
    print("KNN Best using %s " % (bestParameter))
    model = KNeighborsClassifier(n_neighbors=bestParameter['n_neighbors'], algorithm=bestParameter['algorithm'])
    return model


In [5]:
#Generating model results for each project
KNNModelOriginal = KNNClassifier(X_trainOriginal,y_trainOriginal)
KNNModelOriginal.fit(X_trainOriginal, y_trainOriginal)
predictionsOriginal=KNNModelOriginal.predict(X_testOriginal)
accuracyOriginal = accuracy_score(y_testOriginal, predictionsOriginal)
print("Accuracy Overall for Original Dataset:", accuracyOriginal)

GridSearchCV is complate!
KNN Best using {'algorithm': 'auto', 'n_neighbors': 1} 
Accuracy Overall for Original Dataset: 0.592741935483871


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [6]:
def RFClassifier(trainFeatures, trainLabels):
    model = RandomForestClassifier()
    fold = KFold(n_splits=10, random_state=5, shuffle=True)
    tree_param_grid = {'max_features': [28, 150, 768, 772],
                       'min_samples_split': [i for i in np.arange(7, 16, 1)],
                       'n_estimators': list(range(50, 100, 20))
                       }
    grid = GridSearchCV(estimator=model, param_grid=tree_param_grid,
                        scoring=SCORING, refit="accuracy", n_jobs=25, cv=fold)
    grid.fit(trainFeatures, trainLabels)
    bestParameter = get_score_by_grid(grid)
    print("RForest Best using %s " % (bestParameter))
    model = RandomForestClassifier(max_features=bestParameter['max_features'],
                                   min_samples_split=bestParameter['min_samples_split'],
                                   n_estimators=bestParameter['n_estimators'])
    return model

In [7]:
RandForestModelOriginal = RFClassifier(X_trainOriginal ,y_trainOriginal)
RandForestModelOriginal.fit(X_trainOriginal, y_trainOriginal)
predictionsOriginal=RandForestModelOriginal.predict(X_testOriginal)
accuracyOriginal = accuracy_score(y_testOriginal, predictionsOriginal)
print("Accuracy Overall for Original Dataset:", accuracyOriginal)

 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


GridSearchCV is complate!
RForest Best using {'max_features': 28, 'min_samples_split': 7, 'n_estimators': 50} 
Accuracy Overall for Original Dataset: 0.782258064516129


In [8]:
def LRClassifier(trainFeatures, trainLabels):
    parameters = {'C': np.linspace(0.0001, 20, 20),
                  'random_state': np.arange(1, 5),
                  'solver': ["newton-cg", "lbfgs", "liblinear", "sag"],
                  'multi_class': ['ovr'],
                  'dual': [False],
                  'verbose': [False],
                  'max_iter': [500]
                  }
    fold = KFold(n_splits=10, random_state=5, shuffle=True)
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid = GridSearchCV(LogisticRegression(), parameters, scoring=SCORING, refit="accuracy", cv=fold, n_jobs=25)
        grid.fit(trainFeatures, trainLabels)
    
    bestParameter = get_score_by_grid(grid)
    print("LR Best: using %s " % (bestParameter))
    model = LogisticRegression(C=bestParameter['C'], random_state=bestParameter['random_state'],
                               solver=bestParameter['solver'], multi_class='ovr', dual=False, verbose=False,
                               max_iter=500)
    return model



In [9]:
LRModelOriginal = LRClassifier(X_trainOriginal, y_trainOriginal)
LRModelOriginal.fit(X_trainOriginal, y_trainOriginal)
predictionsOriginal = LRModelOriginal.predict(X_testOriginal)
accuracyOriginal = accuracy_score(y_testOriginal, predictionsOriginal)
print("Accuracy Overall for Original:", accuracyOriginal)

GridSearchCV is complate!
LR Best: using {'C': 0.0001, 'dual': False, 'max_iter': 500, 'multi_class': 'ovr', 'random_state': 1, 'solver': 'newton-cg', 'verbose': False} 
Accuracy Overall for Original: 0.6209677419354839


In [10]:
import warnings
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score

# Set the maximum number of words to consider in the text
MAX_WORDS = 10000
# Set the maximum length of each input sequence
MAX_SEQUENCE_LENGTH = 100

# Convert the text data into sequences of integers
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(dfOriginal['new_message1'])


sequencesOriginal = tokenizer.texts_to_sequences(dfOriginal['new_message1'])


# Pad the sequences to have the same length
padded_sequencesOriginal = pad_sequences(sequencesOriginal, maxlen=MAX_SEQUENCE_LENGTH)


# Split the data into training and test sets
X_trainOriginal, X_testOriginal, y_trainOriginal, y_testOriginal = train_test_split(
    padded_sequencesOriginal, dfOriginal[label_column], test_size=0.15, random_state=42)


# Define the Bi-LSTM model
modelOriginal = Sequential()
modelOriginal.add(Embedding(MAX_WORDS, 128, input_length=MAX_SEQUENCE_LENGTH))
modelOriginal.add(Bidirectional(LSTM(64, return_sequences=True)))
modelOriginal.add(Bidirectional(LSTM(64)))
modelOriginal.add(Dense(1, activation='sigmoid'))



# Compile the models
modelOriginal.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


# Train the models
modelOriginal.fit(X_trainOriginal, y_trainOriginal, epochs=10, batch_size=32, verbose=2)


# Evaluate the models on the test sets
predictionsOriginal = (modelOriginal.predict(X_testOriginal) > 0.5).astype(int)
accuracyOriginal = accuracy_score(y_testOriginal, predictionsOriginal)




# Display the results
print("Accuracy Overall for Original Dataset:", accuracyOriginal)


Epoch 1/10
44/44 - 10s - loss: -8.6005e-01 - accuracy: 0.0414 - 10s/epoch - 229ms/step
Epoch 2/10
44/44 - 5s - loss: -6.3010e+00 - accuracy: 0.3579 - 5s/epoch - 113ms/step
Epoch 3/10
44/44 - 5s - loss: -1.0977e+01 - accuracy: 0.4343 - 5s/epoch - 112ms/step
Epoch 4/10
44/44 - 5s - loss: -1.5422e+01 - accuracy: 0.5021 - 5s/epoch - 112ms/step
Epoch 5/10
44/44 - 5s - loss: -1.8946e+01 - accuracy: 0.5643 - 5s/epoch - 112ms/step
Epoch 6/10
44/44 - 5s - loss: -2.2763e+01 - accuracy: 0.5064 - 5s/epoch - 112ms/step
Epoch 7/10
44/44 - 5s - loss: -2.4098e+01 - accuracy: 0.4129 - 5s/epoch - 111ms/step
Epoch 8/10
44/44 - 5s - loss: -3.0157e+01 - accuracy: 0.5407 - 5s/epoch - 111ms/step
Epoch 9/10
44/44 - 5s - loss: -3.4462e+01 - accuracy: 0.5343 - 5s/epoch - 112ms/step
Epoch 10/10
44/44 - 5s - loss: -3.8442e+01 - accuracy: 0.5386 - 5s/epoch - 112ms/step
Accuracy Overall for Original Dataset: 0.47580645161290325


In [11]:
import warnings
import logging
import os
import datetime
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split, KFold
from torch.nn import functional as F
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import tensorflow as tf
import tensorflow_addons as tfa
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

MAX_SEQUENCE_LENGTH = 100
LABEL_COLUMN = 'label'
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the BERT model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
input_sequencesOriginal = tokenizer.batch_encode_plus(dfOriginal['new_message1'].tolist(), 
                                                    padding='max_length', 
                                                    truncation=True,
                                                    max_length=MAX_SEQUENCE_LENGTH,
                                                    return_tensors='tf')

input_ids_Original = np.array(input_sequencesOriginal['input_ids'])
X_trainOriginal, X_testOriginal, y_trainOriginal, y_testOriginal = train_test_split(
    input_ids_Original, dfOriginal[LABEL_COLUMN], test_size=0.15, random_state=42)
input_layer = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
bert_output = bert_model(input_layer)[0]
pooling_layer = tf.keras.layers.GlobalAveragePooling1D()(bert_output)
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(pooling_layer)

modelOriginal = tf.keras.Model(inputs=input_layer, outputs=output_layer)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metric = tf.keras.metrics.BinaryAccuracy()

modelOriginal.compile(optimizer=optimizer, loss=loss, metrics=[metric])
modelOriginal.fit(X_trainOriginal, y_trainOriginal, epochs=20, batch_size=32, verbose=2)

predictionsOriginal = (modelOriginal.predict(X_testOriginal) > 0.5).astype(int)
accuracyOriginal = accuracy_score(y_testOriginal, predictionsOriginal)
print("Accuracy Overall for Original Dataset:", accuracyOriginal)


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the l

Epoch 1/20


  output, from_logits = _get_logits(


44/44 - 400s - loss: -1.5258e+00 - binary_accuracy: 0.0964 - 400s/epoch - 9s/step
Epoch 2/20
44/44 - 376s - loss: -5.8663e+00 - binary_accuracy: 0.2571 - 376s/epoch - 9s/step
Epoch 3/20
44/44 - 377s - loss: -8.0767e+00 - binary_accuracy: 0.3250 - 377s/epoch - 9s/step
Epoch 4/20
44/44 - 589s - loss: -8.7038e+00 - binary_accuracy: 0.3650 - 589s/epoch - 13s/step
Epoch 5/20
44/44 - 367s - loss: -8.9668e+00 - binary_accuracy: 0.3243 - 367s/epoch - 8s/step
Epoch 6/20
44/44 - 377s - loss: -8.9828e+00 - binary_accuracy: 0.3364 - 377s/epoch - 9s/step
Epoch 7/20
44/44 - 378s - loss: -9.4204e+00 - binary_accuracy: 0.3950 - 378s/epoch - 9s/step
Epoch 8/20
44/44 - 386s - loss: -9.6109e+00 - binary_accuracy: 0.3607 - 386s/epoch - 9s/step
Epoch 9/20
44/44 - 388s - loss: -9.8830e+00 - binary_accuracy: 0.3871 - 388s/epoch - 9s/step
Epoch 10/20
44/44 - 397s - loss: -1.0186e+01 - binary_accuracy: 0.3621 - 397s/epoch - 9s/step
Epoch 11/20
44/44 - 400s - loss: -1.0738e+01 - binary_accuracy: 0.4036 - 400s/e