In [2]:
import pandas as pd, collections
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Define lemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatizing_tokenizer(text):
    return [lemmatizer.lemmatize(word) for word in word_tokenize(text)]

# Set stop words
stops = sum([lemmatizing_tokenizer(word) for word in stopwords.words('english')], [])

In [4]:
# Specify location of folder containing datasets
datasets_location = r'C:\datasets'

# State file name, message name, and class name for each dataset
datasets = ["IMDB Dataset", "Spam_SMS", "fake_job_postings"]
data_message = ["review", "Message", "description"]
data_class = ["sentiment", "Class", "fraudulent"]
positive_class = ["positive", "ham", "0"]
negative_class = ["negative", "spam", "1"]

# Assign data lists
data = [None] * len(datasets)
X = [None] * len(datasets)
y = [None] * len(datasets)
X_train = [None] * len(datasets)
X_test = [None] * len(datasets)
y_train = [None] * len(datasets)
y_test = [None] * len(datasets)

# Read in data from datasets and set train/test split
for i, file in enumerate(datasets):
    data[i] = pd.read_csv(datasets_location + '\\' + file + ".csv", usecols = [data_message[i], data_class[i]])

    X[i] = data[i][data_message[i]].values.astype(str)
    y[i] = data[i][data_class[i]].values.astype(str)

    X_train[i], X_test[i], y_train[i], y_test[i] = train_test_split(X[i], y[i], test_size = 0.7, random_state = 0)

In [6]:
# Naive Bayes

# Assign models
nb_model = [None] * len(datasets)

# Set pipeline for each Naive Bayes Model -- Utilizes word stops and lemmatizer
for i, file in enumerate(datasets):
    nb_model[i] = make_pipeline(TfidfVectorizer(stop_words=stops, token_pattern = None, tokenizer=lemmatizing_tokenizer), MultinomialNB())
    nb_model[i].fit(X_train[i], y_train[i])

In [8]:
# Assign class predictions
nb_y_pred = [None] * len(datasets)

# Test model on test data and output accuracy, classification report, and confusion matrix
for i, file in enumerate(datasets):
    nb_y_pred[i] = nb_model[i].predict(X_test[i])
    print("The accuracy of Naive Bayes on \'" + file + ".csv\' is {}".format(accuracy_score(y_test[i], nb_y_pred[i])))
    print("Classification Report:\n{}".format(classification_report(y_test[i], nb_y_pred[i])))
    print("Confusion Matrix:\n{}\n".format(confusion_matrix(y_test[i], nb_y_pred[i])))

The accuracy of Naive Bayes on 'IMDB Dataset.csv' is 0.8545428571428572
Classification Report:
              precision    recall  f1-score   support

    negative       0.83      0.89      0.86     17468
    positive       0.88      0.82      0.85     17532

    accuracy                           0.85     35000
   macro avg       0.86      0.85      0.85     35000
weighted avg       0.86      0.85      0.85     35000

Confusion Matrix:
[[15546  1922]
 [ 3169 14363]]

The accuracy of Naive Bayes on 'Spam_SMS.csv' is 0.9256791389031266
Classification Report:
              precision    recall  f1-score   support

         ham       0.92      1.00      0.96      3380
        spam       1.00      0.44      0.62       522

    accuracy                           0.93      3902
   macro avg       0.96      0.72      0.79      3902
weighted avg       0.93      0.93      0.91      3902

Confusion Matrix:
[[3380    0]
 [ 290  232]]

The accuracy of Naive Bayes on 'fake_job_postings.csv' is 0.9527

In [10]:
# Balanced Logistic Regression

# Assign models
lr_model = [None] * len(datasets)

# Set pipeline for each Logistic Regression -- Utilizes word stops and lemmatizer
# class_weight = 'balanced' ensures classes are weighted inversly propertional to their frequency (balances imbalanced data w/o data loss)
for i, file in enumerate(datasets):
    lr_model[i] = make_pipeline(TfidfVectorizer(stop_words=stops, token_pattern = None, tokenizer=lemmatizing_tokenizer), LogisticRegression(class_weight='balanced'))
    lr_model[i].fit(X_train[i], y_train[i])

In [12]:
# Assign class predictions
lr_y_pred = [None] * len(datasets)

# Test model on test data and output accuracy, classification report, and confusion matrix
for i, file in enumerate(datasets):
    lr_y_pred[i] = lr_model[i].predict(X_test[i])
    print("The accuracy of Logistic Regression on \'" + file + ".csv\' is {}".format(accuracy_score(y_test[i], lr_y_pred[i])))
    print("Classification Report:\n{}".format(classification_report(y_test[i], lr_y_pred[i])))
    print("Confusion Matrix:\n{}\n".format(confusion_matrix(y_test[i], lr_y_pred[i])))

The accuracy of Logistic Regression on 'IMDB Dataset.csv' is 0.876
Classification Report:
              precision    recall  f1-score   support

    negative       0.89      0.86      0.87     17468
    positive       0.86      0.89      0.88     17532

    accuracy                           0.88     35000
   macro avg       0.88      0.88      0.88     35000
weighted avg       0.88      0.88      0.88     35000

Confusion Matrix:
[[14974  2494]
 [ 1846 15686]]

The accuracy of Logistic Regression on 'Spam_SMS.csv' is 0.9702716555612506
Classification Report:
              precision    recall  f1-score   support

         ham       0.99      0.98      0.98      3380
        spam       0.87      0.92      0.89       522

    accuracy                           0.97      3902
   macro avg       0.93      0.95      0.94      3902
weighted avg       0.97      0.97      0.97      3902

Confusion Matrix:
[[3308   72]
 [  44  478]]

The accuracy of Logistic Regression on 'fake_job_postings.csv

In [14]:
# Strong Negative Weight Logistic Regression

# Assign models
lr_model_sn = [None] * len(datasets)

positive_weight = 1
negative_weight = 2

# Set pipeline for each Logistic Regression -- Utilizes word stops and lemmatizer
# Custom class weight mimics 'balanced' except negative class is recieves 2x weight, increasing negative classification accuracy
for i, file in enumerate(datasets):
    counter = collections.Counter(y[i])
    lr_model_sn[i] = make_pipeline(TfidfVectorizer(stop_words=stops, token_pattern = None, tokenizer=lemmatizing_tokenizer), LogisticRegression(class_weight={positive_class[i]:(positive_weight * len(data[i]) / (counter[positive_class[i]])), negative_class[i]:(negative_weight * len(data[i]) / (counter[negative_class[i]]))}))
    lr_model_sn[i].fit(X_train[i], y_train[i])

In [16]:
# Assign class predictions
lr_y_pred_sn = [None] * len(datasets)

# Test model on test data and output accuracy, classification report, and confusion matrix
for i, file in enumerate(datasets):
    lr_y_pred_sn[i] = lr_model_sn[i].predict(X_test[i])
    print("The accuracy of Logistic Regression on \'" + file + ".csv\' is {}".format(accuracy_score(y_test[i], lr_y_pred_sn[i])))
    print("Classification Report:\n{}".format(classification_report(y_test[i], lr_y_pred_sn[i])))
    print("Confusion Matrix:\n{}\n".format(confusion_matrix(y_test[i], lr_y_pred_sn[i])))

The accuracy of Logistic Regression on 'IMDB Dataset.csv' is 0.8753714285714286
Classification Report:
              precision    recall  f1-score   support

    negative       0.84      0.93      0.88     17468
    positive       0.92      0.82      0.87     17532

    accuracy                           0.88     35000
   macro avg       0.88      0.88      0.88     35000
weighted avg       0.88      0.88      0.88     35000

Confusion Matrix:
[[16209  1259]
 [ 3103 14429]]

The accuracy of Logistic Regression on 'Spam_SMS.csv' is 0.9633521271143004
Classification Report:
              precision    recall  f1-score   support

         ham       0.99      0.97      0.98      3380
        spam       0.82      0.93      0.87       522

    accuracy                           0.96      3902
   macro avg       0.91      0.95      0.92      3902
weighted avg       0.97      0.96      0.96      3902

Confusion Matrix:
[[3276  104]
 [  39  483]]

The accuracy of Logistic Regression on 'fake_job