In [2]:
from pathlib import Path

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Shared WordNet lemmatizer instance used by the tokenizer below
lemmatizer = WordNetLemmatizer()
def lemmatizing_tokenizer(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and lemmatize each token.

    Returns the lemmatized tokens as a list, in the order they were produced
    by the tokenizer.
    """
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Build the stop-word list: run every NLTK English stop word through the same
# tokenizer/lemmatizer that is applied to the documents, then flatten the
# per-word token lists into one list (order and duplicates are kept).
stops = [
    token
    for stop_word in stopwords.words('english')
    for token in lemmatizing_tokenizer(stop_word)
]

In [4]:
# Specify location of folder containing datasets
datasets_location = r'C:\datasets'

# State file name, message name, and class name for each dataset
datasets = ["IMDB Dataset", "Spam_SMS", "fake_job_postings"]
data_message = ["review", "Message", "description"]
data_class = ["sentiment", "Class", "fraudulent"]

# The three lists above are parallel (same index = same dataset); fail fast if
# they drift out of sync instead of hitting an IndexError part-way through.
assert len(datasets) == len(data_message) == len(data_class), \
    "datasets, data_message and data_class must have one entry per dataset"

# Train/test split configuration.
# NOTE(review): TEST_SIZE = 0.7 holds out 70% of every dataset for testing and
# trains on the remaining 30%. Confirm this is intended and not an inverted
# 70/30 split; the value is kept as-is so previously recorded results still apply.
TEST_SIZE = 0.7
RANDOM_STATE = 0

# Assign data lists (index i in every list refers to datasets[i])
data = []
X = []
y = []
X_train = []
X_test = []
y_train = []
y_test = []

# Read in data from datasets and set train/test split
for file, message_col, class_col in zip(datasets, data_message, data_class):
    # Join with pathlib so the separator matches the host OS rather than a
    # hard-coded backslash.
    csv_path = Path(datasets_location) / (file + ".csv")
    frame = pd.read_csv(csv_path, usecols=[message_col, class_col])
    data.append(frame)

    # Cast both columns to string arrays; note that any missing value becomes
    # the literal string 'nan'.
    messages = frame[message_col].values.astype(str)
    labels = frame[class_col].values.astype(str)
    X.append(messages)
    y.append(labels)

    messages_train, messages_test, labels_train, labels_test = train_test_split(
        messages, labels, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )
    X_train.append(messages_train)
    X_test.append(messages_test)
    y_train.append(labels_train)
    y_test.append(labels_test)

In [6]:
# Fit one TF-IDF + Multinomial Naive Bayes pipeline per dataset.
# The vectorizer uses the lemmatizing tokenizer and the lemmatized stop words;
# token_pattern is not used when a custom tokenizer is supplied, so it is None.
nb_model = []
for train_docs, train_labels in zip(X_train, y_train):
    nb_pipeline = make_pipeline(
        TfidfVectorizer(
            tokenizer=lemmatizing_tokenizer,
            stop_words=stops,
            token_pattern=None,
        ),
        MultinomialNB(),
    )
    nb_pipeline.fit(train_docs, train_labels)
    nb_model.append(nb_pipeline)

In [8]:
# Predict on each dataset's held-out split with its Naive Bayes pipeline and
# print accuracy, the classification report, and the confusion matrix.
nb_y_pred = []
for dataset_name, fitted_nb, test_docs, true_labels in zip(datasets, nb_model, X_test, y_test):
    predicted_labels = fitted_nb.predict(test_docs)
    nb_y_pred.append(predicted_labels)

    accuracy = accuracy_score(true_labels, predicted_labels)
    report = classification_report(true_labels, predicted_labels)
    matrix = confusion_matrix(true_labels, predicted_labels)

    print("The accuracy of Naive Bayes on \'" + dataset_name + ".csv\' is {}".format(accuracy))
    print("Classification Report:\n{}".format(report))
    print("Confusion Matrix:\n{}\n".format(matrix))

The accuracy of Naive Bayes on 'IMDB Dataset.csv' is 0.8545428571428572
Classification Report:
              precision    recall  f1-score   support

    negative       0.83      0.89      0.86     17468
    positive       0.88      0.82      0.85     17532

    accuracy                           0.85     35000
   macro avg       0.86      0.85      0.85     35000
weighted avg       0.86      0.85      0.85     35000

Confusion Matrix:
[[15546  1922]
 [ 3169 14363]]

The accuracy of Naive Bayes on 'Spam_SMS.csv' is 0.9256791389031266
Classification Report:
              precision    recall  f1-score   support

         ham       0.92      1.00      0.96      3380
        spam       1.00      0.44      0.62       522

    accuracy                           0.93      3902
   macro avg       0.96      0.72      0.79      3902
weighted avg       0.93      0.93      0.91      3902

Confusion Matrix:
[[3380    0]
 [ 290  232]]

The accuracy of Naive Bayes on 'fake_job_postings.csv' is 0.9527

In [None]:
# Fit one TF-IDF + Logistic Regression pipeline per dataset.
# The vectorizer uses the lemmatizing tokenizer and the lemmatized stop words;
# token_pattern is not used when a custom tokenizer is supplied, so it is None.
lr_model = []
for train_docs, train_labels in zip(X_train, y_train):
    lr_pipeline = make_pipeline(
        TfidfVectorizer(
            tokenizer=lemmatizing_tokenizer,
            stop_words=stops,
            token_pattern=None,
        ),
        LogisticRegression(),
    )
    lr_pipeline.fit(train_docs, train_labels)
    lr_model.append(lr_pipeline)

In [290]:
# Predict on each dataset's held-out split with its Logistic Regression
# pipeline and print accuracy, the classification report, and the confusion matrix.
lr_y_pred = []
for dataset_name, fitted_lr, test_docs, true_labels in zip(datasets, lr_model, X_test, y_test):
    predicted_labels = fitted_lr.predict(test_docs)
    lr_y_pred.append(predicted_labels)

    accuracy = accuracy_score(true_labels, predicted_labels)
    report = classification_report(true_labels, predicted_labels)
    matrix = confusion_matrix(true_labels, predicted_labels)

    print("The accuracy of Logistic Regression on \'" + dataset_name + ".csv\' is {}".format(accuracy))
    print("Classification Report:\n{}".format(report))
    print("Confusion Matrix:\n{}\n".format(matrix))

The accuracy of Logistic Regression on 'IMDB Dataset.csv' is 0.8762
Classification Report:
              precision    recall  f1-score   support

    negative       0.89      0.86      0.87     17468
    positive       0.86      0.89      0.88     17532

    accuracy                           0.88     35000
   macro avg       0.88      0.88      0.88     35000
weighted avg       0.88      0.88      0.88     35000

Confusion Matrix:
[[15004  2464]
 [ 1869 15663]]

The accuracy of Logistic Regression on 'Spam_SMS.csv' is 0.9359302921578677
Classification Report:
              precision    recall  f1-score   support

         ham       0.93      1.00      0.96      3380
        spam       0.98      0.53      0.69       522

    accuracy                           0.94      3902
   macro avg       0.96      0.77      0.83      3902
weighted avg       0.94      0.94      0.93      3902

Confusion Matrix:
[[3374    6]
 [ 244  278]]

The accuracy of Logistic Regression on 'fake_job_postings.cs