In [88]:
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# One WordNet lemmatizer instance, shared by every call to the tokenizer below.
lemmatizer = WordNetLemmatizer()


def lemmatizing_tokenizer(text):
    """Tokenize ``text`` with NLTK and lemmatize every resulting token.

    Parameters
    ----------
    text : str
        Raw text to split into tokens.

    Returns
    -------
    list of str
        The lemmatized form of each token returned by ``word_tokenize``,
        in the original order.
    """
    tokens = word_tokenize(text)
    return list(map(lemmatizer.lemmatize, tokens))

# Pass each NLTK English stop word through the same tokenizer/lemmatizer that
# the vectorizers use, so the stop list is expressed in the same token form as
# the documents' tokens.
# A flat generator replaces ``sum(list_of_lists, [])``, which re-copies the
# accumulated list on every addition. ``dict.fromkeys`` removes repeated
# tokens while keeping first-seen order; the result is still a plain list.
stops = list(dict.fromkeys(
    token
    for word in stopwords.words('english')
    for token in lemmatizing_tokenizer(word)
))

In [100]:
# Split settings shared by all three datasets.
# NOTE(review): test_size=.7 holds out 70% of every dataset for evaluation and
# trains on the remaining 30% -- confirm this is intended rather than the more
# common 70% train / 30% test split.
TEST_SIZE = .7
RANDOM_STATE = 0

# Dataset 1: text in the `review` column, label in `sentiment`.
data1 = pd.read_csv(r'C:\datasets\IMDB Dataset.csv')

X1 = data1.review
y1 = data1.sentiment

X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1, y1, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# Dataset 2: text in the `Message` column, label in `Class`.
data2 = pd.read_csv(r'C:\datasets\Spam_SMS.csv')

X2 = data2.Message
y2 = data2.Class

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2, y2, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# Dataset 3: text in the `description` column, label in `fraudulent`.
# Kept in its own frame (`data3`) so the Spam frame in `data2` is not
# overwritten and the naming matches X3 / y3.
data3 = pd.read_csv(r'C:\datasets\fake_job_postings.csv')

# Fill missing descriptions with '' before the string cast; a bare
# ``astype(str)`` would turn NaN into the literal token "nan".
X3 = data3.description.fillna('').values.astype(str)
y3 = data3.fraudulent

X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X3, y3, test_size=TEST_SIZE, random_state=RANDOM_STATE)

In [82]:
def _fit_naive_bayes(X_train, y_train):
    """Build a TF-IDF + Multinomial Naive Bayes pipeline and fit it.

    The vectorizer tokenizes with ``lemmatizing_tokenizer`` and is given the
    lemmatized ``stops`` list as stop words. ``token_pattern`` is set to
    ``None`` alongside the custom tokenizer.

    Parameters
    ----------
    X_train : iterable of str
        Training documents.
    y_train : array-like
        Training labels aligned with ``X_train``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline.
    """
    vectorizer = TfidfVectorizer(
        tokenizer=lemmatizing_tokenizer,
        stop_words=stops,
        token_pattern=None,
    )
    return make_pipeline(vectorizer, MultinomialNB()).fit(X_train, y_train)


nb_model1 = _fit_naive_bayes(X_train1, y_train1)
nb_model2 = _fit_naive_bayes(X_train2, y_train2)
nb_model3 = _fit_naive_bayes(X_train3, y_train3)

# Last expression of the cell: show the most recently fitted pipeline.
nb_model3

In [94]:
def _report_naive_bayes(dataset_name, y_true, y_pred):
    """Print accuracy, classification report and confusion matrix.

    Parameters
    ----------
    dataset_name : str
        File name shown (in single quotes) in the accuracy line.
    y_true : array-like
        Held-out ground-truth labels.
    y_pred : array-like
        Labels predicted by the Naive Bayes pipeline for the same rows.
    """
    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    matrix = confusion_matrix(y_true, y_pred)
    print("The accuracy of Naive Bayes on '{}' is {}".format(dataset_name, accuracy))
    print("Classification Report:\n{}".format(report))
    print("Confusion Matrix:\n{}\n".format(matrix))


# Predict and report one dataset at a time, in the same order as before.
nb_y_pred1 = nb_model1.predict(X_test1)
_report_naive_bayes('IMDB Dataset.csv', y_test1, nb_y_pred1)

nb_y_pred2 = nb_model2.predict(X_test2)
_report_naive_bayes('Spam_SMS.csv', y_test2, nb_y_pred2)

nb_y_pred3 = nb_model3.predict(X_test3)
_report_naive_bayes('fake_job_postings.csv', y_test3, nb_y_pred3)

The accuracy of Naive Bayes on 'IMDB Dataset.csv' is 0.8545428571428572
Classification Report:
              precision    recall  f1-score   support

    negative       0.83      0.89      0.86     17468
    positive       0.88      0.82      0.85     17532

    accuracy                           0.85     35000
   macro avg       0.86      0.85      0.85     35000
weighted avg       0.86      0.85      0.85     35000

Confusion Matrix:
[[15546  1922]
 [ 3169 14363]]

The accuracy of Naive Bayes on 'Spam_SMS.csv' is 0.9256791389031266
Classification Report:
              precision    recall  f1-score   support

         ham       0.92      1.00      0.96      3380
        spam       1.00      0.44      0.62       522

    accuracy                           0.93      3902
   macro avg       0.96      0.72      0.79      3902
weighted avg       0.93      0.93      0.91      3902

Confusion Matrix:
[[3380    0]
 [ 290  232]]

The accuracy of Naive Bayes on 'fake_job_postings.csv' is 0.9527

In [96]:
def _fit_logistic_regression(X_train, y_train):
    """Build a TF-IDF + Logistic Regression pipeline and fit it.

    The vectorizer tokenizes with ``lemmatizing_tokenizer`` and is given the
    lemmatized ``stops`` list as stop words. ``token_pattern`` is set to
    ``None`` alongside the custom tokenizer. The classifier uses
    scikit-learn's default ``LogisticRegression()`` settings.

    Parameters
    ----------
    X_train : iterable of str
        Training documents.
    y_train : array-like
        Training labels aligned with ``X_train``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline.
    """
    vectorizer = TfidfVectorizer(
        tokenizer=lemmatizing_tokenizer,
        stop_words=stops,
        token_pattern=None,
    )
    return make_pipeline(vectorizer, LogisticRegression()).fit(X_train, y_train)


lr_model1 = _fit_logistic_regression(X_train1, y_train1)
lr_model2 = _fit_logistic_regression(X_train2, y_train2)
lr_model3 = _fit_logistic_regression(X_train3, y_train3)

# Last expression of the cell: show the most recently fitted pipeline.
lr_model3

In [98]:
def _report_logistic_regression(dataset_name, y_true, y_pred):
    """Print accuracy, classification report and confusion matrix.

    Parameters
    ----------
    dataset_name : str
        File name shown (in single quotes) in the accuracy line.
    y_true : array-like
        Held-out ground-truth labels.
    y_pred : array-like
        Labels predicted by the Logistic Regression pipeline for the same rows.
    """
    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    matrix = confusion_matrix(y_true, y_pred)
    print("The accuracy of Logistic Regression on '{}' is {}".format(dataset_name, accuracy))
    print("Classification Report:\n{}".format(report))
    print("Confusion Matrix:\n{}\n".format(matrix))


# Predict and report one dataset at a time, in the same order as before.
lr_y_pred1 = lr_model1.predict(X_test1)
_report_logistic_regression('IMDB Dataset.csv', y_test1, lr_y_pred1)

lr_y_pred2 = lr_model2.predict(X_test2)
_report_logistic_regression('Spam_SMS.csv', y_test2, lr_y_pred2)

lr_y_pred3 = lr_model3.predict(X_test3)
_report_logistic_regression('fake_job_postings.csv', y_test3, lr_y_pred3)

The accuracy of Logistic Regression on 'IMDB Dataset.csv' is 0.8762
Classification Report:
              precision    recall  f1-score   support

    negative       0.89      0.86      0.87     17468
    positive       0.86      0.89      0.88     17532

    accuracy                           0.88     35000
   macro avg       0.88      0.88      0.88     35000
weighted avg       0.88      0.88      0.88     35000

Confusion Matrix:
[[15004  2464]
 [ 1869 15663]]

The accuracy of Logistic Regression on 'Spam_SMS.csv' is 0.9359302921578677
Classification Report:
              precision    recall  f1-score   support

         ham       0.93      1.00      0.96      3380
        spam       0.98      0.53      0.69       522

    accuracy                           0.94      3902
   macro avg       0.96      0.77      0.83      3902
weighted avg       0.94      0.94      0.93      3902

Confusion Matrix:
[[3374    6]
 [ 244  278]]

The accuracy of Logistic Regression on 'fake_job_postings.cs