In [1]:
import os
import pickle as pick
import sys

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# The search path is extended BEFORE the project-local imports below, so that
# they resolve on a fresh kernel if those modules live in this directory
# (TODO confirm where good_email_processor / pickler are located).
sys.path.append(
    "C:\\Users\\hayde\\OneDrive\\Documents\\Final_Project_497\\code_from_tutorial\\Email-Classification\\")

from good_email_processor import preprocess
from pickler import pickling_training_data

#some of the model code adapted from https://github.com/MahnoorJaved98/Email-Classification

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hayde\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hayde\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#email processor
# pickling_training_data() yields the label file and data file handed to preprocess();
# preprocess() returns the train/test features and labels used by every model cell below.
labelfile, datafile = pickling_training_data()
(
    features_train,
    features_test,
    labels_train,
    labels_test,
) = preprocess(labelfile, datafile)


In [3]:
#evaluation function

def evaluation(currentModel, modelName, labels_test, isFirst=False, features=None):
    """Score a fitted classifier on the test split, then persist the scores and the model.

    Prints the binarised labels/predictions, a few sample predictions, the number
    of 'spam' predictions, and the accuracy, AUC and F1 scores. Afterwards the
    formatted scores are handed to ``save_and_write_results`` and the model to
    ``save_model`` (both defined in later cells, so those cells must have been run
    before this function is called).

    Parameters
    ----------
    currentModel : object
        Fitted estimator exposing ``predict``.
    modelName : str
        Display name; also forwarded to ``save_and_write_results`` and ``save_model``.
    labels_test : array-like of str
        True labels. Anything other than ``'ham'`` is treated as the positive
        class for AUC; F1 uses ``pos_label='spam'``.
    isFirst : bool, default False
        Forwarded to ``save_and_write_results`` (start a new results file vs. append).
    features : array-like, optional
        Feature matrix to predict on. Defaults to the notebook-level
        ``features_test`` so existing calls keep working.

    Returns
    -------
    None
    """
    if features is None:
        # Backward-compatible fallback to the split created by the preprocessing cell.
        features = features_test

    # Predict exactly once; the result is reused for every metric and print below.
    pred = np.asarray(currentModel.predict(features))
    # np.asarray accepts a pandas Series as well as a list or ndarray.
    truth = np.asarray(labels_test)

    # Binarise for roc_auc_score: 1 for anything that is not 'ham', else 0.
    labelV = np.array([1 if x != 'ham' else 0 for x in truth])
    print(labelV)

    predV = np.array([1 if x != 'ham' else 0 for x in pred])
    print(predV)

    # The spot-check indices are fixed; skip the print instead of raising
    # IndexError when the test split is too small to contain them.
    if len(pred) > 50:
        print("Prediction for element 10th, 26th and 50th are:",
              pred[10], pred[26], pred[50])

    print('Number of events predicted in Spam class is: ',
          np.count_nonzero(pred == 'spam'))

    formatted_accuracy = format(accuracy_score(truth, pred), ".5f")
    print("Accuracy of " + modelName + " Classifier: " + formatted_accuracy)

    # NOTE: AUC is computed from the hard 0/1 predictions, not from scores or
    # probabilities, because only ``predict`` is called on the model.
    formatted_auc = format(roc_auc_score(labelV, predV), ".5f")

    formatted_f1 = format(f1_score(truth, pred, pos_label='spam'), ".5f")

    #formatted_cross_val =  format(cross_val_score(currentModel, features_test, labels_test, cv=10), ".5f")

    print("AUC Score of " + modelName + " Classifier: " + formatted_auc)

    print("F1Score of " + modelName + " Classifier: " + formatted_f1)

    #print("Cross Val Score of "+ modelName + " Classifier: " + formatted_cross_val)

    save_and_write_results(modelName, formatted_accuracy, formatted_auc, formatted_f1, isFirst)

    save_model(currentModel, modelName)
      



In [4]:
#saving model to pickle
def save_model(currentModel, modelName,
               model_dir="C:\\Users\\hayde\\OneDrive\\Documents\\Final_Project_497\\final_code\\models\\"):
    """Pickle ``currentModel`` to ``<model_dir>/<modelName>_default_params.pkl``.

    Parameters
    ----------
    currentModel : object
        Any picklable object (in this notebook, a fitted estimator).
    modelName : str
        Used as the stem of the output file name.
    model_dir : str, optional
        Directory the pickle is written to. Defaults to the original
        hard-coded models folder; it must already exist.

    Returns
    -------
    None
    """
    model_fileName = os.path.join(model_dir, modelName + "_default_params" + ".pkl")
    # The context manager guarantees the handle is flushed and closed even if
    # pickling fails; the previous bare open() left the file handle dangling.
    with open(model_fileName, 'wb') as model_file:
        pick.dump(currentModel, model_file)

In [5]:
#saving results to csv
def save_and_write_results(currentModel, formatted_accuracy, formatted_auc, formatted_f1,
                           isFirst=False,
                           results_path='results_cleaned_5_percentile_lemmatized.csv'):
    """Print one model's scores and write them as a row of the results CSV.

    Parameters
    ----------
    currentModel : str
        Value written to the 'Model' column (callers in this notebook pass the
        model's display name, not the estimator object).
    formatted_accuracy, formatted_auc, formatted_f1 : str
        Already-formatted metric values, written as given.
    isFirst : bool, default False
        When truthy, the file is (re)created with a header row, discarding any
        existing content. Otherwise the row is appended without a header.
    results_path : str, optional
        CSV file to write. Defaults to the original hard-coded file name.

    Returns
    -------
    None
    """
    data = [currentModel, formatted_accuracy, formatted_auc, formatted_f1]

    columns = ['Model', 'Accuracy', 'AUC', 'F1 Score']

    df = pd.DataFrame([data], columns=columns)

    print(df.to_string())

    # Truthiness test rather than ``is True``: an identity check silently sends
    # truthy non-bool values (1, numpy.bool_) down the append branch, producing
    # a file with no header.
    if isFirst:
        df.to_csv(results_path, index=False)
    else:
        df.to_csv(results_path, mode='a', index=False, header=False)

In [7]:
#Logistic Regression as Baseline
# isFirst=True makes save_and_write_results recreate the results CSV with a header,
# so this cell has to run before the other model cells or their rows are overwritten.
modelName = "Logistic Regression"
logistic = LogisticRegression()
logistic.fit(features_train, labels_train)
evaluation(logistic, modelName, labels_test, isFirst=True)

[0 0 1 ... 0 0 1]
[0 0 1 ... 0 0 1]
Prediction for element 10th, 26th and 50th are: ham spam spam
Number of events predicted in Spam class is:  947
Accuracy of Logistic Regression Classifier: 0.97701
AUC Score of Logistic Regression Classifier: 0.97652
F1Score of Logistic Regression Classifier: 0.97797
                 Model Accuracy      AUC F1 Score
0  Logistic Regression  0.97701  0.97652  0.97797


In [6]:
#Naive Bayes Training and Eval
# Fit a Gaussian Naive Bayes model on the training split, then score it and
# append its metrics to the results file via evaluation().
modelName = "NaiveBayes"
naive = GaussianNB().fit(features_train, labels_train)
evaluation(naive, modelName, labels_test)

[0 0 1 ... 0 0 1]
[0 0 1 ... 0 0 1]
Prediction for element 10th, 26th and 50th are: ham spam spam
Number of events predicted in Spam class is:  873
Accuracy of NaiveBayes Classifier: 0.95345
AUC Score of NaiveBayes Classifier: 0.95400
F1Score of NaiveBayes Classifier: 0.95355
        Model Accuracy      AUC F1 Score
0  NaiveBayes  0.95345  0.95400  0.95355


In [8]:
#Random Forest Training and Eval
#good params: n_estimators=50, random_state=1
# A fixed random_state makes the forest (and therefore the reported metrics and
# the pickled model) repeatable across kernel restarts; all other parameters
# remain at their defaults.
rand = RandomForestClassifier(random_state=1)
rand.fit(features_train, labels_train)
modelName = "RandomForest"

evaluation(rand, modelName, labels_test)

[0 0 1 ... 0 0 1]
[0 0 1 ... 0 0 1]
Prediction for element 10th, 26th and 50th are: ham spam spam
Number of events predicted in Spam class is:  929
Accuracy of RandomForest Classifier: 0.98037
AUC Score of RandomForest Classifier: 0.98015
F1Score of RandomForest Classifier: 0.98101
          Model Accuracy      AUC F1 Score
0  RandomForest  0.98037  0.98015  0.98101


In [9]:
#SVM Training and Eval
#good params: C=10000, kernel='rbf'
# Fit a support vector classifier with default parameters, then score it and
# append its metrics to the results file via evaluation().
modelName = "SVM"
svm = SVC().fit(features_train, labels_train)
evaluation(svm, modelName, labels_test)

In [None]:
#Voting Training and Eval
# Hard-voting ensemble over the Random Forest, Naive Bayes and SVM estimators
# created in the cells above (those cells must have been run first).
modelName = "VotingClassifier"
voteclf = VotingClassifier(
    estimators=[
        ('Random Forest', rand),
        ('Naive Bayes', naive),
        ('SVM', svm),
    ],
    voting='hard',
)
voteclf.fit(features_train, labels_train)
evaluation(voteclf, modelName, labels_test)

[0 0 1 ... 0 0 1]
[0 0 1 ... 0 0 1]
Prediction for element 10th, 26th and 50th are: ham spam spam
Number of events predicted in Spam class is:  921
Accuracy of VotingClassifier Classifier: 0.98486
AUC Score of VotingClassifier Classifier: 0.98475
F1Score of VotingClassifier Classifier: 0.98529
              Model Accuracy      AUC F1 Score
0  VotingClassifier  0.98486  0.98475  0.98529


In [None]:
#saving vectorizer and selector

# saveVect = "vectorizer"
# saveSelect = "selector"

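# NOTE: save_model takes (object, name), so the object to pickle goes first.
# `vectorizer` and `selector` are not defined in the cells shown here and would
# need to be exposed (e.g. returned by preprocess) before enabling these lines.
# save_model(vectorizer, saveVect)
# save_model(selector, saveSelect)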
    