In [1]:
import numpy as np
import pandas as pd
import time
import os

In [2]:
def loadCSV(filename):
    file = filename
    if '.csv' not in filename:
        file += '.csv'
    data = pd.read_csv(file, encoding = 'ISO-8859-1')
    return data

In [3]:
def loadNLPVectors(folder, filename):
    file = folder + '/' + filename + '.npy'
    return np.load(file)

In [4]:
def loadData(name):
    X_train = loadNLPVectors('train', name)
    y_train = loadNLPVectors('train', 'labels')
    
    X_test = loadNLPVectors('test', name)
    y_test = loadNLPVectors('test', 'labels')
    
    train = [X_train, y_train]
    test = [X_test, y_test]
    
    return train, test

In [5]:
csvFile = "binary_classification"
data = loadCSV(csvFile)

# Create Machine Learning Classifiers

In [6]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB

  from numpy.core.umath_tests import inner1d


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [8]:
def genClassifiers(filter = []):

    rf_clf = RandomForestClassifier()
    log_clf = LogisticRegression()
    nb_clf = MultinomialNB()
    svm_clf = SVC(probability = True)
    
    classifiers = [rf_clf, log_clf, nb_clf, svm_clf]
    names = ['Random Forest', 'Logistic Regression', 'Naive Bayes', 'SVM']
    
    filtered_classifiers = []
    filtered_names = []
    for i in range(0, len(classifiers)):
        if i not in filter:
            filtered_classifiers.append(classifiers[i])
            filtered_names.append(names[i])
            
    return filtered_classifiers, filtered_names

In [9]:
def evaluate(clf_array, clf_names, train, test):
    X_train = train[0]
    y_train = train[1]
    
    X_test = test[0]
    y_test = test[1]
    
    for i in range(0, len(clf_array)):
        start = time.time()
        clf_array[i].fit(X_train, y_train)
        end = time.time() - start
        
        y_pred = clf_array[i].predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred)
        f_score = f1_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)

        print(clf_names[i] + ': Completed in ' + str(end) + ' seconds')
        print(clf_names[i] + ' Accuracy: ' + str(accuracy))
        print(clf_names[i] + ' ROC AUC Score: ' + str(roc_auc))
        print(clf_names[i] + ' F Score: ' + str(f_score))
        print(clf_names[i] + ' Precision: ' + str(precision))
        print(clf_names[i] + ' Recall: ' + str(recall))
        print(" ")
        
    return X_test, y_test

# Load NLP Training/Testing Data

In [10]:
train_uni, test_uni = loadData('unigram')

In [11]:
train_big, test_big = loadData('bigram')

In [12]:
train_tfidf, test_tfidf = loadData('tfidf')

In [13]:
train_word2vec, test_word2vec = loadData('word2vec')

In [14]:
train_runi, test_runi = loadData('runigram')

In [15]:
train_rbig, test_rbig = loadData('rbigram')

In [16]:
train_rtfidf, test_rtfidf = loadData('rtfidf')

# Train Classifiers

In [17]:
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

## Reduced_TFIDF Classifiers

In [18]:
rtfidf_clf, clf_names = genClassifiers(filter = [2])

In [19]:
x_rtfidf, y_rtfidf = evaluate(rtfidf_clf, clf_names, train_rtfidf, test_rtfidf)

Random Forest: Completed in 1.5711941719055176 seconds
Random Forest Accuracy: 0.6109188773548635
Random Forest ROC AUC Score: 0.6154967109973033
Random Forest F Score: 0.6651224354731965
Random Forest Precision: 0.5726495726495726
Random Forest Recall: 0.7932123125493291
 
Logistic Regression: Completed in 0.3475041389465332 seconds
Logistic Regression Accuracy: 0.6351403306420608
Logistic Regression ROC AUC Score: 0.6361170243607478
Logistic Regression F Score: 0.6428302596913812
Logistic Regression Precision: 0.6143884892086331
Logistic Regression Recall: 0.6740331491712708
 
SVM: Completed in 171.74720120429993 seconds
SVM Accuracy: 0.6312956555171088
SVM ROC AUC Score: 0.621745165302116
SVM F Score: 0.3987460815047022
SVM Precision: 0.9695121951219512
SVM Recall: 0.2509865824782952
 


## Reduced_Unigram Classifiers

In [20]:
runi_clf, clf_names = genClassifiers(filter = [2])

In [21]:
x_runi, y_runi = evaluate(runi_clf, clf_names, train_runi, test_runi)

Random Forest: Completed in 1.3853070735931396 seconds
Random Forest Accuracy: 0.6974240676662822
Random Forest ROC AUC Score: 0.6969952277215773
Random Forest F Score: 0.6865790521704499
Random Forest Precision: 0.6929260450160771
Random Forest Recall: 0.6803472770323599
 
Logistic Regression: Completed in 0.4902491569519043 seconds
Logistic Regression Accuracy: 0.6981930026912726
Logistic Regression ROC AUC Score: 0.6974079061495299
Logistic Regression F Score: 0.6828282828282828
Logistic Regression Precision: 0.6995033112582781
Logistic Regression Recall: 0.6669297553275454
 
SVM: Completed in 105.41624212265015 seconds
SVM Accuracy: 0.7370242214532872
SVM ROC AUC Score: 0.7341540358471119
SVM F Score: 0.6976127320954907
SVM Precision: 0.792964824120603
SVM Recall: 0.6227308602999211
 


## Reduced_Bigram Classifiers

In [22]:
rbig_clf, clf_names = genClassifiers(filter = [2])

In [23]:
x_rbig, y_rbig = evaluate(rbig_clf, clf_names, train_rbig, test_rbig)

Random Forest: Completed in 2.163566827774048 seconds
Random Forest Accuracy: 0.6662821991541714
Random Forest ROC AUC Score: 0.6663975628602432
Random Forest F Score: 0.661993769470405
Random Forest Precision: 0.6533435818601077
Random Forest Recall: 0.6708760852407262
 
Logistic Regression: Completed in 1.2324132919311523 seconds
Logistic Regression Accuracy: 0.6720492118415994
Logistic Regression ROC AUC Score: 0.6715440622230321
Logistic Regression F Score: 0.6594810379241516
Logistic Regression Precision: 0.6672051696284329
Logistic Regression Recall: 0.6519337016574586
 
SVM: Completed in 266.23683619499207 seconds
SVM Accuracy: 0.6916570549788543
SVM ROC AUC Score: 0.6865170414003732
SVM F Score: 0.606090373280943
SVM Precision: 0.8023407022106632
SVM Recall: 0.48697711128650356
 


## Word2Vec Classifiers

In [24]:
vec_clf, clf_names = genClassifiers(filter = [2])

In [25]:
x_vec, y_vec = evaluate(vec_clf, clf_names, train_word2vec, test_word2vec)

Random Forest: Completed in 0.3986940383911133 seconds
Random Forest Accuracy: 0.5974625144175317
Random Forest ROC AUC Score: 0.6004358712514303
Random Forest F Score: 0.6340440405452639
Random Forest Precision: 0.56900878293601
Random Forest Recall: 0.7158642462509865
 
Logistic Regression: Completed in 0.24937009811401367 seconds
Logistic Regression Accuracy: 0.5797770088427527
Logistic Regression ROC AUC Score: 0.5806574810463749
Logistic Regression F Score: 0.5877027536778575
Logistic Regression Precision: 0.5628612716763006
Logistic Regression Recall: 0.6148382004735596
 
SVM: Completed in 102.85331630706787 seconds
SVM Accuracy: 0.5786236063052672
SVM ROC AUC Score: 0.5783438194083701
SVM F Score: 0.5674822415153907
SVM Precision: 0.5674822415153907
SVM Recall: 0.5674822415153907
 


## Raw TFIDF Classifier

In [26]:
tfidf_clf, clf_names = genClassifiers(filter = [3])

In [27]:
x_tfidf, y_tfidf = evaluate(tfidf_clf, clf_names, train_tfidf, test_tfidf)

Random Forest: Completed in 10.230072021484375 seconds
Random Forest Accuracy: 0.9042675893886967
Random Forest ROC AUC Score: 0.9031634537900742
Random Forest F Score: 0.8974886784685056
Random Forest Precision: 0.9380378657487092
Random Forest Recall: 0.8602999210734017
 
Logistic Regression: Completed in 0.16809797286987305 seconds
Logistic Regression Accuracy: 0.9081122645136486
Logistic Regression ROC AUC Score: 0.9066539145581115
Logistic Regression F Score: 0.9001253656498119
Logistic Regression Precision: 0.9564831261101243
Logistic Regression Recall: 0.8500394632991318
 
Naive Bayes: Completed in 0.2786829471588135 seconds
Naive Bayes Accuracy: 0.8565936178392926
Naive Bayes ROC AUC Score: 0.8566470513756539
Naive Bayes F Score: 0.8536681051392702
Naive Bayes Precision: 0.8486739469578783
Naive Bayes Recall: 0.8587213891081295
 


## Raw Unigram Classifier

In [28]:
uni_clf, clf_names = genClassifiers(filter = [3])

In [29]:
x_uni, y_uni = evaluate(uni_clf, clf_names, train_uni, test_uni)

Random Forest: Completed in 8.68398904800415 seconds
Random Forest Accuracy: 0.9023452518262207
Random Forest ROC AUC Score: 0.9017452599667019
Random Forest F Score: 0.8975806451612904
Random Forest Precision: 0.9175597691673537
Random Forest Recall: 0.8784530386740331
 
Logistic Regression: Completed in 0.8870952129364014 seconds
Logistic Regression Accuracy: 0.9158016147635525
Logistic Regression ROC AUC Score: 0.9147844191558522
Logistic Regression F Score: 0.9101354123922856
Logistic Regression Precision: 0.9478632478632478
Logistic Regression Recall: 0.8752959747434885
 
Naive Bayes: Completed in 0.8655989170074463 seconds
Naive Bayes Accuracy: 0.8673587081891581
Naive Bayes ROC AUC Score: 0.8676571343373302
Naive Bayes F Score: 0.8659152739992227
Naive Bayes Precision: 0.8529862174578867
Naive Bayes Recall: 0.8792423046566693
 


## Raw Bigram Classifier

In [30]:
big_clf, clf_names = genClassifiers(filter = [3])

In [31]:
x_big, y_big = evaluate(big_clf, clf_names, train_big, test_big)

Random Forest: Completed in 60.024255990982056 seconds
Random Forest Accuracy: 0.8900422914263745
Random Forest ROC AUC Score: 0.8884827515208458
Random Forest F Score: 0.8800335570469798
Random Forest Precision: 0.9391226499552372
Random Forest Recall: 0.8279400157853196
 
Logistic Regression: Completed in 17.185407161712646 seconds
Logistic Regression Accuracy: 0.9142637447135717
Logistic Regression ROC AUC Score: 0.913186066792965
Logistic Regression F Score: 0.9082682023858494
Logistic Regression Precision: 0.9484536082474226
Logistic Regression Recall: 0.8713496448303079
 
Naive Bayes: Completed in 20.337838172912598 seconds
Naive Bayes Accuracy: 0.8777393310265282
Naive Bayes ROC AUC Score: 0.8779158171506195
Naive Bayes F Score: 0.87578125
Naive Bayes Precision: 0.8669760247486465
Naive Bayes Recall: 0.8847671665351223
 


# Saving Models

In [32]:
from sklearn.externals import joblib

In [33]:
path = os.getcwd()
folderpath = path + "/models"
os.mkdir(folderpath)

In [34]:
def saveModels(features, feature_names, classifiers, X_array, y_array):
    for i in range(0, len(features)):
        feat = features[i]
        os.mkdir(folderpath + feature_names[i])
        for j in range(0, len(classifiers)):
            clas = feat[j]
            filepath = folderpath + feature_names[i] + classifiers[j] + ".pkl"
            joblib.dump(clas, filepath)
            
            # test to see if models correctly saved
            clas_load = joblib.load(filepath)
            X = X_array[i]
            y = y_array[i]
            
            assert clas.score(X, y) == clas_load.score(X, y)

In [35]:
f_list1 = [rtfidf_clf, runi_clf, rbig_clf, vec_clf]
f_list2 = [tfidf_clf, uni_clf, big_clf]

fname_list1 = ["/rtfidf", "/runi", "/rbig", "/word2vec"]
fname_list2 = ["/tfidf", "/uni", "/big"]

c_list1 = ["/rand_forest", "/log_reg", "/svm"]
c_list2 = ["/rand_forest", "/log_reg", "/naive_bayes"]

fx_list1 = [x_rtfidf, x_runi, x_rbig, x_vec]
fy_list1 = [y_rtfidf, y_runi, y_rbig, y_vec]

fx_list2 = [x_tfidf, x_uni, x_big]
fy_list2 = [y_tfidf, y_uni, y_big]

In [36]:
saveModels(f_list1, fname_list1, c_list1, fx_list1, fy_list1)
saveModels(f_list2, fname_list2, c_list2, fx_list2, fy_list2)