In [1]:
import numpy as np
import pandas as pd
import time
import os

In [2]:
def loadCSV(filename):
    file = filename
    if '.csv' not in filename:
        file += '.csv'
    data = pd.read_csv(file, encoding = 'ISO-8859-1')
    return data

In [3]:
def loadNLPVectors(filename):
    file = 'nlp_data/' + filename + '.npy'
    return np.load(file)

In [4]:
def loadLabels():
    return loadNLPVectors("labels")

In [5]:
csvFile = "binary_classification"
data = loadCSV(csvFile)

In [6]:
unigram_array = "feature_array_unigram"
bigram_array = "feature_array_bigram"
tfidf_array = "feature_array_tfidf"
wordvec_array = "feature_array_word2vec"
unigram_reduced = "reduced_unigram"
bigram_reduced = "reduced_bigram"
tfidf_reduced = "reduced_tfidf"

# Create Machine Learning Classifiers

In [7]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB

  from numpy.core.umath_tests import inner1d


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [9]:
def genClassifiers(neg = False, filter = []):
    svm_clf = SVC(probability = True)
    rf_clf = RandomForestClassifier()
    log_clf = LogisticRegression()
    lin_clf = LinearRegression()
    
    classifiers = [rf_clf, log_clf, lin_clf]
    names = ['Random Forest', 'Logistic Regression', 'Linear Regression']
    if not neg:
        classifiers.append(MultinomialNB())
        names.append('Naive Bayes')
        
    classifiers.append(svm_clf)
    names.append('SVM')
    
    filtered_classifiers = []
    filtered_names = []
    for i in range(0, len(classifiers)):
        if i not in filter:
            filtered_classifiers.append(classifiers[i])
            filtered_names.append(names[i])
            
    return filtered_classifiers, filtered_names

In [10]:
def dataSplit(nlp):
    X_train, X_test, y_train, y_test = train_test_split(nlp, 
                                                        labels, 
                                                        test_size = 0.2, 
                                                        random_state = 42, 
                                                        shuffle = True, 
                                                        stratify = labels)
    return X_train, X_test, y_train, y_test

In [11]:
def evaluate(clf_array, clf_names, nlp):
    X_train, X_test, y_train, y_test = dataSplit(nlp)
    for i in range(0, len(clf_array)):
        start = time.time()
        clf_array[i].fit(X_train, y_train)
        end = time.time() - start
        
        y_pred = clf_array[i].predict(X_test)
        
        print(clf_names[i] + ': Completed in ' + str(end) + ' seconds')
        if clf_names[i] == 'Linear Regression':
            r2 = clf_array[i].score(X_test, y_test)
            print(clf_names[i] + ' R^2 Score: ' + str(r2))
        else:
            accuracy = accuracy_score(y_test, y_pred)
            roc_auc = roc_auc_score(y_test, y_pred)
            f_score = f1_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            print(clf_names[i] + ' Accuracy: ' + str(accuracy))
            print(clf_names[i] + ' ROC AUC Score: ' + str(roc_auc))
            print(clf_names[i] + ' F Score: ' + str(f_score))
            print(clf_names[i] + ' Precision: ' + str(precision))
            print(clf_names[i] + ' Recall: ' + str(recall))
        print(" ")
        
    return X_test, y_test

# Load NLP Data

In [12]:
unigram = loadNLPVectors(unigram_array)

In [13]:
bigram = loadNLPVectors(bigram_array)

In [14]:
tfidf = loadNLPVectors(tfidf_array)

In [15]:
word2vec = loadNLPVectors(wordvec_array)

In [16]:
reduced_unigram = loadNLPVectors(unigram_reduced)

In [17]:
reduced_bigram = loadNLPVectors(bigram_reduced)

In [18]:
reduced_tfidf = loadNLPVectors(tfidf_reduced)

In [19]:
labels = loadLabels()

In [20]:
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

# Reduced_TFIDF Classifiers

In [21]:
reduced_tfidf_clf, clf_names = genClassifiers(neg = True)

In [22]:
x_rtfidf, y_rtfidf = evaluate(reduced_tfidf_clf, clf_names, reduced_tfidf)

Random Forest: Completed in 1.3704018592834473 seconds
Random Forest Accuracy: 0.8673587081891581
Random Forest ROC AUC Score: 0.8659922209376764
Random Forest F Score: 0.8565488565488565
Random Forest Precision: 0.9050966608084359
Random Forest Recall: 0.8129439621152328
 
Logistic Regression: Completed in 0.2952888011932373 seconds
Logistic Regression Accuracy: 0.9015763168012303
Logistic Regression ROC AUC Score: 0.8997667701271701
Logistic Regression F Score: 0.8914334181509754
Logistic Regression Precision: 0.9633363886342805
Logistic Regression Recall: 0.829518547750592
 


  linalg.lstsq(X, y)


Linear Regression: Completed in 0.24732017517089844 seconds
Linear Regression R^2 Score: 0.4933942699678997
 
SVM: Completed in 170.67447996139526 seconds
SVM Accuracy: 0.6197616301422529
SVM ROC AUC Score: 0.6097277919840395
SVM F Score: 0.36069812540400775
SVM Precision: 0.9964285714285714
SVM Recall: 0.2202052091554854
 


# Reduced_Unigram Classifiers

In [23]:
reduced_unigram_clf, clf_names = genClassifiers(neg = True)

In [24]:
x_runi, y_runi = evaluate(reduced_unigram_clf, clf_names, reduced_unigram)

Random Forest: Completed in 1.428300142288208 seconds
Random Forest Accuracy: 0.8696655132641292
Random Forest ROC AUC Score: 0.8682807372951251
Random Forest F Score: 0.8589263420724095
Random Forest Precision: 0.9084507042253521
Random Forest Recall: 0.8145224940805051
 
Logistic Regression: Completed in 0.45800113677978516 seconds
Logistic Regression Accuracy: 0.9115724721261054
Logistic Regression ROC AUC Score: 0.9101659706847445
Logistic Regression F Score: 0.9040867389491243
Logistic Regression Precision: 0.9584438549955792
Logistic Regression Recall: 0.8555643251775849
 
Linear Regression: Completed in 0.2010180950164795 seconds
Linear Regression R^2 Score: 0.5372764672085395
 
SVM: Completed in 102.91931104660034 seconds
SVM Accuracy: 0.8746635909265668
SVM ROC AUC Score: 0.8717658731802213
SVM F Score: 0.8551111111111112
SVM Precision: 0.9786368260427264
SVM Recall: 0.7592738752959748
 


# Reduced_Bigram Classifiers

In [25]:
reduced_bigram_clf, clf_names = genClassifiers(neg = True)

In [26]:
x_rbig, y_rbig = evaluate(reduced_bigram_clf, clf_names, reduced_bigram)

Random Forest: Completed in 2.1718571186065674 seconds
Random Forest Accuracy: 0.8612072279892349
Random Forest ROC AUC Score: 0.859717733871817
Random Forest F Score: 0.849143334726285
Random Forest Precision: 0.9023090586145648
Random Forest Recall: 0.8018942383583267
 
Logistic Regression: Completed in 1.2169358730316162 seconds
Logistic Regression Accuracy: 0.9142637447135717
Logistic Regression ROC AUC Score: 0.913186066792965
Logistic Regression F Score: 0.9082682023858494
Logistic Regression Precision: 0.9484536082474226
Logistic Regression Recall: 0.8713496448303079
 
Linear Regression: Completed in 0.8626759052276611 seconds
Linear Regression R^2 Score: 0.5708337151892493
 
SVM: Completed in 262.4953439235687 seconds
SVM Accuracy: 0.8469819300269127
SVM ROC AUC Score: 0.8432333754196304
SVM F Score: 0.8162511542012927
SVM Precision: 0.9833147942157954
SVM Recall: 0.6977111286503551
 


# Word2Vec Classifiers

In [27]:
word2vec_clf, clf_names = genClassifiers(neg = True)

In [28]:
x_vec, y_vec = evaluate(word2vec_clf, clf_names, word2vec)

Random Forest: Completed in 0.8266351222991943 seconds
Random Forest Accuracy: 0.5151864667435602
Random Forest ROC AUC Score: 0.5119212295983027
Random Forest F Score: 0.4362986142154671
Random Forest Precision: 0.5030927835051546
Random Forest Recall: 0.3851617995264404
 
Logistic Regression: Completed in 0.05595803260803223 seconds
Logistic Regression Accuracy: 0.512879661668589
Logistic Regression ROC AUC Score: 0.5
Logistic Regression F Score: 0.0
Logistic Regression Precision: 0.0
Logistic Regression Recall: 0.0
 
Linear Regression: Completed in 0.04768204689025879 seconds
Linear Regression R^2 Score: -0.005723566339111308
 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


SVM: Completed in 71.16608691215515 seconds
SVM Accuracy: 0.512879661668589
SVM ROC AUC Score: 0.5
SVM F Score: 0.0
SVM Precision: 0.0
SVM Recall: 0.0
 


# Raw TFIDF Classifier

In [29]:
tfidf_clf, clf_names = genClassifiers(filter = [2, 4])

In [30]:
x_tfidf, y_tfidf = evaluate(tfidf_clf, clf_names, tfidf)

Random Forest: Completed in 12.608357906341553 seconds
Random Forest Accuracy: 0.8954248366013072
Random Forest ROC AUC Score: 0.8939085114112242
Random Forest F Score: 0.8860971524288107
Random Forest Precision: 0.943800178412132
Random Forest Recall: 0.835043409629045
 
Logistic Regression: Completed in 0.22922515869140625 seconds
Logistic Regression Accuracy: 0.9042675893886967
Logistic Regression ROC AUC Score: 0.9026283030544713
Logistic Regression F Score: 0.895157894736842
Logistic Regression Precision: 0.9593862815884476
Logistic Regression Recall: 0.8389897395422258
 
Naive Bayes: Completed in 0.48565196990966797 seconds
Naive Bayes Accuracy: 0.851595540176855
Naive Bayes ROC AUC Score: 0.8517744876575131
Naive Bayes F Score: 0.849336455893833
Naive Bayes Precision: 0.8401544401544402
Naive Bayes Recall: 0.8587213891081295
 


# Raw Unigram Classifier

In [31]:
unigram_clf, clf_names = genClassifiers(filter = [2, 4])

In [32]:
x_uni, y_uni = evaluate(unigram_clf, clf_names, unigram)

Random Forest: Completed in 12.03729510307312 seconds
Random Forest Accuracy: 0.889273356401384
Random Forest ROC AUC Score: 0.8882286362738127
Random Forest F Score: 0.8817733990147782
Random Forest Precision: 0.9187339606501284
Random Forest Recall: 0.8476716653512234
 
Logistic Regression: Completed in 1.489346981048584 seconds
Logistic Regression Accuracy: 0.9154171472510573
Logistic Regression ROC AUC Score: 0.9144294269597639
Logistic Regression F Score: 0.9098360655737705
Logistic Regression Precision: 0.9462915601023018
Logistic Regression Recall: 0.8760852407261247
 
Naive Bayes: Completed in 1.4319968223571777 seconds
Naive Bayes Accuracy: 0.8650519031141869
Naive Bayes ROC AUC Score: 0.8655470015584157
Naive Bayes F Score: 0.8646355572695719
Naive Bayes Precision: 0.8453996983408748
Naive Bayes Recall: 0.8847671665351223
 


# Raw Bigram Classifier

In [33]:
bigram_clf, clf_names = genClassifiers(filter = [2, 4])

In [34]:
x_big, y_big = evaluate(bigram_clf, clf_names, bigram)

Random Forest: Completed in 84.75579595565796 seconds
Random Forest Accuracy: 0.8904267589388697
Random Forest ROC AUC Score: 0.8889566661026235
Random Forest F Score: 0.880902632678646
Random Forest Precision: 0.9360568383658969
Random Forest Recall: 0.8318863456985004
 
Logistic Regression: Completed in 27.815634965896606 seconds
Logistic Regression Accuracy: 0.9150326797385621
Logistic Regression ROC AUC Score: 0.9139356919803714
Logistic Regression F Score: 0.9090160559901194
Logistic Regression Precision: 0.9500860585197934
Logistic Regression Recall: 0.8713496448303079
 
Naive Bayes: Completed in 28.647745847702026 seconds
Naive Bayes Accuracy: 0.8669742406766628
Naive Bayes ROC AUC Score: 0.8677580112863853
Naive Bayes F Score: 0.8680396643783371
Naive Bayes Precision: 0.8398523985239852
Naive Bayes Recall: 0.8981846882399369
 


# Saving Models

In [35]:
from sklearn.externals import joblib

In [36]:
path = os.getcwd()
folderpath = path + "/models"
os.mkdir(folderpath)

In [37]:
def saveModels(features, feature_names, classifiers, X_array, y_array):
    for i in range(0, len(features)):
        feat = features[i]
        os.mkdir(folderpath + feature_names[i])
        for j in range(0, len(classifiers)):
            clas = feat[j]
            filepath = folderpath + feature_names[i] + classifiers[j] + ".pkl"
            joblib.dump(clas, filepath)
            
            # test to see if models correctly saved
            clas_load = joblib.load(filepath)
            X = X_array[i]
            y = y_array[i]
            
            assert clas.score(X, y) == clas_load.score(X, y)

In [38]:
f_list1 = [reduced_tfidf_clf, reduced_unigram_clf, reduced_bigram_clf, word2vec_clf]
f_list2 = [tfidf_clf, unigram_clf, bigram_clf]

fname_list1 = ["/reduced_tfidf", "/reduced_unigram", "/reduced_bigram", "/word2vec"]
fname_list2 = ["/tfidf", "/unigram", "/bigram"]

c_list1 = ["/rand_forest", "/log_reg", "/lin_reg", "/svm"]
c_list2 = ["/rand_forest", "/log_reg", "/naive_bayes"]

fx_list1 = [x_rtfidf, x_runi, x_rbig, x_vec]
fy_list1 = [y_rtfidf, y_runi, y_rbig, y_vec]

fx_list2 = [x_tfidf, x_uni, x_big]
fy_list2 = [y_tfidf, y_uni, y_big]

In [39]:
saveModels(f_list1, fname_list1, c_list1, fx_list1, fy_list1)
saveModels(f_list2, fname_list2, c_list2, fx_list2, fy_list2)