In [1]:
import numpy as np
import pandas as pd
import time
import os

In [2]:
def loadCSV(filename):
    """Read a CSV file into a DataFrame, adding the ``.csv`` suffix if absent.

    Parameters
    ----------
    filename : str
        Path to the file, with or without a trailing ``.csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, decoded as ISO-8859-1.
    """
    file = filename
    # Test the suffix rather than mere containment: a path such as
    # "run.csv_exports/sample" contains ".csv" but still needs the extension.
    if not filename.endswith('.csv'):
        file += '.csv'
    return pd.read_csv(file, encoding = 'ISO-8859-1')

In [3]:
def loadNLPVectors(filename):
    """Load the NumPy array saved as ``nlp_data/<filename>.npy``.

    ``filename`` is the bare array name: the ``nlp_data/`` directory prefix
    and the ``.npy`` extension are added here.
    """
    npy_path = ''.join(['nlp_data/', filename, '.npy'])
    vectors = np.load(npy_path)
    return vectors

In [4]:
def loadLabels():
    """Load the array stored under the name ``labels`` via ``loadNLPVectors``."""
    label_vectors = loadNLPVectors("labels")
    return label_vectors

In [5]:
# Read binary_classification.csv (loadCSV appends the ".csv" extension).
# NOTE(review): `data` is not referenced again in the visible cells -- confirm
# it is still needed before keeping this load.
csvFile = "binary_classification"
data = loadCSV(csvFile)

In [6]:
# Base names of the saved feature arrays. loadNLPVectors expands each one to
# "nlp_data/<name>.npy", so neither directory nor extension appears here.
unigram_array = "feature_array_unigram"
bigram_array = "feature_array_bigram"
tfidf_array = "feature_array_tfidf"
wordvec_array = "feature_array_word2vec"
# Base names of the "reduced" variants of the unigram, bigram and TF-IDF sets.
# NOTE(review): how the reduction was produced is not shown in this notebook.
unigram_reduced = "reduced_unigram"
bigram_reduced = "reduced_bigram"
tfidf_reduced = "reduced_tfidf"

# Create Machine Learning Classifiers

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB

In [9]:
def genClassifiers(neg = False, filter = []):
    """Create a fresh set of unfitted models plus their display names.

    The full lineup, in index order, is Random Forest, Logistic Regression,
    Linear Regression, Naive Bayes (left out when ``neg`` is true) and SVM.
    Every position listed in ``filter`` is then removed. Those positions
    refer to the lineup *after* the ``neg`` decision, so SVM sits at index 4
    by default but at index 3 when ``neg`` is true.

    Returns
    -------
    tuple of (list, list)
        The kept estimators and, in the same order, their names.
    """
    lineup = [
        ('Random Forest', RandomForestClassifier()),
        ('Logistic Regression', LogisticRegression()),
        ('Linear Regression', LinearRegression()),
    ]
    if not neg:
        lineup.append(('Naive Bayes', MultinomialNB()))
    lineup.append(('SVM', SVC(probability = True)))

    # Keep only the positions the caller did not ask to drop.
    kept = [pair for position, pair in enumerate(lineup) if position not in filter]
    filtered_names = [label for label, _ in kept]
    filtered_classifiers = [model for _, model in kept]
    return filtered_classifiers, filtered_names

In [10]:
def dataSplit(nlp, y = None):
    """Split features and labels into a stratified 80/20 train/test pair.

    Parameters
    ----------
    nlp : array-like
        Feature matrix to split.
    y : array-like, optional
        Labels to split alongside ``nlp`` and to stratify on. When omitted,
        the notebook-global ``labels`` is used, exactly as before this
        parameter existed.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # The global `labels` is assigned in a later cell than this definition.
    # It is resolved here at call time, so the existing run order still
    # works, while callers can now pass labels explicitly instead of relying
    # on kernel state.
    targets = labels if y is None else y
    X_train, X_test, y_train, y_test = train_test_split(nlp,
                                                        targets,
                                                        test_size = 0.2,
                                                        random_state = 42,  # fixed seed: same split every run
                                                        shuffle = True,
                                                        stratify = targets)
    return X_train, X_test, y_train, y_test

In [11]:
def evaluate(clf_array, clf_names, nlp):
    """Fit every estimator on a train split of ``nlp`` and print its test score.

    Parameters
    ----------
    clf_array : list
        Unfitted estimators; each one is fitted in place.
    clf_names : list of str
        Display names, parallel to ``clf_array``.
    nlp : array-like
        Feature matrix handed to ``dataSplit``.

    Returns
    -------
    tuple
        ``(X_test, y_test)``, the held-out split the scores were computed on.
    """
    # Imported locally so this cell stays self-contained; scikit-learn is
    # already a dependency of the notebook.
    from sklearn.base import is_classifier

    X_train, X_test, y_train, y_test = dataSplit(nlp)
    for i, clf in enumerate(clf_array):
        name = clf_names[i]

        # perf_counter is a monotonic clock meant for measuring intervals;
        # time.time() can jump if the system clock is adjusted mid-fit.
        start = time.perf_counter()
        clf.fit(X_train, y_train)
        elapsed = time.perf_counter() - start

        score = clf.score(X_test, y_test)
        # score() is mean accuracy for classifiers but R^2 for regressors
        # such as LinearRegression (which is why it can be negative), so the
        # printed label has to follow the estimator type.
        metric = 'Accuracy' if is_classifier(clf) else 'R^2'

        print(name + ': Done in ' + str(elapsed))
        print(name + ' ' + metric + ': ' + str(score))
        print(" ")
    return X_test, y_test

# Load NLP Data

In [12]:
# Raw unigram features, read from nlp_data/feature_array_unigram.npy.
unigram = loadNLPVectors(unigram_array)

In [13]:
# Raw bigram features, read from nlp_data/feature_array_bigram.npy.
bigram = loadNLPVectors(bigram_array)

In [14]:
# Raw TF-IDF features, read from nlp_data/feature_array_tfidf.npy.
tfidf = loadNLPVectors(tfidf_array)

In [15]:
# word2vec features, read from nlp_data/feature_array_word2vec.npy.
word2vec = loadNLPVectors(wordvec_array)

In [16]:
# Reduced unigram features, read from nlp_data/reduced_unigram.npy.
reduced_unigram = loadNLPVectors(unigram_reduced)

In [17]:
# Reduced bigram features, read from nlp_data/reduced_bigram.npy.
reduced_bigram = loadNLPVectors(bigram_reduced)

In [18]:
# Reduced TF-IDF features, read from nlp_data/reduced_tfidf.npy.
reduced_tfidf = loadNLPVectors(tfidf_reduced)

In [19]:
# Target labels, read from nlp_data/labels.npy. dataSplit reads this global
# directly, so this cell must run before any evaluate() call below.
labels = loadLabels()

In [20]:
import warnings
# Silence warnings raised from scipy modules whose message starts with
# "internal gelsd".
# NOTE(review): presumably the least-squares driver notice triggered by
# LinearRegression.fit -- confirm it still fires with the installed versions.
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

# Reduced_TFIDF Classifiers

In [21]:
# neg = True leaves Naive Bayes out, so the lineup is Random Forest,
# Logistic Regression, Linear Regression, SVM.
# NOTE(review): presumably because the reduced features can be negative,
# which MultinomialNB does not accept -- confirm.
reduced_tfidf_clf, clf_names = genClassifiers(neg = True)

In [22]:
# Fit and score every model on the reduced TF-IDF features. The returned
# held-out split is kept for the reload check in saveModels.
x_rtfidf, y_rtfidf = evaluate(reduced_tfidf_clf, clf_names, reduced_tfidf)

Random Forest: Done in 1.3732631206512451
Random Forest Accuracy: 0.8665897731641676
 
Logistic Regression: Done in 0.28240203857421875
Logistic Regression Accuracy: 0.9015763168012303
 


  linalg.lstsq(X, y)


Linear Regression: Done in 0.337540864944458
Linear Regression Accuracy: 0.4933942699678997
 
SVM: Done in 181.7028021812439
SVM Accuracy: 0.6197616301422529
 


# Reduced_Unigram Classifiers

In [23]:
# Fresh, unfitted lineup for the reduced unigram features (Naive Bayes
# excluded via neg = True). Note that clf_names is overwritten here.
reduced_unigram_clf, clf_names = genClassifiers(neg = True)

In [24]:
# Fit and score on the reduced unigram features; keep the held-out split
# for the reload check in saveModels.
x_runi, y_runi = evaluate(reduced_unigram_clf, clf_names, reduced_unigram)

Random Forest: Done in 1.3829059600830078
Random Forest Accuracy: 0.8738946559015763
 
Logistic Regression: Done in 0.4620938301086426
Logistic Regression Accuracy: 0.9115724721261054
 
Linear Regression: Done in 0.202254056930542
Linear Regression Accuracy: 0.5372764672085395
 
SVM: Done in 105.80853796005249
SVM Accuracy: 0.8746635909265668
 


# Reduced_Bigram Classifiers

In [25]:
# Fresh, unfitted lineup for the reduced bigram features (Naive Bayes
# excluded via neg = True).
reduced_bigram_clf, clf_names = genClassifiers(neg = True)

In [26]:
# Fit and score on the reduced bigram features; keep the held-out split
# for the reload check in saveModels.
x_rbig, y_rbig = evaluate(reduced_bigram_clf, clf_names, reduced_bigram)

Random Forest: Done in 2.2315099239349365
Random Forest Accuracy: 0.8489042675893888
 
Logistic Regression: Done in 1.2011420726776123
Logistic Regression Accuracy: 0.9142637447135717
 
Linear Regression: Done in 0.9602310657501221
Linear Regression Accuracy: 0.5708337151892493
 
SVM: Done in 268.7503807544708
SVM Accuracy: 0.8469819300269127
 


# Word2Vec Classifiers

In [27]:
# Fresh, unfitted lineup for the word2vec features (Naive Bayes excluded
# via neg = True).
word2vec_clf, clf_names = genClassifiers(neg = True)

In [28]:
# Fit and score on the word2vec features; keep the held-out split for the
# reload check in saveModels.
# NOTE(review): the recorded run scores roughly 0.51-0.52 for every
# classifier here -- worth checking how the word2vec array was built.
x_vec, y_vec = evaluate(word2vec_clf, clf_names, word2vec)

Random Forest: Done in 0.7857472896575928
Random Forest Accuracy: 0.5217224144559784
 
Logistic Regression: Done in 0.05181312561035156
Logistic Regression Accuracy: 0.512879661668589
 
Linear Regression: Done in 0.020402908325195312
Linear Regression Accuracy: -0.005723566339111308
 
SVM: Done in 71.41576504707336
SVM Accuracy: 0.512879661668589
 


# Raw TFIDF Classifiers

In [29]:
# filter = [2, 4] removes lineup positions 2 (Linear Regression) and 4 (SVM),
# leaving Random Forest, Logistic Regression and Naive Bayes.
tfidf_clf, clf_names = genClassifiers(filter = [2, 4])

In [30]:
# Fit and score on the raw TF-IDF features; keep the held-out split for the
# reload check in saveModels.
x_tfidf, y_tfidf = evaluate(tfidf_clf, clf_names, tfidf)

Random Forest: Done in 12.994780778884888
Random Forest Accuracy: 0.9015763168012303
 
Logistic Regression: Done in 0.23184800148010254
Logistic Regression Accuracy: 0.9042675893886967
 
Naive Bayes: Done in 0.5647330284118652
Naive Bayes Accuracy: 0.851595540176855
 


# Raw Unigram Classifiers

In [31]:
# Same three-model lineup as for raw TF-IDF: positions 2 (Linear Regression)
# and 4 (SVM) are filtered out.
unigram_clf, clf_names = genClassifiers(filter = [2, 4])

In [32]:
# Fit and score on the raw unigram features; keep the held-out split for the
# reload check in saveModels.
x_uni, y_uni = evaluate(unigram_clf, clf_names, unigram)

Random Forest: Done in 12.31935977935791
Random Forest Accuracy: 0.8965782391387928
 
Logistic Regression: Done in 1.5281178951263428
Logistic Regression Accuracy: 0.9154171472510573
 
Naive Bayes: Done in 1.3956730365753174
Naive Bayes Accuracy: 0.8650519031141869
 


# Raw Bigram Classifiers

In [33]:
# Same three-model lineup again: positions 2 (Linear Regression) and 4 (SVM)
# are filtered out.
bigram_clf, clf_names = genClassifiers(filter = [2, 4])

In [34]:
# Fit and score on the raw bigram features; keep the held-out split for the
# reload check in saveModels.
x_big, y_big = evaluate(bigram_clf, clf_names, bigram)

Random Forest: Done in 82.37594294548035
Random Forest Accuracy: 0.8935024990388312
 
Logistic Regression: Done in 29.032280683517456
Logistic Regression Accuracy: 0.9150326797385621
 
Naive Bayes: Done in 28.650110006332397
Naive Bayes Accuracy: 0.8669742406766628
 


# Saving Models

In [35]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; joblib now ships as a standalone package. Prefer that, and fall back
# to the vendored copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [36]:
# Root directory for the saved models: "<current working directory>/models".
path = os.getcwd()
folderpath = path + "/models"
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError as soon as the directory exists (e.g. on the second
# Restart & Run All).
os.makedirs(folderpath, exist_ok=True)

In [37]:
def saveModels(features, feature_names, classifiers, X_array, y_array):
    """Write every model to disk and check that each file round-trips.

    For feature set ``i`` the directory ``folderpath + feature_names[i]`` is
    created, and model ``j`` of that set is dumped to
    ``folderpath + feature_names[i] + classifiers[j] + ".pkl"``.

    Parameters
    ----------
    features : list of list
        ``features[i][j]`` is the estimator for feature set ``i``, model ``j``.
    feature_names : list of str
        Directory component per feature set, including its leading "/".
    classifiers : list of str
        File-name component per model, including its leading "/".
    X_array, y_array : list
        Per-feature-set data used to compare the saved and reloaded models.

    Raises
    ------
    AssertionError
        If a reloaded model scores differently from the in-memory one.
    """
    for i, feat in enumerate(features):
        # Plain concatenation on purpose: the name components start with "/",
        # and os.path.join would treat them as absolute and drop folderpath.
        feature_dir = folderpath + feature_names[i]
        # exist_ok=True so that re-running the notebook does not stop with
        # FileExistsError on directories left by a previous run.
        os.makedirs(feature_dir, exist_ok=True)

        # The evaluation data depends only on the feature set, so look it up
        # once per feature set rather than once per model.
        X = X_array[i]
        y = y_array[i]

        for j, clf_file in enumerate(classifiers):
            clas = feat[j]
            filepath = feature_dir + clf_file + ".pkl"
            joblib.dump(clas, filepath)

            # Test to see if the model was saved correctly. joblib.load
            # unpickles, so it must only be pointed at files written here.
            clas_load = joblib.load(filepath)
            assert clas.score(X, y) == clas_load.score(X, y), (
                "reloaded model scores differently: " + filepath)

In [38]:
# Group 1: lineups built with neg = True (Random Forest, Logistic Regression,
# Linear Regression, SVM). Group 2: lineups built with filter = [2, 4]
# (Random Forest, Logistic Regression, Naive Bayes).
f_list1 = [reduced_tfidf_clf, reduced_unigram_clf, reduced_bigram_clf, word2vec_clf]
f_list2 = [tfidf_clf, unigram_clf, bigram_clf]

# Directory names, parallel to f_list1 / f_list2. The leading "/" is required
# because saveModels concatenates them directly onto folderpath.
fname_list1 = ["/reduced_tfidf", "/reduced_unigram", "/reduced_bigram", "/word2vec"]
fname_list2 = ["/tfidf", "/unigram", "/bigram"]

# File names, which must follow the order genClassifiers returns the models
# in, since saveModels pairs them up by position.
c_list1 = ["/rand_forest", "/log_reg", "/lin_reg", "/svm"]
c_list2 = ["/rand_forest", "/log_reg", "/naive_bayes"]

# Held-out splits returned by evaluate(), parallel to f_list1.
fx_list1 = [x_rtfidf, x_runi, x_rbig, x_vec]
fy_list1 = [y_rtfidf, y_runi, y_rbig, y_vec]

# Held-out splits returned by evaluate(), parallel to f_list2.
fx_list2 = [x_tfidf, x_uni, x_big]
fy_list2 = [y_tfidf, y_uni, y_big]

In [39]:
# Persist both groups. Each call creates one sub-directory per feature set
# under folderpath and writes one .pkl file per model into it.
saveModels(f_list1, fname_list1, c_list1, fx_list1, fy_list1)
saveModels(f_list2, fname_list2, c_list2, fx_list2, fy_list2)