In [None]:
import numpy as np
import pandas as pd
import time
import os

In [None]:
def loadCSV(filename):
    """Read a CSV file into a DataFrame, appending '.csv' when it is missing.

    Parameters
    ----------
    filename : str
        Path to the file, with or without a trailing '.csv' extension.

    Returns
    -------
    pandas.DataFrame
        The parsed file, decoded as ISO-8859-1.
    """
    # Check the suffix rather than mere containment: a path such as
    # 'exports.csv_old/data' contains '.csv' but still needs the extension.
    # The check is case-insensitive so 'DATA.CSV' is not turned into
    # 'DATA.CSV.csv'.
    if filename.lower().endswith('.csv'):
        file = filename
    else:
        file = filename + '.csv'
    return pd.read_csv(file, encoding='ISO-8859-1')

In [None]:
def loadNLPVectors(folder, filename):
    """Load the NumPy array stored at '<folder>/<filename>.npy'.

    Both arguments are strings; the '.npy' extension is always appended.
    """
    npy_path = '/'.join([folder, filename + '.npy'])
    vectors = np.load(npy_path)
    return vectors

In [None]:
def loadData(name):
    """Load the train and test splits for one feature representation.

    For each of the 'train' and 'test' folders this reads '<name>.npy'
    (the features) followed by 'labels.npy' (the targets).

    Returns
    -------
    (train, test)
        Each is a two-element list ``[X, y]``.
    """
    splits = []
    for folder in ('train', 'test'):
        features = loadNLPVectors(folder, name)
        labels = loadNLPVectors(folder, 'labels')
        splits.append([features, labels])

    train, test = splits
    return train, test

In [None]:
# Base name of the dataset CSV; loadCSV appends the '.csv' extension itself.
# NOTE(review): `data` is not referenced by any later cell visible in this notebook —
# confirm whether this load is still needed.
csvFile = "binary_classification"
data = loadCSV(csvFile)

# Create Machine Learning Classifiers

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
def genClassifiers(filter=(), random_state=None):
    """Build fresh, unfitted classifiers, optionally leaving some out.

    The candidates, by index, are:
        0 - 'Random Forest'        (RandomForestClassifier)
        1 - 'Logistic Regression'  (LogisticRegression)
        2 - 'Naive Bayes'          (MultinomialNB)
        3 - 'SVM'                  (SVC with probability=True)

    Parameters
    ----------
    filter : iterable of int, optional
        Indices of candidates to exclude. Defaults to excluding nothing.
        (The name shadows the builtin but is kept for keyword callers.)
    random_state : int or None, optional
        Seed forwarded to the estimators that accept one (random forest,
        logistic regression, SVM) so runs can be reproduced. The default
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    (classifiers, names)
        Two parallel lists, in candidate order, with excluded entries removed.
    """
    # An immutable default replaces the original mutable `[]`; None is also
    # accepted and means "exclude nothing".
    excluded = () if filter is None else filter

    candidates = [
        ('Random Forest', RandomForestClassifier(random_state=random_state)),
        ('Logistic Regression', LogisticRegression(random_state=random_state)),
        ('Naive Bayes', MultinomialNB()),
        # probability=True makes predict_proba available on the fitted SVC.
        ('SVM', SVC(probability=True, random_state=random_state)),
    ]

    filtered_classifiers = []
    filtered_names = []
    for index, (name, clf) in enumerate(candidates):
        if index not in excluded:
            filtered_classifiers.append(clf)
            filtered_names.append(name)

    return filtered_classifiers, filtered_names

In [None]:
def evaluate(clf_array, clf_names, train, test):
    """Fit each classifier on the training split and print test-set metrics.

    Parameters
    ----------
    clf_array : list
        Unfitted scikit-learn classifiers; each one is fitted in place.
    clf_names : list of str
        Display names, parallel to ``clf_array``.
    train, test : sequence
        ``[X, y]`` pairs as returned by ``loadData``.

    Returns
    -------
    (X_test, y_test)
        The test split, handed back unchanged.

    Notes
    -----
    f1/precision/recall are called with scikit-learn's defaults, which assume
    binary labels. ROC AUC is computed from continuous scores
    (``predict_proba`` for the second class, else ``decision_function``)
    rather than from hard predictions, which would reduce the ROC curve to a
    single operating point.
    """
    X_train, y_train = train[0], train[1]
    X_test, y_test = test[0], test[1]

    for i, clf in enumerate(clf_array):
        name = clf_names[i]

        # perf_counter is monotonic, so the timing is unaffected by clock changes.
        start = time.perf_counter()
        clf.fit(X_train, y_train)
        elapsed = time.perf_counter() - start

        y_pred = clf.predict(X_test)

        # Prefer a continuous score for ROC AUC; column 1 of predict_proba is
        # the probability of classes_[1], i.e. the greater label.
        if hasattr(clf, 'predict_proba'):
            y_score = clf.predict_proba(X_test)[:, 1]
        elif hasattr(clf, 'decision_function'):
            y_score = clf.decision_function(X_test)
        else:
            y_score = y_pred

        accuracy = accuracy_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_score)
        f_score = f1_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)

        print(name + ': Completed in ' + str(elapsed) + ' seconds')
        print(name + ' Accuracy: ' + str(accuracy))
        print(name + ' ROC AUC Score: ' + str(roc_auc))
        print(name + ' F Score: ' + str(f_score))
        print(name + ' Precision: ' + str(precision))
        print(name + ' Recall: ' + str(recall))
        print(" ")

    return X_test, y_test

# Load NLP Training/Testing Data

In [None]:
# 'unigram' features: loadData reads train/unigram.npy and test/unigram.npy plus
# labels.npy from each folder; each result is an [X, y] pair.
train_uni, test_uni = loadData('unigram')

In [None]:
# 'bigram' features: train/bigram.npy and test/bigram.npy, each paired with that folder's labels.npy.
train_big, test_big = loadData('bigram')

In [None]:
# 'tfidf' features: train/tfidf.npy and test/tfidf.npy, each paired with that folder's labels.npy.
train_tfidf, test_tfidf = loadData('tfidf')

In [None]:
# 'word2vec' features: train/word2vec.npy and test/word2vec.npy, each paired with that folder's labels.npy.
train_word2vec, test_word2vec = loadData('word2vec')

In [None]:
# 'runigram' features (used in the "Reduced_Unigram Classifiers" section):
# train/runigram.npy and test/runigram.npy, each paired with that folder's labels.npy.
train_runi, test_runi = loadData('runigram')

In [None]:
# 'rbigram' features (used in the "Reduced_Bigram Classifiers" section):
# train/rbigram.npy and test/rbigram.npy, each paired with that folder's labels.npy.
train_rbig, test_rbig = loadData('rbigram')

In [None]:
# 'rtfidf' features (used in the "Reduced_TFIDF Classifiers" section):
# train/rtfidf.npy and test/rtfidf.npy, each paired with that folder's labels.npy.
train_rtfidf, test_rtfidf = loadData('rtfidf')

# Train Classifiers

In [None]:
import warnings
# Ignore warnings issued from modules matching "scipy" whose message starts with
# "internal gelsd".
# NOTE(review): presumably this targets the LAPACK gelsd warning surfaced through
# scikit-learn — confirm it is still needed with the installed versions.
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

## Reduced_TFIDF Classifiers

In [None]:
# filter=[2] drops index 2 ('Naive Bayes'), leaving Random Forest, Logistic Regression and SVM.
# NOTE(review): presumably Naive Bayes is skipped because MultinomialNB rejects negative
# feature values — confirm against the reduced feature matrices.
# clf_names is a shared global that every section reassigns, so run this cell right before
# the evaluate cell that follows it.
rtfidf_clf, clf_names = genClassifiers(filter = [2])

In [None]:
# Fits every classifier in rtfidf_clf in place and prints its metrics; evaluate returns the
# test features/labels, which are kept for the reload check done when the models are saved.
x_rtfidf, y_rtfidf = evaluate(rtfidf_clf, clf_names, train_rtfidf, test_rtfidf)

## Reduced_Unigram Classifiers

In [None]:
# filter=[2] drops 'Naive Bayes'; the result is Random Forest, Logistic Regression and SVM.
# Reassigns the shared clf_names global.
runi_clf, clf_names = genClassifiers(filter = [2])

In [None]:
# Fits runi_clf in place and prints metrics; the returned test split is reused when saving models.
x_runi, y_runi = evaluate(runi_clf, clf_names, train_runi, test_runi)

## Reduced_Bigram Classifiers

In [None]:
# filter=[2] drops 'Naive Bayes'; the result is Random Forest, Logistic Regression and SVM.
# Reassigns the shared clf_names global.
rbig_clf, clf_names = genClassifiers(filter = [2])

In [None]:
# Fits rbig_clf in place and prints metrics; the returned test split is reused when saving models.
x_rbig, y_rbig = evaluate(rbig_clf, clf_names, train_rbig, test_rbig)

## Word2Vec Classifiers

In [None]:
# filter=[2] drops 'Naive Bayes'; the result is Random Forest, Logistic Regression and SVM.
# Reassigns the shared clf_names global.
vec_clf, clf_names = genClassifiers(filter = [2])

In [None]:
# Fits vec_clf in place on the word2vec features and prints metrics; the returned test split
# is reused when saving models.
x_vec, y_vec = evaluate(vec_clf, clf_names, train_word2vec, test_word2vec)

## Raw TFIDF Classifiers

In [None]:
# filter=[3] drops index 3 ('SVM'), leaving Random Forest, Logistic Regression and Naive Bayes.
# NOTE(review): presumably SVM is skipped on the raw features because of training cost — confirm.
# Reassigns the shared clf_names global, which now differs from the reduced-feature sections.
tfidf_clf, clf_names = genClassifiers(filter = [3])

In [None]:
# Fits tfidf_clf in place and prints metrics; the returned test split is reused when saving models.
x_tfidf, y_tfidf = evaluate(tfidf_clf, clf_names, train_tfidf, test_tfidf)

## Raw Unigram Classifiers

In [None]:
# filter=[3] drops 'SVM'; the result is Random Forest, Logistic Regression and Naive Bayes.
# Reassigns the shared clf_names global.
uni_clf, clf_names = genClassifiers(filter = [3])

In [None]:
# Fits uni_clf in place and prints metrics; the returned test split is reused when saving models.
x_uni, y_uni = evaluate(uni_clf, clf_names, train_uni, test_uni)

## Raw Bigram Classifiers

In [None]:
# filter=[3] drops 'SVM'; the result is Random Forest, Logistic Regression and Naive Bayes.
# Reassigns the shared clf_names global.
big_clf, clf_names = genClassifiers(filter = [3])

In [None]:
# Fits big_clf in place and prints metrics; the returned test split is reused when saving models.
x_big, y_big = evaluate(big_clf, clf_names, train_big, test_big)

# Saving Models

In [None]:
# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# prefer the standalone joblib package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
# Root directory for the pickled models, created under the current working directory.
path = os.getcwd()
folderpath = path + "/models"
# exist_ok=True keeps this cell re-runnable: os.mkdir raised FileExistsError
# whenever the directory was left over from a previous run.
os.makedirs(folderpath, exist_ok=True)

In [None]:
def saveModels(features, feature_names, classifiers, X_array, y_array):
    """Pickle every fitted model and check that each one reloads correctly.

    Models are written to ``folderpath + feature_name + classifier_name + ".pkl"``
    using the module-level ``folderpath``; the name strings are concatenated
    as-is, so each must start with its own '/' separator.

    Parameters
    ----------
    features : list of list
        One list of fitted classifiers per feature representation.
    feature_names : list of str
        Sub-directory name per feature representation, parallel to ``features``.
    classifiers : list of str
        File stem per classifier, parallel to each inner list of ``features``.
    X_array, y_array : list
        Test features / labels per feature representation, used to compare
        the score of each model before and after the pickle round trip.

    Raises
    ------
    AssertionError
        If a reloaded model scores differently from the in-memory one.
    """
    for i, fitted_models in enumerate(features):
        feature_dir = folderpath + feature_names[i]
        # makedirs(exist_ok=True) lets the function be re-run; os.mkdir raised
        # FileExistsError once the directory existed.
        os.makedirs(feature_dir, exist_ok=True)

        X = X_array[i]
        y = y_array[i]

        for j, clf_suffix in enumerate(classifiers):
            model = fitted_models[j]
            filepath = feature_dir + clf_suffix + ".pkl"
            joblib.dump(model, filepath)

            # Round-trip check: the file loaded here was written just above,
            # so unpickling it is safe.
            reloaded = joblib.load(filepath)
            assert model.score(X, y) == reloaded.score(X, y), \
                "reloaded model at " + filepath + " scores differently"

In [None]:
# Inputs for saveModels, split into two groups because the groups hold different classifiers.
# Group 1: features whose classifiers were built with filter=[2] (no Naive Bayes).
# Group 2: features whose classifiers were built with filter=[3] (no SVM).
f_list1 = [rtfidf_clf, runi_clf, rbig_clf, vec_clf]
f_list2 = [tfidf_clf, uni_clf, big_clf]

# Sub-directory per feature set; the leading '/' is required because saveModels
# concatenates these directly onto folderpath.
fname_list1 = ["/rtfidf", "/runi", "/rbig", "/word2vec"]
fname_list2 = ["/tfidf", "/uni", "/big"]

# File stem per classifier; the order must match the order genClassifiers returns
# (Random Forest, Logistic Regression, then SVM or Naive Bayes).
c_list1 = ["/rand_forest", "/log_reg", "/svm"]
c_list2 = ["/rand_forest", "/log_reg", "/naive_bayes"]

# Test splits returned by evaluate, in the same order as the f_list entries.
fx_list1 = [x_rtfidf, x_runi, x_rbig, x_vec]
fy_list1 = [y_rtfidf, y_runi, y_rbig, y_vec]

fx_list2 = [x_tfidf, x_uni, x_big]
fy_list2 = [y_tfidf, y_uni, y_big]

In [None]:
# Persist both groups: one .pkl per (feature set, classifier) pair, each checked by reloading
# it and comparing its score on the matching test split.
saveModels(f_list1, fname_list1, c_list1, fx_list1, fy_list1)
saveModels(f_list2, fname_list2, c_list2, fx_list2, fy_list2)