# Classification Method

For each of the 7 NLP methods:

    1) loadNLPVectors
    
    2) genLabels
    
    3) train_test_split: create X_train, X_test, y_train, y_test

Metrics:

    1) accuracy
    
    2) F-Score (precision, recall?)
    
    3) Area under ROC

For each machine learning algorithm, applied to each of the NLP methods:

    1) Import classifier
    
    2) cross_val_score: classifier, X_train, y_train, scoring (multiple metrics), cv = 10, n_jobs = -1
    
    3) average cross_val_score
    
    4) Train classifier on entirety of X_train, y_train
    
    5) Evaluate classifier on X_test, y_test
    
    6) Compare test metric vs cross_val_score metric
    
    7) Save trained model
    
    8) Generate confusion matrices and other visualizations if necessary

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
def loadCSV(filename):
    """Read a CSV file into a pandas DataFrame.

    Args:
        filename: Path to the file, with or without the ``.csv`` extension.
            The extension is appended when the path does not end with it.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with ISO-8859-1 decoding.
    """
    # Check the suffix, not substring membership: a path such as
    # "exports.csv_dump/sample" contains ".csv" yet still needs the
    # extension appended.
    path = filename if filename.endswith('.csv') else filename + '.csv'
    return pd.read_csv(path, encoding='ISO-8859-1')

In [3]:
def loadNLPVectors(filename):
    """Load a saved NumPy array from the ``nlp_data`` directory.

    Args:
        filename: Base name of the array file, without the ``.npy`` suffix.

    Returns:
        The result of ``np.load`` on ``nlp_data/<filename>.npy``.
    """
    npy_path = ''.join(('nlp_data/', filename, '.npy'))
    return np.load(npy_path)

In [4]:
def loadLabels():
    """Load the array saved under the base name "labels" via loadNLPVectors."""
    label_vectors = loadNLPVectors("labels")
    return label_vectors

In [5]:
# Base name of the dataset file; loadCSV adds the ".csv" extension.
csvFile = "binary_classification"
data = loadCSV(filename=csvFile)

In [6]:
# Base names (without the ".npy" suffix) of the saved arrays that are
# passed to loadNLPVectors.

# "feature_array_*" files.
unigram_array, bigram_array = "feature_array_unigram", "feature_array_bigram"
tfidf_array, wordvec_array = "feature_array_tfidf", "feature_array_word2vec"

# "reduced_*" files (presumably dimensionality-reduced versions of the
# arrays above -- confirm against the script that wrote them).
unigram_reduced, bigram_reduced = "reduced_unigram", "reduced_bigram"
tfidf_reduced = "reduced_tfidf"

# Create Machine Learning Classifiers

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB

  from numpy.core.umath_tests import inner1d


In [8]:
# One default-configured estimator per algorithm under comparison.
#
# lin_clf uses RidgeClassifier rather than LinearRegression: LinearRegression
# is a regressor, so its continuous predictions cannot be scored with the
# 'accuracy' / 'f1' classification metrics, and it offers neither
# decision_function nor predict_proba for 'roc_auc'. RidgeClassifier is the
# least-squares linear model in classifier form and supports all three.
from sklearn.linear_model import RidgeClassifier

svm_clf = SVC(probability=True)
rf_clf = RandomForestClassifier()
log_clf = LogisticRegression()
lin_clf = RidgeClassifier()
# NOTE(review): MultinomialNB rejects negative feature values -- confirm the
# word2vec and reduced arrays are non-negative before scoring it on them.
nb_clf = MultinomialNB()

# Load NLP Data

In [9]:
# Array stored at nlp_data/<unigram_array>.npy.
unigram = loadNLPVectors(filename=unigram_array)

In [10]:
# Array stored at nlp_data/<bigram_array>.npy.
bigram = loadNLPVectors(filename=bigram_array)

In [11]:
# Array stored at nlp_data/<tfidf_array>.npy.
tfidf = loadNLPVectors(filename=tfidf_array)

In [12]:
# Array stored at nlp_data/<wordvec_array>.npy.
word2vec = loadNLPVectors(filename=wordvec_array)

In [13]:
# Array stored at nlp_data/<unigram_reduced>.npy.
reduced_unigram = loadNLPVectors(filename=unigram_reduced)

In [14]:
# Array stored at nlp_data/<bigram_reduced>.npy.
reduced_bigram = loadNLPVectors(filename=bigram_reduced)

In [15]:
# Array stored at nlp_data/<tfidf_reduced>.npy.
reduced_tfidf = loadNLPVectors(filename=tfidf_reduced)

In [16]:
# Target array returned by loadLabels(); crossValidate reads this
# module-level name when scoring.
labels = loadLabels()

# Cross Validation Function

In [17]:
def crossValidate(classifiers, metric, nlp, folds, targets=None):
    """Cross-validate each classifier on one feature array.

    Args:
        classifiers: Iterable of estimators to score.
        metric: Scoring name passed to ``cross_val_score`` as ``scoring``.
        nlp: Feature array used as the samples.
        folds: Value passed to ``cross_val_score`` as ``cv``.
        targets: Optional target array. When omitted, the module-level
            ``labels`` array is used, so four-argument calls behave as
            before.

    Returns:
        A list holding one ``cross_val_score`` result per classifier, in
        the same order as ``classifiers``.
    """
    # Fall back to the global for existing callers; an explicit argument
    # lets callers score a subset (e.g. a training split) instead.
    y = labels if targets is None else targets
    return [
        cross_val_score(clf, nlp, y, scoring=metric, cv=folds, n_jobs=-1)
        for clf in classifiers
    ]

classifiers - list of the machine learning classifiers to cross-validate

metric - string naming the scoring metric to use

nlp - feature array for the natural language processing scheme to evaluate

folds - number of cross-validation folds

In [18]:
# Estimators handed to crossValidate; each result list follows this order.
clf_array = [
    svm_clf,
    rf_clf,
    log_clf,
    lin_clf,
    nb_clf,
]

In [19]:
# Scoring names supplied as the `metric` argument of crossValidate.
accuracy, f1_score, roc_auc = 'accuracy', 'f1', 'roc_auc'

# Cross Validation - Accuracy

In [None]:
# 5-fold accuracy scores for each classifier in clf_array on `unigram`.
acc_unigram_results = crossValidate(
    classifiers=clf_array, metric=accuracy, nlp=unigram, folds=5
)

In [None]:
# 5-fold accuracy scores for each classifier in clf_array on `bigram`.
acc_bigram_results = crossValidate(
    classifiers=clf_array, metric=accuracy, nlp=bigram, folds=5
)

In [None]:
# 5-fold accuracy scores for each classifier in clf_array on `tfidf`.
acc_tfidf_results = crossValidate(
    classifiers=clf_array, metric=accuracy, nlp=tfidf, folds=5
)

In [None]:
# 5-fold accuracy scores for each classifier in clf_array on `word2vec`.
acc_wordvec_results = crossValidate(
    classifiers=clf_array, metric=accuracy, nlp=word2vec, folds=5
)

In [None]:
# 5-fold accuracy scores for each classifier in clf_array on
# `reduced_unigram`.
acc_red_unigram_results = crossValidate(
    classifiers=clf_array, metric=accuracy, nlp=reduced_unigram, folds=5
)

In [None]:
# 5-fold accuracy scores for each classifier in clf_array on
# `reduced_bigram`.
acc_red_bigram_results = crossValidate(
    classifiers=clf_array, metric=accuracy, nlp=reduced_bigram, folds=5
)

In [None]:
# 5-fold accuracy scores for each classifier in clf_array on
# `reduced_tfidf`.
acc_red_tfidf_results = crossValidate(
    classifiers=clf_array, metric=accuracy, nlp=reduced_tfidf, folds=5
)

# Cross Validation - F1 Score

In [None]:
# 5-fold 'f1' scores for each classifier in clf_array on `unigram`.
f_unigram_results = crossValidate(
    classifiers=clf_array, metric=f1_score, nlp=unigram, folds=5
)

In [None]:
# 5-fold 'f1' scores for each classifier in clf_array on `bigram`.
f_bigram_results = crossValidate(
    classifiers=clf_array, metric=f1_score, nlp=bigram, folds=5
)

In [None]:
# 5-fold 'f1' scores for each classifier in clf_array on `tfidf`.
f_tfidf_results = crossValidate(
    classifiers=clf_array, metric=f1_score, nlp=tfidf, folds=5
)

In [None]:
# 5-fold 'f1' scores for each classifier in clf_array on `word2vec`.
f_wordvec_results = crossValidate(
    classifiers=clf_array, metric=f1_score, nlp=word2vec, folds=5
)

In [None]:
# 5-fold 'f1' scores for each classifier in clf_array on `reduced_unigram`.
f_red_unigram_results = crossValidate(
    classifiers=clf_array, metric=f1_score, nlp=reduced_unigram, folds=5
)

In [None]:
# 5-fold 'f1' scores for each classifier in clf_array on `reduced_bigram`.
f_red_bigram_results = crossValidate(
    classifiers=clf_array, metric=f1_score, nlp=reduced_bigram, folds=5
)

In [None]:
# 5-fold 'f1' scores for each classifier in clf_array on `reduced_tfidf`.
f_red_tfidf_results = crossValidate(
    classifiers=clf_array, metric=f1_score, nlp=reduced_tfidf, folds=5
)

# Cross Validation - ROC AUC Score

In [None]:
# 5-fold 'roc_auc' scores for each classifier in clf_array on `unigram`.
ra_unigram_results = crossValidate(
    classifiers=clf_array, metric=roc_auc, nlp=unigram, folds=5
)

In [None]:
# 5-fold 'roc_auc' scores for each classifier in clf_array on `bigram`.
ra_bigram_results = crossValidate(
    classifiers=clf_array, metric=roc_auc, nlp=bigram, folds=5
)

In [None]:
# 5-fold 'roc_auc' scores for each classifier in clf_array on `tfidf`.
ra_tfidf_results = crossValidate(
    classifiers=clf_array, metric=roc_auc, nlp=tfidf, folds=5
)

In [None]:
# 5-fold 'roc_auc' scores for each classifier in clf_array on `word2vec`.
ra_wordvec_results = crossValidate(
    classifiers=clf_array, metric=roc_auc, nlp=word2vec, folds=5
)

In [None]:
# 5-fold 'roc_auc' scores for each classifier in clf_array on
# `reduced_unigram`.
ra_red_unigram_results = crossValidate(
    classifiers=clf_array, metric=roc_auc, nlp=reduced_unigram, folds=5
)

In [None]:
# 5-fold 'roc_auc' scores for each classifier in clf_array on
# `reduced_bigram`.
ra_red_bigram_results = crossValidate(
    classifiers=clf_array, metric=roc_auc, nlp=reduced_bigram, folds=5
)

In [None]:
# 5-fold 'roc_auc' scores for each classifier in clf_array on
# `reduced_tfidf`.
ra_red_tfidf_results = crossValidate(
    classifiers=clf_array, metric=roc_auc, nlp=reduced_tfidf, folds=5
)