In [None]:
import numpy as np
import pandas as pd
import time
import os

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def loadNLPVectors(filename):
    """Load a saved array from the ``nlp_data`` directory.

    Parameters
    ----------
    filename : str
        Base name of the file, without the ``.npy`` extension.

    Returns
    -------
    Whatever ``np.load`` returns for ``nlp_data/<filename>.npy``.
    """
    npy_path = ''.join(['nlp_data/', filename, '.npy'])
    vectors = np.load(npy_path)
    return vectors

In [None]:
def loadLabels():
    """Return the array saved under the base name ``labels`` (via ``loadNLPVectors``)."""
    label_file = "labels"
    return loadNLPVectors(label_file)

In [None]:
# Selector values for the ``tag`` argument of getData.
BOTH = 0
TRAIN = 1
TEST = 2
def getData(nlp, tag = 0):
    """Split a feature matrix and the global ``labels`` into train/test parts.

    The split is fixed: 20% of the rows go to the test side, rows are
    shuffled with ``random_state = 42`` and the split is stratified on
    ``labels``.

    Parameters
    ----------
    nlp : array-like
        Feature matrix to split. ``labels`` is read from the notebook's
        global namespace, so it must be defined before this is called.
    tag : int
        ``BOTH`` (0) returns ``(train, test)``; ``TRAIN`` (1) returns only
        ``train``; any other value returns only ``test``.

    Returns
    -------
    ``train`` and/or ``test``, each a two-element list ``[X, y]``.
    """
    parts = train_test_split(nlp,
                             labels,
                             test_size = 0.2,
                             random_state = 42,
                             shuffle = True,
                             stratify = labels)
    feats_train, feats_test, labels_train, labels_test = parts
    train_pair = [feats_train, labels_train]
    test_pair = [feats_test, labels_test]

    if tag == BOTH:
        return train_pair, test_pair
    if tag == TRAIN:
        return train_pair
    # Every remaining tag value (including TEST) yields the test pair.
    return test_pair

# Load NLP Data

In [None]:
# Base names (without the .npy extension) of the saved feature arrays;
# each one is passed to loadNLPVectors in the next cell.
unigram_array = "feature_array_unigram"
bigram_array = "feature_array_bigram"
tfidf_array = "feature_array_tfidf"
wordvec_array = "feature_array_word2vec"
unigram_reduced = "reduced_unigram"
bigram_reduced = "reduced_bigram"
tfidf_reduced = "reduced_tfidf"

In [None]:
# Load every feature representation from disk.
unigram = loadNLPVectors(unigram_array)
bigram = loadNLPVectors(bigram_array)
tfidf = loadNLPVectors(tfidf_array)
word2vec = loadNLPVectors(wordvec_array)
reduced_unigram = loadNLPVectors(unigram_reduced)
reduced_bigram = loadNLPVectors(bigram_reduced)
reduced_tfidf = loadNLPVectors(tfidf_reduced)
# getData reads `labels` as a global, so it must be bound before any split is made.
labels = loadLabels()

# Create Train and Test Datasets

In [None]:
# Unigram features: each side of the split is an [X, y] pair.
train_uni, test_uni = getData(unigram)

In [None]:
# Bigram features: each side of the split is an [X, y] pair.
train_big, test_big = getData(bigram)

In [None]:
# TFIDF features: each side of the split is an [X, y] pair.
train_tfidf, test_tfidf = getData(tfidf)

In [None]:
# word2vec features: each side of the split is an [X, y] pair.
train_vec, test_vec = getData(word2vec)

In [None]:
# Reduced unigram features: each side of the split is an [X, y] pair.
train_runi, test_runi = getData(reduced_unigram)

In [None]:
# Reduced bigram features: each side of the split is an [X, y] pair.
train_rbig, test_rbig = getData(reduced_bigram)

In [None]:
# Reduced TFIDF features: each side of the split is an [X, y] pair.
train_rtfidf, test_rtfidf = getData(reduced_tfidf)

# Import Models

In [None]:
try:
    # Standalone joblib; the copy vendored under sklearn.externals was removed in scikit-learn 0.23.
    import joblib
except ImportError:
    # Fallback for old environments that only ship the vendored copy.
    from sklearn.externals import joblib

In [None]:
# Sub-folder names and file stems under models/; loadModel selects from
# these lists by position, so the order here is significant.
feature_folders = ['unigram', 'bigram', 'tfidf', 'word2vec', 'reduced_unigram', 'reduced_bigram', 'reduced_tfidf']
classifiers = ['rand_forest', 'log_reg', 'lin_reg', 'naive_bayes', 'svm']

In [None]:
def loadModel(nlp_index, clf_index):
    """Load a saved model from ``models/<feature folder>/<classifier>.pkl``.

    Parameters
    ----------
    nlp_index : int
        Position in ``feature_folders`` of the feature representation.
    clf_index : int
        Position in ``classifiers`` of the classifier name.

    Returns
    -------
    The object returned by ``joblib.load`` for that path.
    """
    feature_dir = feature_folders[nlp_index]
    clf_stem = classifiers[clf_index]
    # NOTE(review): joblib.load deserializes the file; only point this at trusted model files.
    return joblib.load('models/' + feature_dir + '/' + clf_stem + '.pkl')

# Hyperparameter Optimization

### Selecting Parameters to Tune for Classifiers

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [None]:
def RandomForestHyperparameters():
    """Build the randomized-search space for ``RandomForestClassifier``.

    Returns
    -------
    dict
        Maps each parameter name (``n_estimators``, ``max_depth``,
        ``max_features``, ``min_samples_split``, ``min_samples_leaf``) to
        the list of candidate values to sample from.
    """
    def _int_steps(start, stop, num):
        # Evenly spaced values between start and stop, truncated to ints.
        return [int(x) for x in np.linspace(start = start, stop = stop, num = num)]

    n_estimators = _int_steps(2, 200, 20)
    # None is appended as an extra candidate alongside the numeric depths.
    max_depth = _int_steps(10, 100, 10) + [None]
    # 'auto' is deliberately not offered: for classifiers it was an alias of
    # 'sqrt', and scikit-learn 1.3+ rejects it outright.
    max_features = [None, 'sqrt', 'log2']
    min_samples_split = _int_steps(2, 100, 10)
    min_samples_leaf = _int_steps(1, 100, 10)

    params = {'n_estimators': n_estimators,
              'max_depth': max_depth,
              'max_features': max_features,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf}

    return params

In [None]:
def LogisticRegressionHyperparameters():
    """Build the randomized-search space for ``LogisticRegression``.

    Returns
    -------
    dict
        Maps each parameter name (``penalty``, ``solver``, ``tol``, ``C``,
        ``max_iter``) to the candidate values to sample from.
    """
    penalty = ['l1', 'l2']
    # The space samples both 'l1' and 'l2'. The default solver in recent
    # scikit-learn ('lbfgs') rejects 'l1', so every such candidate would fail
    # to fit. 'liblinear' accepts both penalties and was the default solver
    # in older releases.
    solver = ['liblinear']
    tol = [0.01, 0.001, 0.0001, .00001]
    C = np.logspace(0, 4, 10)
    max_iter = [int(x) for x in np.linspace(start = 50, stop = 200, num = 15)]

    params = {'penalty': penalty,
              'solver': solver,
              'tol': tol,
              'C': C,
              'max_iter': max_iter}

    return params

In [None]:
def SupportVectorHyperparameters():
    """Build the randomized-search space for ``SVC``.

    Returns
    -------
    dict
        Candidate values for ``C`` (10 log-spaced values from 1 to 10**4),
        ``kernel`` and ``tol``.
    """
    return {
        'C': np.logspace(0, 4, 10),
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
        'tol': [0.01, 0.001, 0.0001, 0.00001],
    }

In [None]:
def NaiveBayesHyperparameters():
    """Build the randomized-search space for ``MultinomialNB``.

    Returns
    -------
    dict
        ``alpha`` candidates are the integers 0 through 10 (0 included);
        ``class_prior`` candidates are four two-element prior lists.
    """
    return {
        'alpha': list(range(11)),
        'class_prior': [[.1, .9], [.2, .8], [.3, .7], [.4, .6]],
    }

### Importing Untuned Models

In [None]:
# feature_folders[2] = 'tfidf', classifiers[0] = 'rand_forest'
rf_tfidf = loadModel(2, 0)
rf_tfidf

In [None]:
# feature_folders[6] = 'reduced_tfidf', classifiers[0] = 'rand_forest'
rf_rtfidf = loadModel(6, 0)
rf_rtfidf

In [None]:
# feature_folders[0] = 'unigram', classifiers[1] = 'log_reg'
lr_uni = loadModel(0, 1)
lr_uni

In [None]:
# feature_folders[1] = 'bigram', classifiers[1] = 'log_reg'
lr_big = loadModel(1, 1)
lr_big

In [None]:
# feature_folders[5] = 'reduced_bigram', classifiers[1] = 'log_reg'
lr_rbig = loadModel(5, 1)
lr_rbig

In [None]:
# feature_folders[1] = 'bigram', classifiers[3] = 'naive_bayes'
nb_big = loadModel(1, 3)
nb_big

In [None]:
# feature_folders[4] = 'reduced_unigram', classifiers[4] = 'svm'
svm_runi = loadModel(4, 4)
svm_runi

### Create Model and Parameter Instances for Tuning

In [None]:
# Seeded so that repeated searches build the same forest for the same
# candidate, matching the fixed random_state used for the split and the search.
rf_clf = RandomForestClassifier(random_state = 42)

In [None]:
# Default-configured base estimator handed to the randomized search.
lr_clf = LogisticRegression()

In [None]:
# Default-configured base estimator handed to the randomized search.
nb_clf = MultinomialNB()

In [None]:
# Default-configured base estimator handed to the randomized search.
svm_clf = SVC()

In [None]:
# One search space per classifier family; each is reused for every
# feature representation that family is tuned on.
rf_params = RandomForestHyperparameters()
lr_params = LogisticRegressionHyperparameters()
nb_params = NaiveBayesHyperparameters()
svm_params = SupportVectorHyperparameters()

### Tuning Function

In [None]:
def randGridOptimizer(clf, params, iterations, folds, train):
    """Run a randomized hyperparameter search and return the fitted search.

    Parameters
    ----------
    clf : estimator
        Base estimator passed to ``RandomizedSearchCV``.
    params : dict
        Parameter name -> candidate values, used as ``param_distributions``.
    iterations : int
        Number of parameter settings to sample (``n_iter``).
    folds : int
        Passed as ``cv``.
    train : sequence
        ``[X, y]`` pair; ``train[0]`` and ``train[1]`` are passed to ``fit``.

    Returns
    -------
    The fitted ``RandomizedSearchCV`` object itself, not just the winning
    model; callers read ``best_estimator_`` from it.
    """
    tuningGrid = RandomizedSearchCV(estimator = clf,
                                    param_distributions = params,
                                    n_iter = iterations,
                                    scoring = 'accuracy',
                                    n_jobs = -1,
                                    cv = folds,
                                    verbose = 1,
                                    random_state = 42)

    # The search object is what gets returned, so the result of fit() is not kept.
    tuningGrid.fit(train[0], train[1])

    return tuningGrid

# Create Tuned Models

#### Random Forest - TFIDF

In [None]:
# 30 sampled parameter settings, cv = 10, fitted on the TFIDF training split.
rf_tfidf_grid = randGridOptimizer(rf_clf, rf_params, 30, 10, train_tfidf)

In [None]:
# Keep the search's best_estimator_ and display it for inspection.
rf_tfidf_tuned = rf_tfidf_grid.best_estimator_
rf_tfidf_tuned

#### Random Forest - Reduced TFIDF

In [None]:
# 30 sampled parameter settings, cv = 10, fitted on the reduced-TFIDF training split.
rf_rtfidf_grid = randGridOptimizer(rf_clf, rf_params, 30, 10, train_rtfidf)

In [None]:
# Keep the search's best_estimator_ and display it for inspection
# (the reduced-TFIDF model assigned here, not the plain TFIDF one).
rf_rtfidf_tuned = rf_rtfidf_grid.best_estimator_
rf_rtfidf_tuned

#### Logistic Regression - Unigram

In [None]:
# 30 sampled parameter settings, cv = 10, fitted on the unigram training split.
lr_uni_grid = randGridOptimizer(lr_clf, lr_params, 30, 10, train_uni)

In [None]:
# Keep the search's best_estimator_ and display it for inspection.
lr_uni_tuned = lr_uni_grid.best_estimator_
lr_uni_tuned

#### Logistic Regression - Bigram

In [None]:
# 30 sampled parameter settings, cv = 10, fitted on the bigram training split.
lr_big_grid = randGridOptimizer(lr_clf, lr_params, 30, 10, train_big)

In [None]:
# Keep the search's best_estimator_ and display it for inspection.
lr_big_tuned = lr_big_grid.best_estimator_
lr_big_tuned

#### Logistic Regression - Reduced Bigram

In [None]:
# 30 sampled parameter settings, cv = 10, fitted on the reduced-bigram training split.
lr_rbig_grid = randGridOptimizer(lr_clf, lr_params, 30, 10, train_rbig)

In [None]:
# Keep the search's best_estimator_ and display it for inspection.
lr_rbig_tuned = lr_rbig_grid.best_estimator_
lr_rbig_tuned

#### Naive Bayes - Bigram

In [None]:
# 30 sampled parameter settings, cv = 10, fitted on the bigram training split.
nb_big_grid = randGridOptimizer(nb_clf, nb_params, 30, 10, train_big)

In [None]:
# Keep the search's best_estimator_ and display it for inspection.
nb_big_tuned = nb_big_grid.best_estimator_
nb_big_tuned

#### SVM - Reduced Unigram

In [None]:
# 10 sampled parameter settings, cv = 5. The search must be fitted on the
# reduced-unigram training split, because the tuned model is evaluated
# against test_runi.
svm_runi_grid = randGridOptimizer(svm_clf, svm_params, 10, 5, train_runi)

In [None]:
# Keep the search's best_estimator_ and display it for inspection.
svm_runi_tuned = svm_runi_grid.best_estimator_
svm_runi_tuned

# Tuned vs Untuned Models

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
def _scoreModel(model, X_test, y_test):
    """Predict on ``X_test`` and return (accuracy, ROC AUC, F1, precision, recall)."""
    y_pred = model.predict(X_test)
    # Every metric, ROC AUC included, is computed from the predict() output.
    return (accuracy_score(y_test, y_pred),
            roc_auc_score(y_test, y_pred),
            f1_score(y_test, y_pred),
            precision_score(y_test, y_pred),
            recall_score(y_test, y_pred))


def _printMetrics(heading, clf_name, nlp_name, scores):
    """Print one metrics section (heading line, five scores, spacer line)."""
    acc, roc_auc, f1, precision, recall = scores
    print(heading + ' ' + clf_name + ' - ' + nlp_name + ' Metrics: ')
    print(clf_name + ' Accuracy: ' + str(acc))
    print(clf_name + ' ROC AUC Score: '+ str(roc_auc))
    print(clf_name + ' F Score: ' + str(f1))
    print(clf_name + ' Precision Score: ' + str(precision))
    print(clf_name + ' Recall Score ' + str(recall))
    print(" ")


def compare(original, tuned, test, clf_name, nlp_name):
    """Print test-set metrics for an untuned model and its tuned counterpart.

    Parameters
    ----------
    original : fitted estimator
        The untuned model; only its ``predict`` method is used.
    tuned : fitted estimator
        The tuned model; only its ``predict`` method is used.
    test : sequence
        ``[X_test, y_test]`` pair.
    clf_name : str
        Classifier name used in the printed labels.
    nlp_name : str
        Feature-representation name used in the section headings.

    Returns
    -------
    None. The "Untuned" section is printed first, then the "Tuned" section.
    """
    X_test = test[0]
    y_test = test[1]

    untuned_scores = _scoreModel(original, X_test, y_test)
    tuned_scores = _scoreModel(tuned, X_test, y_test)

    _printMetrics('Untuned', clf_name, nlp_name, untuned_scores)
    _printMetrics('Tuned', clf_name, nlp_name, tuned_scores)

### Random Forest TFIDF Comparison

In [None]:
# Untuned vs tuned random forest on the held-out TFIDF test split.
compare(rf_tfidf, rf_tfidf_tuned, test_tfidf, 'Random Forest', 'TFIDF')

### Random Forest Reduced TFIDF Comparison

In [None]:
# Untuned vs tuned random forest on the held-out reduced-TFIDF test split.
compare(rf_rtfidf, rf_rtfidf_tuned, test_rtfidf, 'Random Forest', 'Reduced TFIDF')

### Logistic Regression Unigram Comparison

In [None]:
# Untuned vs tuned logistic regression on the held-out unigram test split.
compare(lr_uni, lr_uni_tuned, test_uni, 'Logistic Regression', 'Unigram')

### Logistic Regression Bigram Comparison

In [None]:
# Untuned vs tuned logistic regression on the held-out bigram test split.
compare(lr_big, lr_big_tuned, test_big, 'Logistic Regression', 'Bigram')

### Logistic Regression Reduced Bigram Comparison

In [None]:
# Untuned vs tuned logistic regression on the held-out reduced-bigram test split.
compare(lr_rbig, lr_rbig_tuned, test_rbig, 'Logistic Regression', 'Reduced Bigram')

### Naive Bayes Bigram Comparison

In [None]:
# Untuned vs tuned naive Bayes on the held-out bigram test split.
compare(nb_big, nb_big_tuned, test_big, 'Naive Bayes', 'Bigram')

### SVM Reduced Unigram Comparison

In [None]:
# Untuned vs tuned SVM on the held-out reduced-unigram test split.
compare(svm_runi, svm_runi_tuned, test_runi, 'SVM', 'Reduced Unigram')