In [None]:
import numpy as np
import pandas as pd
import time
import os

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def loadNLPVectors(folder, filename):
    """Load the NumPy array stored at ``<folder>/<filename>.npy``.

    Parameters
    ----------
    folder : str
        Directory that holds the ``.npy`` file.
    filename : str
        File name without the ``.npy`` extension.

    Returns
    -------
    Whatever ``np.load`` returns for that file.
    """
    npy_path = '/'.join((folder, filename)) + '.npy'
    return np.load(npy_path)

In [None]:
def loadData(name):
    """Load the train and test splits for one NLP representation.

    For each of the ``'train'`` and ``'test'`` folders, the feature array
    called ``name`` and the array called ``'labels'`` are read through
    ``loadNLPVectors``.

    Parameters
    ----------
    name : str
        Base name of the feature file (without extension) present in both
        the ``train`` and ``test`` folders.

    Returns
    -------
    tuple of (list, list)
        ``(train, test)`` where each element is ``[X, y]``.
    """
    # Unpacking the generator keeps the load order: train features, train
    # labels, test features, test labels.
    train, test = (
        [loadNLPVectors(split, name), loadNLPVectors(split, 'labels')]
        for split in ('train', 'test')
    )
    return train, test

# Load NLP Training/Test Data

In [None]:
train_uni, test_uni = loadData('unigram')

In [None]:
train_big, test_big = loadData('bigram')

In [None]:
train_tfidf, test_tfidf = loadData('tfidf')

In [None]:
train_vec, test_vec = loadData('word2vec')

In [None]:
train_runi, test_runi = loadData('runigram')

In [None]:
train_rbig, test_rbig = loadData('rbigram')

In [None]:
train_rtfidf, test_rtfidf = loadData('rtfidf')

# Import Models

In [None]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone package and keep the vendored copy as a fallback
# for old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
def loadModel(folder, clf):
    """Load a persisted model from ``models/<folder>/<clf>.pkl``.

    Parameters
    ----------
    folder : str
        Sub-directory of ``models/`` that holds the pickle.
    clf : str
        File name of the pickle without the ``.pkl`` extension.

    Returns
    -------
    The object returned by ``joblib.load`` for that path.
    """
    model_path = 'models/' + folder + '/' + clf + '.pkl'
    # NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
    # (The original called the misspelled name `joblob`, which raised NameError.)
    model = joblib.load(model_path)
    return model

# Hyperparameter Optimization

### Selecting Parameters to Tune for Classifiers

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [None]:
def RandomForestHyperparameters():
    """Build the random-search grid used for the random forest classifier.

    Returns
    -------
    dict
        Maps each hyperparameter name to the list of candidate values:
        ``n_estimators``, ``max_depth`` (including ``None``),
        ``max_features``, ``min_samples_split`` and ``min_samples_leaf``.
    """
    # Ten evenly spaced points, truncated to ints, for the two sample-count knobs.
    split_candidates = list(map(int, np.linspace(2, 100, 10)))
    leaf_candidates = list(map(int, np.linspace(1, 100, 10)))

    # NOTE(review): 'auto' appears to be an alias of 'sqrt' for classifiers in
    # older scikit-learn and may be rejected by newer releases -- confirm
    # against the installed version before re-running the search.
    return {
        'n_estimators': list(range(68, 78, 2)),
        'max_depth': list(range(30, 90, 10)) + [None],
        'max_features': ['auto', 'sqrt', 'log2'],
        'min_samples_split': split_candidates,
        'min_samples_leaf': leaf_candidates,
    }

In [None]:
def LogisticRegressionHyperparameters():
    """Build the random-search grid used for logistic regression.

    Returns
    -------
    dict
        Maps each hyperparameter name to its candidate values: ``penalty``,
        ``solver``, ``tol``, ``C`` (10 log-spaced values from 1 to 1e4) and
        ``max_iter`` (15 evenly spaced ints from 50 to 200).
    """
    penalty = ['l1', 'l2']
    # Pin a solver that accepts both penalties above. Without this the search
    # relies on the estimator's default solver, and scikit-learn >= 0.22
    # defaults to 'lbfgs', which rejects 'l1' and makes those candidates fail.
    # 'liblinear' was the default before that change.
    solver = ['liblinear']
    tol = [0.01, 0.001, 0.0001, .00001]
    C = np.logspace(0, 4, 10)
    max_iter = [int(x) for x in np.linspace(start = 50, stop = 200, num = 15)]

    params = {'penalty': penalty,
              'solver': solver,
              'tol': tol,
              'C': C,
              'max_iter': max_iter}

    return params

In [None]:
def SupportVectorHyperparameters():
    """Build the random-search grid for a support vector classifier.

    Returns
    -------
    dict
        ``C`` (10 log-spaced values from 1 to 1e4), ``kernel`` and ``tol``
        candidate values.
    """
    return {
        'C': np.logspace(start=0, stop=4, num=10),
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
        'tol': [1e-2, 1e-3, 1e-4, 1e-5],
    }

In [None]:
def NaiveBayesHyperparameters():
    """Build the random-search grid for the naive Bayes classifier.

    Returns
    -------
    dict
        ``alpha`` candidates (the integers 0 through 10) and ``class_prior``
        candidates (four two-element prior lists).
    """
    smoothing_values = list(range(11))
    prior_pairs = [
        [.1, .9],
        [.2, .8],
        [.3, .7],
        [.4, .6],
    ]
    return dict(alpha=smoothing_values, class_prior=prior_pairs)

### Importing Untuned Models

In [None]:
rf_tfidf = loadModel('tfidf', 'rand_forest')
rf_tfidf

In [None]:
rf_rtfidf = loadModel('rtfidf', 'rand_forest')
rf_rtfidf

In [None]:
lr_uni = loadModel('uni', 'log_reg')
lr_uni

In [None]:
lr_big = loadModel('big', 'log_reg')
lr_big

In [None]:
lr_rbig = loadModel('rbig', 'log_reg')
lr_rbig

In [None]:
nb_big = loadModel('big', 'naive_bayes')
nb_big

### Create Model and Parameter Instances for Tuning

In [None]:
rf_clf = RandomForestClassifier()

In [None]:
lr_clf = LogisticRegression()

In [None]:
nb_clf = MultinomialNB()

In [None]:
rf_params = RandomForestHyperparameters()
lr_params = LogisticRegressionHyperparameters()
nb_params = NaiveBayesHyperparameters()

### Tuning Function

In [None]:
def randGridOptimizer(clf, params, iterations, folds, train,
                      scoring='accuracy', random_state=42):
    """Run a randomized hyperparameter search and return the fitted search.

    Parameters
    ----------
    clf : estimator
        Estimator instance handed to ``RandomizedSearchCV``.
    params : dict
        Hyperparameter name -> candidate values, passed as
        ``param_distributions``.
    iterations : int
        Number of parameter settings to sample (``n_iter``).
    folds : int
        Value passed as ``cv``.
    train : sequence
        ``[X_train, y_train]``; element 0 is the features, element 1 the labels.
    scoring : str, optional
        Scoring metric for the search. Defaults to ``'accuracy'``, the value
        that was previously hard-coded.
    random_state : int, optional
        Seed for the search. Defaults to ``42``, the value that was previously
        hard-coded.

    Returns
    -------
    The ``RandomizedSearchCV`` object after ``fit`` has been called on it;
    callers read ``best_estimator_`` from it.
    """
    tuningGrid = RandomizedSearchCV(estimator = clf,
                                    param_distributions = params,
                                    n_iter = iterations,
                                    scoring = scoring,
                                    n_jobs = -1,
                                    cv = folds,
                                    verbose = 2,
                                    random_state = random_state)

    # The search object is what gets returned, so the result of fit() is not
    # bound to a separate (previously unused and misleadingly named) variable.
    tuningGrid.fit(train[0], train[1])

    return tuningGrid

# Create Tuned Models

#### Random Forest - TFIDF

In [None]:
rf_tfidf_grid = randGridOptimizer(rf_clf, rf_params, 30, 10, train_tfidf)

In [None]:
rf_tfidf_tuned = rf_tfidf_grid.best_estimator_
rf_tfidf_tuned

#### Random Forest - Reduced TFIDF

In [None]:
rf_rtfidf_grid = randGridOptimizer(rf_clf, rf_params, 30, 10, train_rtfidf)

In [None]:
rf_rtfidf_tuned = rf_rtfidf_grid.best_estimator_
# Display the reduced-TFIDF estimator that was just selected (the cell
# previously echoed rf_tfidf_tuned, the model from the section above).
rf_rtfidf_tuned

#### Logistic Regression - Unigram

In [None]:
lr_uni_grid = randGridOptimizer(lr_clf, lr_params, 30, 10, train_uni)

In [None]:
lr_uni_tuned = lr_uni_grid.best_estimator_
lr_uni_tuned

#### Logistic Regression - Bigram

In [None]:
lr_big_grid = randGridOptimizer(lr_clf, lr_params, 30, 10, train_big)

In [None]:
lr_big_tuned = lr_big_grid.best_estimator_
lr_big_tuned

#### Logistic Regression - Reduced Bigram

In [None]:
lr_rbig_grid = randGridOptimizer(lr_clf, lr_params, 30, 10, train_rbig)

In [None]:
lr_rbig_tuned = lr_rbig_grid.best_estimator_
lr_rbig_tuned

#### Naive Bayes - Bigram

In [None]:
nb_big_grid = randGridOptimizer(nb_clf, nb_params, 30, 10, train_big)

In [None]:
nb_big_tuned = nb_big_grid.best_estimator_
nb_big_tuned

# Tuned vs Untuned Models

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
def _positive_class_scores(model, X, y_pred):
    """Return continuous scores for ROC AUC, falling back to the hard labels.

    Uses ``predict_proba`` (second column) when the model exposes it, then
    ``decision_function``; if neither exists the predicted labels are returned
    unchanged.
    """
    if hasattr(model, 'predict_proba'):
        # Assumes a binary problem where column 1 is the positive class.
        return model.predict_proba(X)[:, 1]
    if hasattr(model, 'decision_function'):
        return model.decision_function(X)
    return y_pred


def _print_metrics(header, clf_name, y_test, y_pred, y_score):
    """Print the header line followed by the five-metric report for one model."""
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_score)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(header)
    print(clf_name + ' Accuracy: ' + str(acc))
    print(clf_name + ' ROC AUC Score: '+ str(roc_auc))
    print(clf_name + ' F Score: ' + str(f1))
    print(clf_name + ' Precision Score: ' + str(precision))
    print(clf_name + ' Recall Score ' + str(recall))
    print(" ")


def compare(original, tuned, test, clf_name, nlp_name):
    """Print test-set metrics for an untuned model and its tuned counterpart.

    Parameters
    ----------
    original : fitted estimator
        The untuned model; must provide ``predict``.
    tuned : fitted estimator
        The tuned model; must provide ``predict``.
    test : sequence
        ``[X_test, y_test]``.
    clf_name : str
        Classifier name used in the printed labels.
    nlp_name : str
        Feature-representation name used in the printed headers.

    Returns
    -------
    None. Accuracy, ROC AUC, F1, precision and recall are printed for each
    model.

    Notes
    -----
    ROC AUC is computed from continuous scores (probabilities or decision
    values) when the model provides them, because feeding thresholded 0/1
    predictions to ``roc_auc_score`` evaluates only a single operating point.
    """
    X_test = test[0]
    y_test = test[1]

    for prefix, model in (('Untuned ', original), ('Tuned ', tuned)):
        y_pred = model.predict(X_test)
        y_score = _positive_class_scores(model, X_test, y_pred)
        header = prefix + clf_name + ' - ' + nlp_name + ' Metrics: '
        _print_metrics(header, clf_name, y_test, y_pred, y_score)

### Random Forest TFIDF Comparison

In [None]:
compare(rf_tfidf, rf_tfidf_tuned, test_tfidf, 'Random Forest', 'TFIDF')

### Random Forest Reduced TFIDF Comparison

In [None]:
compare(rf_rtfidf, rf_rtfidf_tuned, test_rtfidf, 'Random Forest', 'Reduced TFIDF')

### Logistic Regression Unigram Comparison

In [None]:
compare(lr_uni, lr_uni_tuned, test_uni, 'Logistic Regression', 'Unigram')

### Logistic Regression Bigram Comparison

In [None]:
compare(lr_big, lr_big_tuned, test_big, 'Logistic Regression', 'Bigram')

### Logistic Regression Reduced Bigram Comparison

In [None]:
compare(lr_rbig, lr_rbig_tuned, test_rbig, 'Logistic Regression', 'Reduced Bigram')

### Naive Bayes Bigram Comparison

In [None]:
compare(nb_big, nb_big_tuned, test_big, 'Naive Bayes', 'Bigram')

# Save Tuned Models

In [None]:
# Output directory for the tuned models, located under the current working
# directory.
path = os.getcwd()
folderpath = path + '/models' + '/tuned'
# makedirs(exist_ok=True) keeps this cell re-runnable (os.mkdir raised when the
# directory already existed) and also creates 'models' if it is missing.
os.makedirs(folderpath, exist_ok=True)

In [None]:
def saveModel(model, name, test):
    """Persist ``model`` under ``folderpath`` and verify the saved copy.

    The model is dumped to ``<folderpath>/<name>.pkl`` with joblib, loaded
    back, and both copies are scored on the supplied test data.

    Parameters
    ----------
    model : fitted estimator
        Object to persist; must provide ``score(X, y)``.
    name : str
        File name of the pickle without the ``.pkl`` extension.
    test : sequence
        ``[X, y]`` used for the round-trip check.

    Raises
    ------
    AssertionError
        If the reloaded model's score differs from the in-memory model's score.
    """
    target = '/'.join((folderpath, name)) + '.pkl'
    joblib.dump(model, target)

    # Round-trip check: the reloaded model must score exactly like the original.
    restored = joblib.load(target)
    features, labels = test[0], test[1]

    assert model.score(features, labels) == restored.score(features, labels)

# Random Forest - TFIDF

In [None]:
saveModel(rf_tfidf_tuned, 'rf_tfidf', test_tfidf)

# Random Forest - Reduced TFIDF

In [None]:
saveModel(rf_rtfidf_tuned, 'rf_rtfidf', test_rtfidf)

# Logistic Regression - Unigram

In [None]:
saveModel(lr_uni_tuned, 'lr_uni', test_uni)

# Logistic Regression - Bigram

In [None]:
saveModel(lr_big_tuned, 'lr_big', test_big)

# Logistic Regression - Reduced Bigram

In [None]:
saveModel(lr_rbig_tuned, 'lr_rbig', test_rbig)

# Naive Bayes - Bigram

In [None]:
saveModel(nb_big_tuned, 'nb_big', test_big)