In [None]:
import util, scrape

import warnings
warnings.filterwarnings("ignore")

from tqdm.auto import tqdm
tqdm.pandas()

import collections
import itertools
import re
import pickle
import csv
import multiprocessing
import operator
import copy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
import seaborn as sns

import gensim

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support

from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.cluster import KMeans
from sklearn.manifold import TSNE


In [None]:
# Unpickle the doc2vec model stored at util.DOC2VEC_MODEL_DM_FILE; it is the
# model used for document embedding classification in the cells below.
# Caution: pickle.load can execute code embedded in the file, so only load
# files produced by this project.
with open(util.DOC2VEC_MODEL_DM_FILE, "rb") as model_file:
    model = pickle.load(model_file)
    

In [None]:
# run grid search on classifiers and their possible parameters and then return the best result
def grid_search_classifiers(X_train, X_test, y_train, y_test, clfs, clf_to_gs_params_dict, cv=5):
    reports = []
    f_score_avgs = []
    gs_list = []
    for clf in clfs:
        print(f"\tTraining classifier: {clf.__class__}...")
        gs = GridSearchCV(clf, clf_to_gs_params_dict[clf.__class__], cv=cv)
        gs.fit(X_train, y_train)
        gs_list.append(gs)
        labels = np.unique(y_train)
        report = util.test_classifier(gs, X_test, y_test, labels)
        reports.append(report)
        f_score_avg = np.mean(report["f_score"])
        f_score_avgs.append(f_score_avg)
        
    results_sorted = sorted(zip(gs_list, f_score_avgs, reports), key=lambda x: x[1], reverse=True)
    best_result = results_sorted[0]
            
    return best_result


In [None]:
# Candidate estimators; each one is tuned with its own grid search.
clfs = [
    GaussianNB(),
    LogisticRegression(),
    KNeighborsClassifier(),
    MLPClassifier()
]

# Hyperparameter grids keyed by estimator class, matching the
# `clf.__class__` lookup done in grid_search_classifiers.
clf_to_gs_params_dict = {
    # Empty grid: a single candidate using the estimator's defaults.
    GaussianNB: {
    },
    LogisticRegression: {
        # NOTE(review): which penalties can be fitted depends on the solver, and
        # LogisticRegression() above uses the library default. Warnings are
        # silenced at the top of this notebook, so confirm that the "l1"
        # candidates are actually being fitted with the installed scikit-learn.
        "penalty": ("l1", "l2"),
        "C": (1.0e0, 1.0e-1, 1.0e-2, 1.0e-3, 1.0e-4, 1.0e-5)
    },
    KNeighborsClassifier: {
        "n_neighbors": (4, 8, 12, 16)
    },
    MLPClassifier: {
        # "learning_rate" and "momentum" are intentionally not searched:
        # scikit-learn documents both as used only when solver="sgd", and
        # MLPClassifier() above keeps the default solver. Varying them would
        # refit identical models and multiply the grid size by four.
        "hidden_layer_sizes": ((100,), (100, 50), (100, 50, 20)),
        "activation": ("relu",),
        "alpha": (1.0e-1, 1.0e-2, 1.0e-3),
        "batch_size": (8, 16, 64),
        "learning_rate_init": (1.0e-2, 1.0e-3, 1.0e-4),
        "early_stopping": (True,)
    }
}

# Collect every entry of model.docvecs into one array, in index order.
n_docs = len(model.docvecs)
X = np.array([model.docvecs[doc_idx] for doc_idx in range(n_docs)])

# Split the embeddings with the previously established index sets.
# NOTE(review): split_X_into_train_test_sets, train_indices, test_indices,
# y_train and y_test are not defined in the cells visible here — confirm they
# are created earlier so the notebook survives Restart & Run All.
X_train, X_test = split_X_into_train_test_sets(X, train_indices, test_indices)

# Tune all candidates and keep the one with the highest average F-score.
best_gs, best_f_score_avg, best_report = grid_search_classifiers(
    X_train, X_test, y_train, y_test, clfs, clf_to_gs_params_dict
)

print(f"\tBest classifier: {best_gs.best_estimator_}")
print(f"\tAverage F-score: {best_f_score_avg}")
print(f"\tBest hyperparameters: {best_gs.best_params_}")
print("\tClassification report:\n")
print(best_report)
print("")


In [None]:
# Persist the winning fitted estimator and its hyperparameters, each to its
# own pickle file, in that order.
artifacts = (
    (util.CLASSIFIER_MODEL_FILE, best_gs.best_estimator_),
    (util.CLASSIFIER_HYPERPARAMETERS_FILE, best_gs.best_params_),
)
for out_path, payload in artifacts:
    with open(out_path, "wb") as out_file:
        pickle.dump(payload, out_file)
    

In [None]:
# Rebuild an unfitted copy of the winning classifier with its tuned
# hyperparameters and evaluate it with 10-fold cross-validation on all data.
classifier_name = best_gs.best_estimator_.__class__.__name__
hyperparams = best_gs.best_params_

# The hyperparameters must be expanded as keyword arguments: passing the dict
# positionally would hand the whole dict to the constructor as one argument.
# Using the estimator's own class avoids a name lookup through globals(), and
# an empty dict simply yields the default constructor call.
clf_untrained = type(best_gs.best_estimator_)(**hyperparams)

# NOTE(review): `y` is not defined in the cells visible here — confirm it is
# created earlier in the notebook.
results = util.train_and_test_classifier_k_fold(X, y, clf_untrained, k_fold=10)
for i, (report, f_score_avg) in enumerate(results):
    print(f"k: {i+1}/{len(results)}")
    print(f"F-Score Average: {f_score_avg}")
    print(report)
    print("")
    