In [None]:
import util, scrape

import warnings
warnings.filterwarnings("ignore")

from tqdm.auto import tqdm
tqdm.pandas()

import collections
import itertools
import re
import pickle
import csv
import multiprocessing
import operator
import copy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
import seaborn as sns

import gensim

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support

from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.cluster import KMeans
from sklearn.manifold import TSNE


In [None]:
# Load the trip reports DataFrame that was pickled at util.TRIP_REPORTS_DATAFRAME_FILE.
# NOTE(review): pickle.load can execute arbitrary code stored in the file; only
# load pickles that were produced by this project's own pipeline.
with open(util.TRIP_REPORTS_DATAFRAME_FILE, "rb") as f:
    df = pickle.load(f)

# Preview the first rows instead of rendering the whole frame as cell output.
df.head()

In [None]:
# tag every document with a label derived from its position
def label_docs(X):
    """Wrap each document of X in a gensim TaggedDocument.

    The document at position i receives the single tag "DOC_{i}".

    Args:
        X: iterable of documents; each one is passed unchanged as `words`.

    Returns:
        list of TaggedDocument objects, in the same order as X.
    """
    return [
        gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[f"DOC_{position}"])
        for position, tokens in enumerate(X)
    ]


In [None]:
# Build a boolean row mask over df that keeps only the trip reports which
# (1) contain more than MIN_TOKENS tokens, to ensure informative content, and
# (2) are about one of the N drugs with the most such reports.

N = 10
MIN_TOKENS = 30

# (1) reports that are long enough
mask_1 = [len(tokens) > MIN_TOKENS for tokens in df["trip_report_tokenized"]]

# rank the drugs by their number of long-enough reports and keep the N most frequent
counter = collections.Counter(df["drug"][mask_1])
top_drugs = [drug for drug, count in counter.most_common(N)]

# (2) reports about a top-N drug. isin() tests each row's own value, so the result
# does not depend on df's index labels coinciding with the positions 0..len(df)-1
# (the previous df["drug"][i] lookup treated a position as an index label).
mask_2 = df["drug"].isin(top_drugs).tolist()

mask = np.logical_and(mask_1, mask_2)


In [None]:
def get_train_test_split_indices(y, train_size=0.8, random_state=None):
    """Compute a stratified train/test split of the positions 0..len(y)-1.

    Returning positions instead of data lets the same split be applied to any
    sequence or matrix whose rows line up with y.

    Args:
        y: sequence of class labels; the split is stratified on these labels.
        train_size: passed through to sklearn's train_test_split.
        random_state: passed through to train_test_split. The default (None)
            keeps the previous unseeded behaviour; pass an int to make the
            split reproducible across kernel restarts.

    Returns:
        (train_indices, test_indices): two lists of integer positions into y.
    """
    indices = list(range(len(y)))
    train_indices, test_indices = train_test_split(
        indices, train_size=train_size, stratify=y, random_state=random_state
    )

    return train_indices, test_indices
    
def split_X_into_train_test_sets(X, train_indices, test_indices):
    """Pick the training rows and the test rows out of a 2-D feature matrix.

    Args:
        X: matrix indexable as X[rows, :].
        train_indices: row positions that form the training set.
        test_indices: row positions that form the test set.

    Returns:
        (X_train, X_test): the selected rows, in the order given by the index lists.
    """
    return tuple(X[row_positions, :] for row_positions in (train_indices, test_indices))

def split_y_into_train_test_sets(y, train_indices, test_indices):
    """Pick the training labels and the test labels out of y.

    Args:
        y: indexable sequence of labels.
        train_indices: positions that form the training set.
        test_indices: positions that form the test set.

    Returns:
        (y_train, y_test): two lists, ordered like the given index lists.
    """
    def pick(positions):
        # gather the labels found at the requested positions
        return [y[position] for position in positions]

    return pick(train_indices), pick(test_indices)
    

In [None]:
# Target labels: the "drug" value of every report selected by the mask.
y = np.array(df["drug"][mask])
# Stratified 80/20 train/test split, expressed as positions into y so that the
# same split can be reused for any feature matrix whose rows line up with y.
train_indices, test_indices = get_train_test_split_indices(y, train_size=0.8)
y_train, y_test = split_y_into_train_test_sets(y, train_indices, test_indices)


In [None]:
# train doc2vec model one epoch at a time, stepping the learning rate down manually
def train_doc2vec_model(model, docs, num_epochs, lr_step, lr_min=0.0, verbose=True):
    """Run `num_epochs` single-epoch training passes over `docs`.

    After every pass model.alpha is reduced by `lr_step`, but never below `lr_min`.
    The model is modified in place and nothing is returned, so calling this
    function again continues from the learning rate the previous call ended on.

    Args:
        model: object exposing train(), corpus_count, alpha and min_alpha
            (a gensim Doc2Vec model in this notebook).
        docs: training corpus handed to model.train().
        num_epochs: number of single-epoch passes to run.
        lr_step: amount subtracted from model.alpha after each pass.
        lr_min: value assigned to model.min_alpha and lower bound for model.alpha.
        verbose: when true, print the epoch number before each pass.

    NOTE(review): this relies on model.train() reading model.alpha and
    model.min_alpha when no start_alpha/end_alpha is passed; confirm for the
    installed gensim version.
    """
    # Set the floor before the first pass so that every epoch, including the
    # first one, trains with min_alpha == lr_min.
    model.min_alpha = lr_min

    for epoch in range(1, num_epochs + 1):
        if verbose:
            print(f"Epoch: {epoch}")
        model.train(docs, total_examples=model.corpus_count, epochs=1)
        # decrease the learning rate for the next pass without undershooting lr_min
        # (an unclamped step could even drive the rate negative)
        model.alpha = max(model.alpha - lr_step, lr_min)
        

In [None]:
def grid_search_doc2vec_models(vector_sizes, windows, lrs_and_lr_steps, training_session_epochs, num_training_sessions, dm=0):
    """Grid-search Doc2Vec hyperparameters, scoring candidates with a GaussianNB classifier.

    For every (vector_size, window, (lr, lr_step)) combination a fresh Doc2Vec
    model is trained in `num_training_sessions` sessions of
    `training_session_epochs` epochs each. After each session the document
    vectors are split with the notebook's train/test indices, a Gaussian naive
    Bayes classifier is fitted on the training vectors, and the mean of the
    test report's "f_score" entry is used as the quality of the doc2vec model
    at that number of epochs.

    Reads the notebook globals df, mask, train_indices, test_indices, y_train
    and y_test, and uses label_docs, train_doc2vec_model,
    split_X_into_train_test_sets and util.test_classifier.

    NOTE(review): the winner is selected on the same test split that its score
    is reported on, so the reported F-score is an optimistic estimate.

    Args:
        vector_sizes: candidate values for Doc2Vec's vector_size.
        windows: candidate values for Doc2Vec's window.
        lrs_and_lr_steps: candidate (initial learning rate, per-epoch decrease) pairs.
        training_session_epochs: epochs trained between two evaluations.
        num_training_sessions: number of train-then-evaluate sessions per model.
        dm: 0 selects distributed bag of words (PV-DBOW), 1 selects
            distributed memory (PV-DM).

    Returns:
        (model, f_score_avg, report, hyperparams): a deep copy of the best
        model, its average F-score, the report returned by
        util.test_classifier for it, and a dict with the keys
        "epochs_trained", "vector_size", "window", "lr" and "lr_step".

    Raises:
        ValueError: if any candidate list is empty or num_training_sessions < 1,
            i.e. when no configuration would be evaluated.
    """
    vector_sizes = list(vector_sizes)
    windows = list(windows)
    lrs_and_lr_steps = list(lrs_and_lr_steps)
    # Fail early: with nothing to evaluate there is no best model to report.
    if not (vector_sizes and windows and lrs_and_lr_steps) or num_training_sessions < 1:
        raise ValueError(
            "grid search needs at least one value for each hyperparameter "
            "and num_training_sessions >= 1"
        )

    num_hyperparam_configs = len(vector_sizes)*len(windows)*len(lrs_and_lr_steps)*num_training_sessions
    num_hyperparam_configs_tried = 0

    # loop-invariant inputs
    docs = label_docs(df["trip_report_tokenized"][mask])
    labels = np.unique(y_train)
    num_workers = multiprocessing.cpu_count()

    best_f_score_avg = 0.0
    best_model = None
    best_hyperparams = None
    best_report = None
    for vector_size, window, (lr, lr_step) in itertools.product(vector_sizes, windows, lrs_and_lr_steps):
        # Build the model without a corpus and only scan the vocabulary here.
        # Handing docs to the constructor would already train the model for
        # gensim's default number of epochs, and those passes would not be
        # counted in epochs_trained.
        model = gensim.models.doc2vec.Doc2Vec(workers=num_workers, dm=dm, vector_size=vector_size, window=window, alpha=lr, dm_concat=1)
        model.build_vocab(docs)

        epochs_trained = 0
        for _session in range(num_training_sessions):
            train_doc2vec_model(model, docs, training_session_epochs, lr_step, verbose=False)
            epochs_trained += training_session_epochs

            # split X into train and test sets using established indices
            X = np.array([model.docvecs[doc_id] for doc_id in range(len(model.docvecs))])
            X_train, X_test = split_X_into_train_test_sets(X, train_indices, test_indices)

            # train a simple Gaussian naive Bayes classifier whose test set predictions' average f_score
            # will be the measure for a given doc2vec model's quality
            clf = GaussianNB()
            clf.fit(X_train, y_train)

            # test classifier
            report = util.test_classifier(clf, X_test, y_test, labels)
            f_score_avg = np.mean(report["f_score"])

            # print progress
            num_hyperparam_configs_tried += 1
            print(f"{num_hyperparam_configs_tried} / {num_hyperparam_configs} hyperparameter configurations tested: ({epochs_trained}, {vector_size}, {window}, {lr}, {lr_step})")

            # save best model; the model keeps training in later sessions, so a
            # snapshot is stored rather than a reference
            if f_score_avg >= best_f_score_avg:
                best_f_score_avg = f_score_avg
                best_model = copy.deepcopy(model)
                best_hyperparams = (epochs_trained, vector_size, window, lr, lr_step)
                best_report = report
                print(f"New best f-score average: {f_score_avg}")

    print("\nDone")

    if best_hyperparams is None:
        # Only reachable when no score compared >= 0.0 (e.g. every score was NaN).
        raise ValueError("no hyperparameter configuration produced a comparable F-score")

    model = best_model
    f_score_avg = best_f_score_avg
    report = best_report
    epochs_trained, vector_size, window, lr, lr_step = best_hyperparams

    hyperparams = {
        "epochs_trained": epochs_trained,
        "vector_size": vector_size,
        "window": window,
        "lr": lr,
        "lr_step": lr_step
    }

    # print results
    print(f"\nBest model: ")
    print(f"\tF-score average: {f_score_avg}")
    if dm == 0:
        print("\tType of model: PV-DBOW")
    elif dm == 1:
        print("\tType of model: PV-DM")
    print(f"\tEpochs trained: {epochs_trained}")
    print(f"\tVector size: {vector_size}")
    print(f"\tWindow size: {window}")
    print(f"\tLearning rate: {lr}")
    print(f"\tLearning rate step decrease: {lr_step}")
    print(f"\tAverage F-score: {best_f_score_avg}")
    print("\tClassification report:\n")
    print(best_report)
    print("")

    return model, f_score_avg, report, hyperparams


In [None]:
# Hyperparameter grid explored by the doc2vec grid search.
vector_sizes = list(range(50, 151, 25))  # 50, 75, 100, 125, 150
windows = list(range(3, 10, 2))  # 3, 5, 7, 9

# (initial learning rate, per-epoch decrease) pairs: every initial rate is tried
# once with a constant rate (step 0.0) and once with a step of 1% of the rate
lrs_and_lr_steps = [
    (0.2, 0.0), (0.2, 0.002),
    (0.1, 0.0), (0.1, 0.001),
    (0.075, 0.0), (0.075, 0.00075),
    (0.05, 0.0), (0.05, 0.0005),
    (0.025, 0.0), (0.025, 0.00025),
]

# each model is trained in num_training_sessions sessions of training_session_epochs epochs
training_session_epochs, num_training_sessions = 5, 5


In [None]:
# Find best PV-DBOW model (dm=0) over the hyperparameter grid defined above
model_dbow, f_score_avg_dbow, report_dbow, hyperparams_dbow = grid_search_doc2vec_models(vector_sizes, windows, lrs_and_lr_steps, training_session_epochs, num_training_sessions, dm=0)

# save PV-DBOW model
with open(util.DOC2VEC_MODEL_DBOW_FILE, "wb") as f:
    pickle.dump(model_dbow, f)

# save PV-DBOW hyperparameters
with open(util.DOC2VEC_HYPERPARAMETERS_DBOW_FILE, "wb") as f:
    pickle.dump(hyperparams_dbow, f)


In [None]:
# Find best PV-DM model (dm=1) over the same hyperparameter grid
model_dm, f_score_avg_dm, report_dm, hyperparams_dm = grid_search_doc2vec_models(vector_sizes, windows, lrs_and_lr_steps, training_session_epochs, num_training_sessions, dm=1)

# save PV-DM model
with open(util.DOC2VEC_MODEL_DM_FILE, "wb") as f:
    pickle.dump(model_dm, f)

# save PV-DM hyperparameters
with open(util.DOC2VEC_HYPERPARAMETERS_DM_FILE, "wb") as f:
    pickle.dump(hyperparams_dm, f)


In [None]:
# Determine which model type (PV-DBOW or PV-DM) is best for classification.
# PV-DBOW is chosen only when its average F-score is strictly higher; a tie goes to PV-DM.
dbow_is_better = f_score_avg_dbow > f_score_avg_dm
model = model_dbow if dbow_is_better else model_dm
print("Best model type: PV-DBOW" if dbow_is_better else "Best model type: PV-DM")
    

In [None]:
# Project the selected model's document vectors with t-SNE, using cosine distance
# between vectors and a fixed random_state so the embedding can be reproduced.
# The plotting code below reads the first two columns of the result.
X_projected = TSNE(metric='cosine', random_state=0).fit_transform(model.docvecs.vectors_docs)


In [None]:
def scatter(X, drugs_to_plot=None):
    """Scatter-plot projected document vectors, coloring the selected drugs.

    Points whose label in the notebook-global `y` is one of `drugs_to_plot` get
    that drug's color; all other points are drawn in gray. Each highlighted
    drug is annotated with its name at the median position of its points.

    Args:
        X: 2-D array of point coordinates; columns 0 and 1 are plotted.
            Assumes row i of X belongs to label y[i] -- TODO confirm against caller.
        drugs_to_plot: drugs to highlight. Defaults to every distinct label in
            `y`, sorted. The default is resolved at call time, so this function
            can be defined before any list of drugs exists in the notebook.

    Returns:
        (f, ax, sc, txts): the figure, its axes, the scatter collection and the
        list of text labels that were added.
    """
    if drugs_to_plot is None:
        drugs_to_plot = sorted(np.unique(y))
    drugs_to_plot = list(drugs_to_plot)

    # make a custom color palette with seaborn:
    # one hue per highlighted drug, followed by a single gray for everything else
    gray_palette = [[0.8, 0.8, 0.8]]
    color_palette = sns.color_palette("hls", len(drugs_to_plot))
    palette = np.array(color_palette + gray_palette)
    sns.palplot(palette)

    # get colors for selected classes; -1 addresses the last palette row (gray)
    drug_to_rank_dict = {drug: i for i, drug in enumerate(drugs_to_plot)}
    color_indices = np.array([drug_to_rank_dict.get(drug, -1) for drug in y], dtype=int)

    # We create a scatter plot.
    f = plt.figure(figsize=(32, 32))
    ax = plt.subplot(aspect='equal')
    sc = ax.scatter(X[:,0], X[:,1], lw=0, s=120,
                    c=palette[color_indices])
    ax.axis('off')
    ax.axis('tight')

    # We add the labels for each cluster.
    txts = []
    for i, drug in enumerate(drugs_to_plot):
        points = X[color_indices == i, :]
        if len(points) == 0:
            # no point carries this drug: the median would be NaN, so skip the label
            continue
        # Position of each label.
        xtext, ytext = np.median(points, axis=0)
        txt = ax.text(xtext, ytext, drug, fontsize=20)
        txt.set_path_effects([
            PathEffects.Stroke(linewidth=5, foreground="w"),
            PathEffects.Normal()])
        txts.append(txt)

    return f, ax, sc, txts


In [None]:
# Plot the whole t-SNE projection using scatter's default drugs_to_plot.
scatter(X_projected)


In [None]:
# Sorted list of the distinct drug labels found in y.
drugs = sorted(np.unique(y))


In [None]:
# Draw one plot per drug: only that drug is passed as drugs_to_plot, so it is
# the only one colored and labeled; every other report is drawn in gray.
for drug in drugs:
    drugs_to_plot = [drug]
    scatter(X_projected, drugs_to_plot)
    