In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score 
from sklearn.preprocessing import LabelEncoder
from helpers import splitting, get_Xy
import random
from helpers import create_ANN, create_CNN, results_NN, plot_res, results_rf, plot_res_rf, results_svc, results_gnb

# Data loading

In [None]:
# Load the pre-computed feature tables, one CSV per text representation.
# Later cells slice every table with the same row labels via `.loc`.
X_ohe, X_w2v, X_d2v, X_tfidf, X_bert = (
    pd.read_csv(csv_path)
    for csv_path in (
        'TransformedData/X_ohe.csv',
        'TransformedData/X_w2v.csv',
        'TransformedData/X_d2v.csv',
        'TransformedData/X_tfidf.csv',
        'TransformedData/X_bert.csv',
    )
)

# Target labels: keep only the `condition_name` column of y.csv, as a Series.
y = pd.Series(pd.read_csv('TransformedData/y.csv')['condition_name'])

# Created unfitted here; it is passed to the results_* helpers further down.
encoder = LabelEncoder()

In [None]:
# Split once, on the one-hot table, using the project helpers `splitting` and
# `get_Xy`; the labels (y_train / y_test) come out of this split.
train_ohe, test_ohe = splitting(X_ohe, y)
X_train_ohe, y_train = get_Xy(train_ohe)
X_test_ohe, y_test = get_Xy(test_ohe)

# Row labels of both partitions, reused to slice the other representations so
# that every encoding is evaluated on the same rows.
# NOTE(review): assumes `splitting` keeps the original row labels of X_ohe - confirm.
train_indices, test_indices = train_ohe.index, test_ohe.index

X_train_w2v, X_test_w2v = X_w2v.loc[train_indices], X_w2v.loc[test_indices]
X_train_d2v, X_test_d2v = X_d2v.loc[train_indices], X_d2v.loc[test_indices]
X_train_tfidf, X_test_tfidf = X_tfidf.loc[train_indices], X_tfidf.loc[test_indices]
X_train_bert, X_test_bert = X_bert.loc[train_indices], X_bert.loc[test_indices]

In [None]:
# One [X_train, X_test, y_train, y_test] entry per text representation.
# The order (OHE, W2V, D2V, TFIDF, BERT) matches the order of the
# per-encoding entries in every `param_list` below.
dataset = [
    [X_train_ohe, X_test_ohe, y_train, y_test],      # OHE
    [X_train_w2v, X_test_w2v, y_train, y_test],      # W2V
    [X_train_d2v, X_test_d2v, y_train, y_test],      # D2V
    [X_train_tfidf, X_test_tfidf, y_train, y_test],  # TFIDF
    [X_train_bert, X_test_bert, y_train, y_test],    # BERT
]

In [None]:
# Fix a single seed for reproducibility. The same value is passed as
# `random_state` in the parameter lists below, so the global RNGs are seeded
# from the variable instead of repeating the literal.
seed = 0
np.random.seed(seed)
random.seed(seed)
# NOTE(review): the train/test split cell runs before this one. If `splitting`
# draws from a global RNG, that split is not covered by these seeds - confirm.

# Averaging mode intended for the F1 score (not referenced in the cells visible here).
avg = 'weighted'

# Compare performance across dataset types

## Random Forest

Best hyperparameters for each dataset type

In [None]:
# Random Forest settings. All five encodings use the same values, so one fresh
# dict is built per encoding (each entry is an independent dict object).
param_list = [
    {
        'n_estimators': 300,
        'max_depth': 500,
        'min_samples_split': 3,
        'min_samples_leaf': 2,
        'max_features': None,
        'random_state': seed,
        'n_jobs': 8,
    }
    for _encoding in ('OHE', 'W2V', 'D2V', 'TFIDF', 'BERT')
]

In [None]:
# Evaluate Random Forest through the project helper `results_rf`, which receives
# the (unfitted) label encoder, the per-encoding train/test splits and the
# per-encoding parameters; its return value is handed to `plot_res_rf`.
# The helpers' internals are not visible in this notebook.
results_list = results_rf(encoder, dataset, param_list)
plot_res_rf(results_list)

## Linear SVC

Best hyperparameters for each dataset type

In [None]:
# Linear SVC settings, one dict per encoding (same order as `dataset`).
# The W2V entry previously omitted `max_iter` while every other entry set it to
# 1000. It is now explicit for consistency; 1000 is scikit-learn's documented
# LinearSVC default, so the fitted model is unchanged
# (assuming `results_svc` builds a LinearSVC from these kwargs - confirm).
param_list = [
    {'random_state': seed, 'C': 0.3593813663804626, 'loss': 'hinge', 'max_iter': 1000, 'tol': 0.01},          # OHE
    {'random_state': seed, 'C': 21.54434690031882, 'loss': 'hinge', 'max_iter': 1000, 'tol': 0.01},           # W2V
    {'random_state': seed, 'C': 2.782559402207126, 'loss': 'squared_hinge', 'max_iter': 1000, 'tol': 0.01},   # D2V
    {'random_state': seed, 'C': 21.54434690031882, 'loss': 'squared_hinge', 'max_iter': 1000, 'tol': 0.01},   # TFIDF
    {'random_state': seed, 'C': 0.3593813663804626, 'loss': 'squared_hinge', 'max_iter': 1000, 'tol': 0.01},  # BERT
]

In [None]:
# Evaluate Linear SVC through the project helper `results_svc`. Unlike the other
# results_* helpers used in this notebook, its result is unpacked into two values.
# `predictions_breast_cancer` is not used in the cells visible here.
results_list, predictions_breast_cancer = results_svc(encoder, dataset, param_list)
plot_res(results_list)

## Gaussian NB

Best hyperparameters for each dataset type

In [None]:
# Gaussian NB settings: only `var_smoothing` varies between encodings.
# Values are listed in the same order as `dataset`: OHE, W2V, D2V, TFIDF, BERT.
param_list = [
    {'var_smoothing': smoothing}
    for smoothing in (
        1.0,                    # OHE
        0.0001873817422860383,  # W2V
        0.43287612810830584,    # D2V
        1.0,                    # TFIDF
        0.15199110829529336,    # BERT
    )
]

In [None]:
# Evaluate Gaussian NB through the project helper `results_gnb`, called with the
# same (encoder, dataset, param_list) arguments as the other classical models,
# then plot the returned results with `plot_res`.
results_list = results_gnb(encoder, dataset, param_list)
plot_res(results_list)

# Compare with a Neural Network

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [None]:
# One-hot encode the labels for the neural networks, slice them with the existing
# train/test row labels, and convert them to float32 NumPy arrays.
y_ohe = pd.get_dummies(y)
y_train_dum = tf.cast(y_ohe.loc[train_indices], tf.float32).numpy()
y_test_dum = tf.cast(y_ohe.loc[test_indices], tf.float32).numpy()

# Halve the held-out set into validation and test parts.
# NOTE: from here on X_test_* and y_test_dum are rebound to the smaller test half.
# Re-running this cell alone therefore fails (the already-halved X_test_ohe no
# longer matches the freshly rebuilt y_test_dum); re-run the split cell first.
# NOTE(review): assumes X_test_ohe rows are in the same order as `test_indices` - confirm.
X_val_ohe, X_test_ohe, y_val_dum, y_test_dum = train_test_split(
    X_test_ohe, y_test_dum, test_size=0.5, random_state=seed
)

# Row labels of the two halves, used to cut the other representations identically.
val_indices = X_val_ohe.index
new_test_indices = X_test_ohe.index

X_val_w2v = X_test_w2v.loc[val_indices]
X_test_w2v = X_test_w2v.loc[new_test_indices]

# Previously the test half was sliced from the full `X_d2v` frame; it now slices
# `X_test_d2v`, like every other representation in this cell.
X_val_d2v = X_test_d2v.loc[val_indices]
X_test_d2v = X_test_d2v.loc[new_test_indices]

X_val_tfidf = X_test_tfidf.loc[val_indices]
X_test_tfidf = X_test_tfidf.loc[new_test_indices]

X_val_bert = X_test_bert.loc[val_indices]
X_test_bert = X_test_bert.loc[new_test_indices]

In [None]:
# Neural-network splits: one
# [X_train, X_val, X_test, y_train, y_val, y_test] entry per text representation,
# all sharing the same one-hot label arrays. This rebinds `dataset`, replacing
# the 4-element entries used by the classical models.
dataset = [
    [X_train_ohe, X_val_ohe, X_test_ohe, y_train_dum, y_val_dum, y_test_dum],        # OHE
    [X_train_w2v, X_val_w2v, X_test_w2v, y_train_dum, y_val_dum, y_test_dum],        # W2V
    [X_train_d2v, X_val_d2v, X_test_d2v, y_train_dum, y_val_dum, y_test_dum],        # D2V
    [X_train_tfidf, X_val_tfidf, X_test_tfidf, y_train_dum, y_val_dum, y_test_dum],  # TFIDF
    [X_train_bert, X_val_bert, X_test_bert, y_train_dum, y_val_dum, y_test_dum],     # BERT
]

In [None]:
# Seed for the neural-network runs.
# NOTE(review): 42 differs from `seed` (0) used for the classical models. Per the
# Keras documentation this call also seeds Python's `random` and NumPy, so it
# would supersede the earlier global seeding - confirm this is intended.
tf.keras.utils.set_random_seed(42)

## ANN

In [None]:
# ANN settings per encoding. `num_classes` is the width of the one-hot label
# array and the learning rate is 0.001 throughout; only the hidden layer sizes,
# `reg` and `dropout` vary.
param_list = [
    {
        'num_classes': y_train_dum.shape[1],
        'hidden_layers_dims': hidden_dims,
        'lr': 0.001,
        'reg': reg_value,
        'dropout': dropout_rate,
    }
    for hidden_dims, reg_value, dropout_rate in (
        ([64, 128], 0.000001, 0.5),  # OHE
        ([32, 64], 0.001, 0.4),      # W2V
        ([32, 64], 0.001, 0.4),      # D2V
        ([32, 64], 0.001, 0.3),      # TFIDF
        ([32, 64], 0.001, 0.3),      # BERT
    )
]

In [None]:
# Run the ANN configurations through the project helper `results_NN` with its
# default settings (no `epochs` / `cnn` arguments are passed here), then plot.
res = results_NN(dataset, param_list)
plot_res(res)

In [None]:
# CNN settings per encoding (consumed by the `results_NN(..., cnn=True)` call
# that follows). `reg` is 0.001 throughout; the learning rate, dropout and
# number of blocks vary.
param_list = [
    {
        'num_classes': y_train_dum.shape[1],
        'lr': learning_rate,
        'reg': 0.001,
        'dropout': dropout_rate,
        'n_blocks': block_count,
    }
    for learning_rate, dropout_rate, block_count in (
        (0.0001, 0.3, 2),  # OHE
        (0.001, 0.3, 1),   # W2V
        (0.001, 0.5, 1),   # D2V
        (0.001, 0.5, 1),   # TFIDF
        (0.0001, 0.4, 1),  # BERT
    )
]

In [None]:
# Run the CNN configurations: same helper as the ANN run, but with `cnn=True`
# and an explicit 70 epochs. This rebinds `res`, replacing the ANN results.
res=results_NN(dataset, param_list, epochs=70, cnn=True)
plot_res(res)