# Domain keywords - feature set


1. load the test set
2. Load the pickle-object
3. Hyperparameter tuning 
    1. Generate the initial training split
    2. Classical 
        - Support Vector machine (SVM)
        - Logistic Regression (LR)
        - Random Forest (RF)
    3. Shallow deep learning models
        - Tuned using optuna
4. Model evaluation
    1. Classical
    2. Shallow deep learning models
        - Priorly tuned model
        - Optuna hyperparameter set

# 0.5 Imports and functions

In [13]:
# imports
import pickle
import json
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np
import matplotlib.pyplot as plt

# plotting AUC curve
from sklearn.metrics import auc
from sklearn.metrics import roc_curve

# keras/tensorflow
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import regularizers
import tensorflow_addons as tfa
from keras import backend as K


# skearln
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, roc_auc_score, f1_score, make_scorer
from sklearn.utils import shuffle

# ml models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import svm, tree
from sklearn import model_selection

# optimizing models
import optuna
# visualizing optuna loss
from optuna.visualization import plot_optimization_history

# spaCy
import spacy
import en_core_web_lg
import en_core_web_trf
import en_core_web_sm

# plotting AUC curve
from sklearn.metrics import auc
from sklearn.metrics import roc_curve

# scikeras
from scikeras.wrappers import KerasClassifier

# stats
from scipy.stats import sem

# saving
import joblib

In [3]:
# functions

def average_loss(cv_results):

    loss_list = []; val_loss_list = []

    for i in cv_results['estimator']:

        loss_list.append(i.history_['loss'])
        val_loss_list.append(i.history_['val_loss'])

    loss_array = np.array(loss_list); val_loss_array = np.array(val_loss_list)
    average_loss = loss_array.mean(axis=0); average_val_loss = val_loss_array.mean(axis=0)

    plt.plot(average_loss); plt.plot(average_val_loss)
    plt.ylim([0.0, 50])
    plt.show()


def extract_hyper_results(df, param1, param2):
    
    slice1 = df.loc[ ('set1', param1, param2),:]
    #slice2 = df.loc[('set2',   param1, param2),:]
    #slice3 = df.loc[('set3',   param1, param2),:]
    #slice4 = df.loc[('set4',   param1, param2),:]
    #slice5 = df.loc[('set5',   param1, param2),:]

    return pd.DataFrame(data=[slice1]).round(decimals=3)


def strip_html(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

def remove_special_characters(text):
    pattern=r'[^a-zA-z0-9\s]'
    return re.sub(pattern,'',text) 

def remove_elsevier_stuff(text):
    # attmepting to remove stuff journals add to all abstracts, is too many different stuff, and I couldn't find a common regex expression to remove them all. 
    pattern = r'\©'
    pattern1 = r'\©\s\d{1,4}\sElsevier\sLtd'
    pattern2 = r'\©\s\d{1,4}\sElsevier\sB\.V\.'
    pattern3 = r'\©\s\d{0,4}\sElsevier\s[\.A-Za-z]*\sAll\srights\sreserved.'
    pattern4 = r'\©\s\d{1,4}\sMoscoviz\set\sal\.Background\:\s'
    pattern5 = r'\©\s\d{1,4}\,* Akadémiai Kiadó, Budapest, Hungary.'
    pattern6 = r'\©\s\d{1,4}\sAmerican Institute of Chemical Engineers[Chemical Engineers\sBiotechnol\. Prog\.\,\d{1,3}\:\d{1,3}\–\d{1,3}\, \d{1,4}\.]*'
    pattern7 = r'^\©\s\d{1,4}\s*Wiley-VCH Verlag GmbH & Co\. KGaA, Weinheim'
    pattern8 = r'\©\s\d{1,4}\sThe Authors'
    pattern9 = r'\©\s\d{1,4}\sWILEY-VCH Verlag GmbH & Co\. KGaA\.'
    pattern10 = r'\©\s\d{1,4}\sAOCS'
    pattern11 = r'\©\s\d{1,4}\sTaylor \& Francis Group\, LLC\.'
    
    return re.sub(pattern, '', text)


def remove_punct_and_short_words(text):

    not_important_pos = ['PUNCT', 'NUM']

    
    doc = nlp(text)
    new_token_list = []
    for token in doc:
        #print(token, token.pos_, token.dep_)
        if not token.pos_ in not_important_pos:
            if len(token.text) > 2:
                new_token_list.append(token.text)  
            
    return ' '.join(new_token_list)

import spacy #load spacy

not_important_pos = ['PUNCT', 'NUM']
nlp = en_core_web_lg.load()
stops = nlp.Defaults.stop_words
#stops = stopwords.words("english")
def remove_copyright_and_year(text):
    
    pattern = r'©\s*\d{,4}'
    return re.sub(pattern, '', text)

def normalize(comment, lowercase, remove_stopwords):
    if lowercase:
        comment = comment.lower()
    comment = remove_copyright_and_year(comment)
    comment = nlp(comment)

    lemmatized = list()
    for word in comment:
        lemma = word.lemma_.strip()
        if lemma:
            if not remove_stopwords or (remove_stopwords and lemma not in stops):

                lemmatized.append(lemma)
                
    return ' '.join(lemmatized)


#Data['Text_After_Clean'] = Data['Text'].apply(normalize, lowercase=True, remove_stopwords=True)


def run_exps(X_train: pd.DataFrame ,
             y_train: pd.DataFrame,
             X_test: pd.DataFrame,
             y_test: pd.DataFrame, X, y) -> pd.DataFrame:
    '''
    Lightweight script to test many models and find winners:param X_train: training split
    :param y_train: training target vector
    :param X_test: test split
    :param y_test: test target vector
    :return: DataFrame of predictions
    '''
    
    dfs = []
    
    models = [
          ('LogReg', LogisticRegression(max_iter=300)), 
          ('RF', RandomForestClassifier()),
          #('KNN', KNeighborsClassifier()),
          ('SVM', SVC()),

        ]
    results = []
    names = []
    
    scoring = {'accuracy': 'accuracy',
           'recall': 'recall',
           'precision': 'precision',
           'mcc': 'matthews_corrcoef',
           'auc': 'roc_auc',
           'f1_pos': 'f1',
           'f1_neg': make_scorer(f1_score, pos_label=0, average='binary')
          }
    
    #scoring = ['accuracy', 'recall', 'precision', 'f1']
    target_names = ['Relevant', 'Irrelevant']
    
    result_dict = {}
    
    for name, model in models:
        kfold = model_selection.RepeatedStratifiedKFold(n_splits=4, n_repeats=5, random_state=1)
        cv_results = model_selection.cross_validate(model, X, y, cv=kfold, scoring=scoring, return_train_score=True)
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        train_pred = clf.predict(X_train)
        print(name)
        res = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)
        
        print('train')
        print(classification_report(y_train, train_pred, target_names=target_names))
        print('MCC', matthews_corrcoef(y_train, train_pred))
        print('AUC', roc_auc_score(y_train, train_pred))
        print('test')
        print(classification_report(y_test, y_pred, target_names=target_names))
        print('MCC', matthews_corrcoef(y_test, y_pred))
        print('AUC', roc_auc_score(y_test, y_pred))

        
        result_dict[name] = res
        
        results.append(cv_results)
        names.append(name)
        this_df = pd.DataFrame(cv_results)
        this_df['model'] = name
        dfs.append(this_df)
        
        final = pd.concat(dfs, ignore_index=True)
    
    
    
    return final, result_dict

def ann_metrics(model, features, labels):
    
    y_pred = model.predict(features.astype('float64'), batch_size=1, verbose=0)
    #print(y_pred)
    
    y_pred_bool_manual = []
    
    for i in y_pred:
        if i>0.3:
            y_pred_bool_manual.append(1)
        else:
            y_pred_bool_manual.append(0)
    #print(y_pred_bool_manual)
    target_names = ['Irrelevant', 'Relevant']
    res = classification_report(labels, y_pred_bool_manual, target_names=target_names, output_dict=True)
    
    print(classification_report(labels, y_pred_bool_manual, target_names=target_names, zero_division=1))
    
def mcc_metric(y_true, y_pred):
    predicted = tf.cast(tf.greater(y_pred, threshold), tf.float32)
    true_pos = tf.math.count_nonzero(predicted * y_true)
    true_neg = tf.math.count_nonzero((predicted - 1) * (y_true - 1))
    false_pos = tf.math.count_nonzero(predicted * (y_true - 1))
    false_neg = tf.math.count_nonzero((predicted - 1) * y_true)
    x = tf.cast((true_pos + false_pos) * (true_pos + false_neg) 
      * (true_neg + false_pos) * (true_neg + false_neg), tf.float32)
    
    hei = tf.cast((true_pos * true_neg) - (false_pos * false_neg), tf.float32) / tf.sqrt(x)
    print(hei)
    return hei

def mcc_threshold(threshold):
    def mcc_metric2(y_true, y_pred):
        y_pred = K.cast(K.greater(K.clip(y_pred, 0, 1), threshold), K.floatx())

        tp = K.round(K.sum(K.clip(y_true * y_pred, 0, 1)))
        tn = K.round(K.sum(K.clip((1 - y_true) * (1 - y_pred), 0, 1)))
        fp = K.round(K.sum(K.clip((1 - y_true) * y_pred, 0, 1)))
        fn = K.round(K.sum(K.clip(y_true * (1 - y_pred), 0, 1)))

        num = tp * tn - fp * fn
        den = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
        return num / K.sqrt(den + K.epsilon())
    return mcc_metric2


def plot_loss(history):
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.ylim([0, 2])
    plt.xlabel('Epoch')
    plt.ylabel('Error [relevant]')
    plt.legend()
    plt.grid(True)

def plot_auc_curve(x_train, x_test, y_train, y_test):

    y_test_pred = model_8.predict(x_test.toarray())
    y_train_pred = model_8.predict(x_train.toarray())

    fpr_test, tpr_test, thresholds_keras = roc_curve(y_test, y_test_pred)
    fpr_train, tpr_train, thresholds_keras = roc_curve(y_train, y_train_pred)

    auc_test = auc(fpr_test, tpr_test)
    auc_train = auc(fpr_train, tpr_train)

    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr_test, tpr_test, label='train (area = {:.3f})'.format(auc_test))
    plt.plot(fpr_train, tpr_train, label='test (area = {:.3f})'.format(auc_train))
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    #plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()
    
    
def scale_mcc(mcc):
    return ((mcc-(-1))/(1-(-1)))*(1-0)+0


def custom_f1_pos(y_true, y_pred):
    def recall_m(y_true, y_pred):
        TP = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        Positives = K.sum(K.round(K.clip(y_true, 0, 1)))

        recall = TP / (Positives+K.epsilon())
        return recall


    def precision_m(y_true, y_pred):
        TP = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        Pred_Positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

        precision = TP / (Pred_Positives+K.epsilon())
        return precision

    precision, recall = precision_m(y_true, y_pred), recall_m(y_true, y_pred)

    return 2*((precision*recall)/(precision+recall+K.epsilon()))


def custom_f1_neg(y_true, y_pred):

    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tn / (tn + fn + K.epsilon())
    r = tn / (tn + fp + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return 2*((p*r)/(p+r+K.epsilon()))

def calculate_f1_score(y, pred, threshold, pos_label):
    # if this also wrong, I compute myself
    y_pred = pred.copy()

    y_pred[y_pred > threshold] = 1
    y_pred[y_pred < threshold] = 0

    return f1_score(y, y_pred, pos_label=pos_label)

# 1 load the test set

In [6]:
test_set = pd.read_json('data/new_test_set_II.json')
test = pd.read_json('data/validation_set.json')

test_set1 = test_set.rename(columns={'DOI': 'doi', 'Abstract': 'abstract', 'Label':'label'})

full_set = pd.concat([test_set1, test]).reset_index(drop=True)

full_set

Unnamed: 0,doi,abstract,label
0,10.1016/j.cattod.2019.08.027,© 2019 Elsevier B.V.Sodium titanate nanotubes ...,0
1,10.1016/j.fuel.2021.120653,© 2021 Elsevier LtdThis research investigated ...,1
2,10.1016/j.fuel.2014.07.035,This research reports on the synthesis of meth...,1
3,10.1016/j.enconman.2017.03.044,© 2017 Elsevier LtdAn optimization of methyl e...,1
4,10.1016/j.supflu.2011.11.012,This work reports phase equilibrium measuremen...,1
...,...,...,...
495,10.1016/j.fuproc.2010.05.032,"In this study, biodiesel was produced from Mor...",1
496,10.1016/j.combustflame.2013.09.013,The oxidation characteristics of several small...,0
497,10.1016/j.algal.2015.09.006,The feasibility of microalgae based biodiesel ...,0
498,10.1016/j.biortech.2016.05.020,Mixotrophic growth of microalgae to boost lipi...,0


In [4]:
training1 = pd.read_json('data/new_test_set_II.json')
training2 = pd.read_json('data/validation_set.json')
training1 = training1.rename(columns={'DOI':'doi', 'Abstract': 'abstract', 'Label':'label'})

train = pd.concat([training1, training2]).reset_index(drop=True)

# 2 Load the pickle object

generated with CountVectorizer in Scikit-learn

In [5]:
vec = pickle.load(open('bow_features/vector.pickel', 'rb'))

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [6]:
cleaned_df = train.copy()
cleaned_df['abstract'] = cleaned_df['abstract'].apply(denoise_text)
cleaned_df['abstract'] = cleaned_df['abstract'].apply(remove_special_characters)
cleaned_df['abstract'] = cleaned_df['abstract'].apply(normalize, lowercase=True, remove_stopwords=True)
cleaned_df['abstract'] = cleaned_df['abstract'].apply(remove_punct_and_short_words)

In [9]:
cleaned_df

Unnamed: 0,doi,abstract,label
0,10.1016/j.cattod.2019.08.027,elsevier bvsodium titanate nanotubes stn modif...,0
1,10.1016/j.fuel.2021.120653,elsevier ltdthis research investigate water in...,1
2,10.1016/j.fuel.2014.07.035,research report synthesis methylic ethylic bio...,1
3,10.1016/j.enconman.2017.03.044,elsevier ltdan optimization methyl ester synth...,1
4,10.1016/j.supflu.2011.11.012,work report phase equilibrium measurement tern...,1
...,...,...,...
495,10.1016/j.fuproc.2010.05.032,study biodiesel produce moringa oleifera oil u...,1
496,10.1016/j.combustflame.2013.09.013,oxidation characteristic small methyl ethyl es...,0
497,10.1016/j.algal.2015.09.006,feasibility microalgae base biodiesel depend s...,0
498,10.1016/j.biortech.2016.05.020,mixotrophic growth microalgae boost lipid prod...,0


# 3 Hyperparameter tuning


## 3.1 Classical models

### 3.1.1 SVM

In [15]:
# select model and define the hyperparameter range(s)
# SVM

C = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
gamma = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]

svm_dict = {}; index = 0

# divide into X and y
x = cleaned_df['abstract'].values
y = cleaned_df['label'].values

# convert X into the features
X = vec.transform(x).toarray()

res_list = []

for c in C:

    for g in gamma:
        res_dict = {'C':float, 'gamma': float, 'metric': {}}
        single_res = {'train_accuracy': float, 'test_accuracy': float, 'train_recall': float, 'test_recall': float, 'train_precison': float,
                        'test_precison': float,'train_auc': float, 'test_auc': float, 'train_f1_pos':float, 'test_f1_pos': float, 'train_f1_neg': float, 'test_f1_neg': float}


        clf = SVC(C=c, gamma=g)

        # RepeatedStratifiedKfold

        kfold = model_selection.RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=1)

        scoring = {'accuracy': 'accuracy',
                    'recall': 'recall',
                    'precison': 'precision',
                    'auc': 'roc_auc',
                    'f1_pos': 'f1',
                    'f1_neg': make_scorer(f1_score, pos_label=0, average='binary')
                    }  

        cv_results = model_selection.cross_validate(clf, X, y, cv=kfold, scoring=scoring, return_train_score=True)

        for metric, values in cv_results.items():

            single_res[metric] = (np.round(values.mean(), decimals=3), np.round(values.std(), decimals=3))
            

        
        res_dict['C'] = c
        res_dict['gamma'] = g
        res_dict['metric'] = single_res

        res_list.append(res_dict)

svm_dict['set' + str(index + 1)] = res_list

hyper_list = []
data_list = []

for j, res in svm_dict.items():
    
    for i in res:
        
        hyper_list.append((j, i['C'], i['gamma']))

        data_list.append(i['metric'])

index = pd.MultiIndex.from_tuples(hyper_list, names=['train/valset', 'C', 'gamma'])
svm_df = pd.DataFrame(data_list, index=index)

In [27]:
extract_hyper_results(svm_df, 100, 0.00001)

Unnamed: 0,Unnamed: 1,Unnamed: 2,train_accuracy,test_accuracy,train_recall,test_recall,train_mcc,test_mcc,train_auc,test_auc,train_f1_pos,test_f1_pos,train_f1_neg,test_f1_neg,fit_time,score_time
set1,100.0,1e-05,"(0.904, 0.006)","(0.859, 0.032)","(0.896, 0.011)","(0.842, 0.049)","(0.904, 0.006)","(0.86, 0.032)","(0.96, 0.004)","(0.924, 0.025)","(0.902, 0.006)","(0.854, 0.034)","(0.907, 0.005)","(0.863, 0.031)","(0.059, 0.008)","(0.065, 0.008)"


In [53]:
extract_hyper_results(svm_df, 100, 0.00001)

Unnamed: 0,Unnamed: 1,Unnamed: 2,train_accuracy,test_accuracy,train_recall,test_recall,train_mcc,test_mcc,train_auc,test_auc,train_f1_pos,test_f1_pos,train_f1_neg,test_f1_neg,fit_time,score_time
set1,100.0,1e-05,"(0.886, 0.01)","(0.836, 0.037)","(0.882, 0.02)","(0.841, 0.048)","(0.886, 0.01)","(0.836, 0.038)","(0.952, 0.006)","(0.904, 0.03)","(0.885, 0.011)","(0.836, 0.038)","(0.887, 0.009)","(0.835, 0.038)","(0.024, 0.009)","(0.037, 0.009)"
set2,100.0,1e-05,"(0.92, 0.007)","(0.879, 0.026)","(0.896, 0.012)","(0.856, 0.045)","(0.92, 0.007)","(0.88, 0.026)","(0.968, 0.004)","(0.938, 0.02)","(0.916, 0.008)","(0.874, 0.027)","(0.923, 0.007)","(0.883, 0.025)","(0.019, 0.008)","(0.031, 0.006)"
set3,100.0,1e-05,"(0.903, 0.007)","(0.859, 0.031)","(0.886, 0.015)","(0.836, 0.055)","(0.903, 0.007)","(0.861, 0.031)","(0.957, 0.006)","(0.918, 0.027)","(0.901, 0.008)","(0.855, 0.034)","(0.905, 0.006)","(0.863, 0.03)","(0.05, 0.012)","(0.072, 0.012)"
set4,100.0,1e-05,"(0.893, 0.008)","(0.845, 0.029)","(0.869, 0.016)","(0.81, 0.053)","(0.893, 0.008)","(0.845, 0.028)","(0.956, 0.005)","(0.911, 0.026)","(0.885, 0.009)","(0.83, 0.033)","(0.901, 0.007)","(0.857, 0.026)","(0.029, 0.009)","(0.045, 0.017)"
set5,100.0,1e-05,"(0.928, 0.007)","(0.884, 0.026)","(0.933, 0.01)","(0.887, 0.046)","(0.928, 0.007)","(0.885, 0.026)","(0.978, 0.004)","(0.95, 0.016)","(0.928, 0.007)","(0.883, 0.026)","(0.928, 0.007)","(0.884, 0.027)","(0.024, 0.005)","(0.04, 0.009)"


### 3.1.2 LR

In [68]:
# select model and define the hyperparameter range(s)
# logistic regression

C = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000] # [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'newton-cholesky', 'sag', 'saga'] # ['newton-cg', 'lbfgs', 'liblinear', 'newton-cholesky', 'sag', 'saga']
lr_dict = {}

# divide into X and y
x = cleaned_df['abstract'].values
y = cleaned_df['label'].values

# convert X into the features
X = vec.transform(x).toarray()
res_list = []

for c in C:
    
    for solver in solvers:
        
        res_dict = {'C':float, 'solver': str, 'metric': {}}
        single_res = {'train_accuracy': float, 'test_accuracy': float, 'train_recall': float, 'test_recall': float, 'train_precison': float,
                        'test_precison': float,'train_auc': float, 'test_auc': float, 'train_f1_pos':float, 'test_f1_pos': float, 'train_f1_neg': float, 'test_f1_neg': float}

        clf = LogisticRegression(C=c, solver=solver, max_iter=300)

        # RepeatedStratifiedKfold

        kfold = model_selection.RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=1)

        scoring = {'accuracy': 'accuracy',
                    'recall': 'recall',
                    'precison': 'precision',
                    'auc': 'roc_auc',
                    'f1_pos': 'f1',
                    'f1_neg': make_scorer(f1_score, pos_label=0, average='binary')
                    }  

        cv_results = model_selection.cross_validate(clf, X, y, cv=kfold, scoring=scoring, return_train_score=True)

        for metric, values in cv_results.items():
            single_res[metric] = (np.round(values.mean(), decimals=3), np.round(values.std(), decimals=3))


        res_dict['C'] = c
        res_dict['solver'] = solver
        res_dict['metric'] = single_res

        res_list.append(res_dict)

lr_dict['set1'] = res_list
    
hyper_list = []
data_list = []

for j, res in lr_dict.items():
    
    for i in res:
        
        hyper_list.append((i['C'], i['solver']))

        data_list.append(i['metric'])
        
index = pd.MultiIndex.from_tuples(hyper_list, names=['C', 'solver'])
lr_df = pd.DataFrame(data_list, index=index)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [69]:
lr_df.loc[ (0.001, 'lbfgs'),:]

train_accuracy    (0.882, 0.008)
test_accuracy     (0.858, 0.032)
train_recall      (0.853, 0.012)
test_recall       (0.831, 0.055)
train_precison    (0.901, 0.009)
test_precison     (0.876, 0.048)
train_auc         (0.941, 0.005)
test_auc          (0.922, 0.026)
train_f1_pos      (0.876, 0.009)
test_f1_pos       (0.851, 0.035)
train_f1_neg      (0.887, 0.008)
test_f1_neg       (0.864, 0.031)
fit_time          (0.018, 0.003)
score_time        (0.011, 0.002)
Name: (0.001, lbfgs), dtype: object

### 3.1.3 RF

In [70]:
# select model and define the hyperparameter range(s)
# random forest

N = [1,2,3,4,5,6,7,8,9] + [100] + [1000] + [10000]
criterions = ['gini', 'entropy', 'log_loss']

rf_dict = {}
    
# divide into X and y
x = cleaned_df['abstract'].values
y = cleaned_df['label'].values

# convert X into the features
X = vec.transform(x).toarray()
res_list = []

for n in N:
    
    for crit in criterions:
        
        res_dict = {'C':float, 'criterion': str, 'metric': {}}
        single_res = {'train_accuracy': float, 'test_accuracy': float, 'train_recall': float, 'test_recall': float, 'train_precison': float,
                        'test_precison': float,'train_auc': float, 'test_auc': float, 'train_f1_pos':float, 'test_f1_pos': float, 'train_f1_neg': float, 'test_f1_neg': float}

        

        clf = RandomForestClassifier(n_estimators=n, criterion=crit)

        # RepeatedStratifiedKfold

        kfold = model_selection.RepeatedStratifiedKFold(n_splits=4, n_repeats=5, random_state=1)

        scoring = {'accuracy': 'accuracy',
                    'recall': 'recall',
                    'precison': 'precision',
                    'auc': 'roc_auc',
                    'f1_pos': 'f1',
                    'f1_neg': make_scorer(f1_score, pos_label=0, average='binary')
                    }  

        cv_results = model_selection.cross_validate(clf, X, y, cv=kfold, scoring=scoring, return_train_score=True)

        for metric, values in cv_results.items():
            single_res[metric] = (np.round(values.mean(), decimals=3), np.round(values.std(), decimals=3))


        res_dict['N'] = n
        res_dict['criterion'] = crit
        res_dict['metric'] = single_res

        res_list.append(res_dict)

rf_dict['set1'] = res_list
    
hyper_list = []
data_list = []

for j, res in rf_dict.items():
    
    for i in res:
        
        hyper_list.append((i['N'], i['criterion']))

        data_list.append(i['metric'])

index = pd.MultiIndex.from_tuples(hyper_list, names=['n-estimators', 'solver'])
rf_df = pd.DataFrame(data_list, index=index)

In [71]:
rf_df.loc[(10000, 'entropy')]

train_accuracy         (1.0, 0.0)
test_accuracy      (0.879, 0.031)
train_recall           (1.0, 0.0)
test_recall        (0.877, 0.047)
train_precison         (1.0, 0.0)
test_precison       (0.878, 0.04)
train_auc              (1.0, 0.0)
test_auc            (0.945, 0.02)
train_f1_pos           (1.0, 0.0)
test_f1_pos        (0.876, 0.033)
train_f1_neg           (1.0, 0.0)
test_f1_neg        (0.881, 0.031)
fit_time          (27.141, 0.301)
score_time         (2.577, 0.213)
Name: (10000, entropy), dtype: object

## 3.3 Shallow neural net

### 3.3.1 Tuning with optuna

In [34]:
vec.transform(cleaned_df['abstract'].sample(frac=0.5))

<250x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 15934 stored elements in Compressed Sparse Row format>

In [7]:
five_fold = []

kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=1)

for train_index, test_index in kf.split(train):
    t = train.loc[train_index, :]
    v = train.loc[test_index, :]

    five_fold.append((t,v))

In [8]:
def create_ann_model(trial):
    
    n_hidden = trial.suggest_int('n_hidden', 1, 10)
    n_units = trial.suggest_int('n_units', 1, 64)
    
    #hidden_activation = trial.suggest_categorical('activation', ['elu', 'relu'])
    k_regularizer = trial.suggest_float('kernel_regularizer', 1e-5, 1e-0, log=True)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-0, log=True)
    
    dropout = trial.suggest_float('dropout', 0.0, 0.9)
    threshold = trial.suggest_float('threshold', 0.1, 0.9)
    
    
    
    
    model = keras.Sequential()
    model.add(keras.layers.InputLayer(input_shape = (1000,)))

    for i in range(n_hidden):
        
        model.add(keras.layers.Dense(n_units, activation='relu', kernel_regularizer=regularizers.L1(k_regularizer)))

    model.add(keras.layers.Dropout(dropout))

    model.add(keras.layers.Dense(1, activation='sigmoid'))
        
                             
    
    model.compile(loss=keras.losses.BinaryCrossentropy(from_logits=False
                                                   ),
                 optimizer = keras.optimizers.Adam(learning_rate=learning_rate),
                 metrics=['AUC', keras.metrics.BinaryAccuracy(threshold=threshold),
                                 keras.metrics.Recall(thresholds=threshold),
                                 keras.metrics.Precision(thresholds=threshold)])
    
    return model
    
    
def objective(trial):
    
    loss_list = []; auc_list = []; acc_list = []; rec_list = []; prec_list = []
    
    for t, v in five_fold:
        
        # divide into X and y and convert the text into features
        X_train = vec.transform(t['abstract'].values).toarray(); y_train = t['label'].values
        X_test = vec.transform(v['abstract'].values).toarray(); y_test = v['label'].values
        
        model = create_ann_model(trial)
    
        model.fit(X_train, y_train, validation_split=0.25, verbose=0, epochs=100, batch_size=100)
    
        score = model.evaluate(X_test, y_test, verbose=0)

        loss_list.append(score[0])
        auc_list.append(score[1])
        acc_list.append(score[2])
        rec_list.append(score[3])
        prec_list.append(score[4])
    
    return np.array(loss_list).mean(), np.array(auc_list).mean(), np.array(acc_list).mean(), np.array(rec_list).mean()


study = optuna.create_study(study_name='BoW-features',
                            directions=['minimize', 'maximize', 'maximize', 'maximize'],
                            sampler=optuna.samplers.NSGAIISampler(), 
                            pruner=optuna.pruners.MedianPruner())

study.optimize(objective, n_trials=250, n_jobs=1)

[32m[I 2023-09-10 22:44:01,394][0m A new study created in memory with name: BoW-features[0m
[32m[I 2023-09-10 22:44:39,268][0m Trial 0 finished with values: [3.7045527935028075, 0.5954364538192749, 0.5999999940395355, 0.9714285731315613] and parameters: {'n_hidden': 3, 'n_units': 42, 'kernel_regularizer': 3.252320601075059e-05, 'learning_rate': 0.4356979856277655, 'dropout': 0.21382376483484197, 'threshold': 0.37404947489402296}. [0m
[32m[I 2023-09-10 22:45:22,882][0m Trial 1 finished with values: [23.062834167480467, 0.5641550183296203, 0.4900000035762787, 1.0] and parameters: {'n_hidden': 8, 'n_units': 9, 'kernel_regularizer': 0.04752892339894869, 'learning_rate': 1.4988822340920365e-05, 'dropout': 0.048525896101412724, 'threshold': 0.27265874826006264}. [0m
[32m[I 2023-09-10 22:46:04,789][0m Trial 2 finished with values: [0.9944956064224243, 0.8584943056106568, 0.7419999957084655, 0.8596825480461121] and parameters: {'n_hidden': 7, 'n_units': 6, 'kernel_regularizer': 5.52

In [10]:
len(study.trials)

250

In [14]:
bow_df = study.trials_dataframe()
bow_df = bow_df.rename(columns=({'values_0': 'accuracy', 'values_1': 'recall', 'values_2': 'f1', 'values_3': 'precison'}))
bow_df.to_json('data/hyperparameter_search/bag_of_words/BoW_ann.json')

joblib.dump(study, 'data/hyperparameter_search/bag_of_words/BoW_ann.pkl')

['data/hyperparameter_search/bag_of_words/BoW_ann.pkl']

In [351]:
for i in study.best_trials:
    print('model#: ', i.number)
    #print('performance: ', i.values)
    print('parameter_set: ', i.params)

model#:  1
parameter_set:  {'n_hidden': 9, 'n_units': 5, 'kernel_regularizer': 0.00035721607432455795, 'learning_rate': 0.00014802897107670955, 'dropout': 0.23753685374568326, 'threshold': 0.501941517418462}
model#:  2
parameter_set:  {'n_hidden': 4, 'n_units': 39, 'kernel_regularizer': 0.010435986651078987, 'learning_rate': 0.0017945330854873274, 'dropout': 0.420905983458333, 'threshold': 0.9223544331901787}
model#:  7
parameter_set:  {'n_hidden': 5, 'n_units': 40, 'kernel_regularizer': 0.0009997092744088631, 'learning_rate': 0.0003593830342512407, 'dropout': 0.3001342570565156, 'threshold': 0.8043402703667024}


In [42]:
for i in study.best_trials:
    print('model#: ', i.number)
    print('performance: ', i.values)
    print('parameter_set: ', i.params)

model#:  0
performance:  [1.1136011362075806, 0.6867601633071899, 0.5020000040531158, 1.0]
parameter_set:  {'n_hidden': 7, 'n_units': 62, 'kernel_regularizer': 9.705524802098772e-05, 'learning_rate': 0.00022315887005935426, 'dropout': 0.5356312187232736, 'threshold': 0.30636787388249}
model#:  1
performance:  [0.49386538863182067, 0.9132558941841126, 0.7860000133514404, 0.60801260471344]
parameter_set:  {'n_hidden': 1, 'n_units': 52, 'kernel_regularizer': 2.0561657151544554e-05, 'learning_rate': 0.0004334924000007298, 'dropout': 0.13257784552647578, 'threshold': 0.964929234427516}
model#:  2
performance:  [0.9860644698143005, 0.9181377530097962, 0.8180000066757203, 0.9044602036476135]
parameter_set:  {'n_hidden': 3, 'n_units': 39, 'kernel_regularizer': 0.00044140545887066046, 'learning_rate': 0.00014506328714941635, 'dropout': 0.27487045059901277, 'threshold': 0.2711898939392464}
model#:  6
performance:  [0.8480913639068604, 0.7416256546974183, 0.5960000157356262, 0.987744677066803]
pa

In [71]:
for i in study.best_trials:
    #print('model#: ', i.number)
    #print('performance: ', i.values)
    print('parameter_set: ', i.params)

parameter_set:  {'n_hidden': 1, 'n_units': 57, 'activation': 'relu', 'kernel_regularizer': 1.358415564119472e-05, 'learning_rate': 0.0003863053283527932, 'dropout': 0.4202520981765924, 'threshold': 0.7665980174820468}
parameter_set:  {'n_hidden': 5, 'n_units': 38, 'activation': 'relu', 'kernel_regularizer': 1.358415564119472e-05, 'learning_rate': 0.0005411428551177511, 'dropout': 0.052179566680561985, 'threshold': 0.5437192158028702}
parameter_set:  {'n_hidden': 1, 'n_units': 22, 'activation': 'relu', 'kernel_regularizer': 1.358415564119472e-05, 'learning_rate': 0.0003863053283527932, 'dropout': 0.052179566680561985, 'threshold': 0.7665980174820468}
parameter_set:  {'n_hidden': 2, 'n_units': 57, 'activation': 'relu', 'kernel_regularizer': 1.358415564119472e-05, 'learning_rate': 0.00014624928787350703, 'dropout': 0.2951334265681061, 'threshold': 0.27264665452348325}
parameter_set:  {'n_hidden': 1, 'n_units': 38, 'activation': 'relu', 'kernel_regularizer': 1.358415564119472e-05, 'learnin

# 4 Model evaluation

## Generate the training/test splits

In [20]:
# import the unseen test for final testing of the models
test = pd.read_json('data/unseen_test.json')

if 'doi' in test.columns:
    test = test.drop(columns='doi')

test = test.copy()
test['abstract'] = test['abstract'].apply(denoise_text)
test['abstract'] = test['abstract'].apply(remove_special_characters)
test['abstract'] = test['abstract'].apply(normalize, lowercase=True, remove_stopwords=True)
test['abstract'] = test['abstract'].apply(remove_punct_and_short_words)

train = cleaned_df.copy()

In [125]:
# splitting into chunks
training_chunk1 = cleaned_df[:100]
training_chunk2 = cleaned_df[100:200]
training_chunk3 = cleaned_df[200:300]
training_chunk4 = cleaned_df[300:400]
training_chunk5 = cleaned_df[400:]

# create the five different training sets                                                                             # excluded
set1 = pd.concat([training_chunk1,training_chunk2,training_chunk3,training_chunk4]).reset_index(drop=True)            # training_chunk5
set2 = pd.concat([training_chunk1,training_chunk2,training_chunk3,training_chunk5]).reset_index(drop=True)            # training_chunk4
set3 = pd.concat([training_chunk1,training_chunk2,training_chunk4,training_chunk5]).reset_index(drop=True)            # training_chunk3
set4 = pd.concat([training_chunk1,training_chunk3,training_chunk4,training_chunk5]).reset_index(drop=True)            # training_chunk2
set5 = pd.concat([training_chunk2,training_chunk3,training_chunk4,training_chunk5]).reset_index(drop=True)            # training_chunk1

## 4.1 Classical models

### 4.1.1 SVM

In [86]:
res_dict1 = {}


for c, gamma in zip([100], [0.0001]):
    
    svm_res_dict = {'c':float, 'gamma':float, 'metrics': {}}
    single_res = {'train_accuracy': float, 'test_accuracy': float, 'train_recall': float, 'test_recall': float, 'train_prec': float,
                  'test_prec': float,'train_auc': float, 'test_auc': float, 'train_f1_pos':float, 'test_f1_pos': float, 'train_f1_neg': float, 'test_f1_neg': float}
    
    train_accuracy = []; test_accuracy = []; train_recall = []; test_recall = []; train_prec = []; test_prec = []; train_auc = []; test_auc = []
    train_f1_pos = []; test_f1_pos = []; train_f1_neg = []; test_f1_neg = []

    X_train_ = vec.transform(train['abstract'].values).toarray(); y_train_ = train['label'].values
    X_test_ = vec.transform(test['abstract'].values).toarray(); y_test_ = test['label'].values


    clf1 = SVC(C = c, gamma=gamma)
    clf1.fit(X_train_, y_train_)

    # get the rest of the metrics 
    # do this by dictionary can decrease the amount of code-lines significantly
    
    train_accuracy.append(accuracy_score(y_train_, clf1.predict(X_train_)))
    test_accuracy.append(accuracy_score(y_test_, clf1.predict(X_test_)))

    train_recall.append(recall_score(y_train_, clf1.predict(X_train_)))
    test_recall.append(recall_score(y_test_, clf1.predict(X_test_)))

    train_prec.append(precision_score(y_train_, clf1.predict(X_train_)))
    test_prec.append(precision_score(y_test_, clf1.predict(X_test_)))

    train_auc.append(roc_auc_score(y_train_, clf1.predict(X_train_)))
    test_auc.append(roc_auc_score(y_test_, clf1.predict(X_test_)))

    train_f1_pos.append(f1_score(y_train_, clf1.predict(X_train_)))
    test_f1_pos.append(f1_score(y_test_, clf1.predict(X_test_)))

    train_f1_neg.append(f1_score(y_train_, clf1.predict(X_train_), pos_label=0))
    test_f1_neg.append(f1_score(y_test_, clf1.predict(X_test_), pos_label=0))
        
    for name, values in zip(['train_accuracy', 'test_accuracy', 'train_recall', 'test_recall', 'train_prec', 'test_prec', 'train_auc', 'test_auc', 'train_f1_pos', 'test_f1_pos', 'train_f1_neg', 'test_f1_neg'],
                            [train_accuracy, test_accuracy, train_recall, test_recall, train_prec, test_prec, train_auc, test_auc, train_f1_pos, test_f1_pos, train_f1_neg, test_f1_neg]):
        
        array = np.array(values)
        single_res[name] = np.round(array.mean(), decimals=4)

        
    svm_res_dict['c'] = c
    svm_res_dict['gamma'] = gamma
    svm_res_dict['metrics'] = single_res
    
    res_dict1['1'] = svm_res_dict

In [82]:
precision_score(y_test_, clf1.predict(X_test_))

0.86

In [87]:
hyper_list = []
data_list = []

for j, res in res_dict1.items():
    
    
        
    hyper_list.append((j, res['c'], res['gamma']))

    data_list.append(res['metrics'])

index = pd.MultiIndex.from_tuples(hyper_list, names=['hyper_set', 'C', 'gamma'])
svm_best_df = pd.DataFrame(data_list, index=index)
svm_best_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_accuracy,test_accuracy,train_recall,test_recall,train_prec,test_prec,train_auc,test_auc,train_f1_pos,test_f1_pos,train_f1_neg,test_f1_neg
hyper_set,C,gamma,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,100,0.0001,0.978,0.94,0.9714,0.9412,0.9835,0.9412,0.9779,0.94,0.9774,0.9412,0.9786,0.9388


### 4.1.2 LR

In [138]:
# need shuffling? check the 

res_dict2 = {}

for c, solver in zip([0.1], ['lbfgs']):
    
    lr_res_dict = {'c':float, 'solver':str, 'metrics': {}}
    single_res = {'train_accuracy': float, 'test_accuracy': float, 'train_recall': float, 'test_recall': float, 'train_prec': float,
                  'test_prec': float,'train_auc': float, 'test_auc': float, 'train_f1_pos':float, 'test_f1_pos': float, 'train_f1_neg': float, 'test_f1_neg': float}
    
    train_accuracy = []; test_accuracy = []; train_recall = []; test_recall = []; train_prec = []; test_prec = []; train_auc = []; test_auc = []
    train_f1_pos = []; test_f1_pos = []; train_f1_neg = []; test_f1_neg = []

    X_train_ = vec.transform(train['abstract'].values).toarray(); y_train_ = train['label'].values
    X_test_ = vec.transform(test['abstract'].values).toarray(); y_test_ = test['label'].values


    clf2 = LogisticRegression(C=c, solver=solver)
    clf2.fit(X_train_, y_train_)

    # get the rest of the metrics
    # do this by dictionary can decrease the amount of code-lines significantly
    
    train_accuracy.append(accuracy_score(y_train_, clf2.predict(X_train_)))
    test_accuracy.append(accuracy_score(y_test_, clf2.predict(X_test_)))

    train_recall.append(recall_score(y_train_, clf2.predict(X_train_)))
    test_recall.append(recall_score(y_test_, clf2.predict(X_test_)))

    train_prec.append(precision_score(y_train_, clf2.predict(X_train_)))
    test_prec.append(precision_score(y_test_, clf2.predict(X_test_)))

    train_auc.append(roc_auc_score(y_train_, clf2.predict(X_train_)))
    test_auc.append(roc_auc_score(y_test_, clf2.predict(X_test_)))

    train_f1_pos.append(f1_score(y_train_, clf2.predict(X_train_)))
    test_f1_pos.append(f1_score(y_test_, clf2.predict(X_test_)))

    train_f1_neg.append(f1_score(y_train_, clf2.predict(X_train_), pos_label=0))
    test_f1_neg.append(f1_score(y_test_, clf2.predict(X_test_), pos_label=0))
    
    for name, values in zip(['train_accuracy', 'test_accuracy', 'train_recall', 'test_recall', 'train_prec', 'test_prec', 'train_auc', 'test_auc', 'train_f1_pos', 'test_f1_pos', 'train_f1_neg', 'test_f1_neg'],
                            [train_accuracy, test_accuracy, train_recall, test_recall, train_prec, test_prec, train_auc, test_auc, train_f1_pos, test_f1_pos, train_f1_neg, test_f1_neg]):
        
        array = np.array(values)
        single_res[name] = np.round(array.mean(), decimals=3)
        
        
    lr_res_dict['c'] = c
    lr_res_dict['solver'] = solver
    lr_res_dict['metrics'] = single_res
    
    res_dict2['1'] = lr_res_dict


In [139]:
hyper_list1 = []
data_list1 = []

for j, res in res_dict2.items():
    
    
        
    hyper_list1.append((j, res['c'], res['solver']))

    data_list1.append(res['metrics'])

index = pd.MultiIndex.from_tuples(hyper_list1, names=['hyper_set', 'C', 'solver'])
lr_best_df = pd.DataFrame(data_list1, index=index)
lr_best_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_accuracy,test_accuracy,train_recall,test_recall,train_prec,test_prec,train_auc,test_auc,train_f1_pos,test_f1_pos,train_f1_neg,test_f1_neg
hyper_set,C,solver,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0.1,lbfgs,0.984,0.93,0.98,0.941,0.988,0.923,0.984,0.93,0.984,0.932,0.984,0.928


### 4.1.3 RF

In [118]:
res_dict3 = {}

for n_estimators, criterion in zip([1000], ['entropy']):
    
    rf_res_dict = {'n_estimators':int, 'criterion':str, 'metrics': {}}
    single_res = {'train_accuracy': float, 'test_accuracy': float, 'train_recall': float, 'test_recall': float, 'train_prec': float,
                  'test_prec': float,'train_auc': float, 'test_auc': float, 'train_f1_pos':float, 'test_f1_pos': float, 'train_f1_neg': float, 'test_f1_neg': float}
    
    train_accuracy = []; test_accuracy = []; train_recall = []; test_recall = []; train_prec = []; test_prec = []; train_auc = []; test_auc = []
    train_f1_pos = []; test_f1_pos = []; train_f1_neg = []; test_f1_neg = []
    
    X_train_ = vec.transform(train['abstract'].values).toarray(); y_train_ = train['label'].values
    X_test_ = vec.transform(test['abstract'].values).toarray(); y_test_ = test['label'].values


    clf3 = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion)
    clf3.fit(X_train_, y_train_)

    # get the rest of the metrics
    # do this by dictionary can decrease the amount of code-lines significantly
    
    train_accuracy.append(accuracy_score(y_train_, clf3.predict(X_train_)))
    test_accuracy.append(accuracy_score(y_test_, clf3.predict(X_test_)))

    train_recall.append(recall_score(y_train_, clf3.predict(X_train_)))
    test_recall.append(recall_score(y_test_, clf3.predict(X_test_)))

    train_prec.append(precision_score(y_train_, clf3.predict(X_train_)))
    test_prec.append(precision_score(y_test_, clf3.predict(X_test_)))

    train_auc.append(roc_auc_score(y_train_, clf3.predict(X_train_)))
    test_auc.append(roc_auc_score(y_test_, clf3.predict(X_test_)))

    train_f1_pos.append(f1_score(y_train_, clf3.predict(X_train_)))
    test_f1_pos.append(f1_score(y_test_, clf3.predict(X_test_)))

    train_f1_neg.append(f1_score(y_train_, clf3.predict(X_train_), pos_label=0))
    test_f1_neg.append(f1_score(y_test_, clf3.predict(X_test_), pos_label=0))
    
    for name, values in zip(['train_accuracy', 'test_accuracy', 'train_recall', 'test_recall', 'train_prec', 'test_prec', 'train_auc', 'test_auc', 'train_f1_pos', 'test_f1_pos', 'train_f1_neg', 'test_f1_neg'],
                            [train_accuracy, test_accuracy, train_recall, test_recall, train_prec, test_prec, train_auc, test_auc, train_f1_pos, test_f1_pos, train_f1_neg, test_f1_neg]):
        
        array = np.array(values)
        single_res[name] = np.round(array.mean(), decimals=3)
        
        
    rf_res_dict['n_estimators'] = n_estimators
    rf_res_dict['criterion'] = criterion
    rf_res_dict['metrics'] = single_res
    
    res_dict3['1'] = rf_res_dict

In [119]:
hyper_list = []
data_list = []

for j, res in res_dict3.items():
    
    
        
    hyper_list.append((j, res['n_estimators'], res['criterion']))

    data_list.append(res['metrics'])

index = pd.MultiIndex.from_tuples(hyper_list, names=['hyper_set', 'n_estimators', 'criterion'])
rf_best_df = pd.DataFrame(data_list, index=index)
rf_best_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,train_accuracy,test_accuracy,train_recall,test_recall,train_prec,test_prec,train_auc,test_auc,train_f1_pos,test_f1_pos,train_f1_neg,test_f1_neg
hyper_set,n_estimators,criterion,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1000,entropy,1.0,0.95,1.0,0.98,1.0,0.926,1.0,0.949,1.0,0.952,1.0,0.947


## 4.2.1 Neural networks

In [70]:
def get_ann_model(n_hidden, n_units, activation, kernel_regularizer, lr, do, threshold):
    
    

    model = keras.Sequential()
    model.add(keras.layers.InputLayer(input_shape = (1000,)))

    for i in range(n_hidden):
        
        model.add(keras.layers.Dense(n_units, activation=activation, kernel_regularizer=regularizers.L2(kernel_regularizer)))

    model.add(keras.layers.Dropout(do))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
        
    
    model.compile(optimizer = keras.optimizers.Adam(learning_rate=lr),
                      loss='binary_crossentropy', 
                      metrics=['AUC',
                            keras.metrics.BinaryAccuracy(threshold=threshold),
                            keras.metrics.Recall(thresholds=threshold),
                            keras.metrics.Precision(thresholds=threshold)])



    return model

# parametersets
# 1. {'vector_size': 95, 'context_window': 9, 'min_count': 4, 'epochs': 8, 'dim': 224, 'padding': 'pre', 'learning_rate': 0.013121832123684417, 'dropout': 0.4402980193837679, 'threshold': 0.5099871650242483}
# 2. {'vector_size': 98, 'context_window': 8, 'min_count': 4, 'epochs': 7, 'dim': 217, 'padding': 'pre', 'learning_rate': 0.002104272243300588, 'dropout': 0.5665904244287674, 'threshold': 0.4837522534769876}


def test_models(n_hidden, n_units, activation, kernel_regularizer, learning_rate, dropout, threshold):

    res_dict = {}; temp_dict = {}
    train_dict = {}; test_dict = {}

    tr_loss_list = []; tr_auc_list = []; tr_prec_list = []; tr_acc_list = []; tr_rec_list = []; tr_fpos_list = []; tr_fneg_list = []
    te_loss_list = []; te_auc_list = []; te_prec_list = []; te_acc_list = []; te_rec_list = []; te_fpos_list = []; te_fneg_list = []
    
    X = vec.transform(train['abstract'].values).toarray(); y = train['label'].values
    X_test = vec.transform(test['abstract'].values).toarray(); y_test = test['label'].values

    for rs in range(10):
        
        ann_model = get_ann_model(n_hidden = n_hidden,
                                    n_units = n_units,
                                    activation = activation,
                                    kernel_regularizer = kernel_regularizer,
                                    lr=learning_rate, 
                                    do=dropout, 
                                    threshold=threshold)
        
        
        ann_model.fit(X, y, validation_split=0.25, verbose=0, epochs=100, batch_size=100)

        tr_score = ann_model.evaluate(X, y, verbose=0)
        te_score = ann_model.evaluate(X_test, y_test, verbose=0)

        ################# Manually handle the f1-score #####################
        # train
        pred = ann_model.predict(X, verbose=0) # y is true
        f1_train_pos = calculate_f1_score(y, pred, threshold=threshold, pos_label=1)
        f1_trian_neg = calculate_f1_score(y, pred, threshold=threshold, pos_label=0)

        # test
        test_pred = ann_model.predict(X_test, verbose=0)
        f1_test_pos = calculate_f1_score(y_test, test_pred, threshold=threshold, pos_label=1)
        f1_test_neg = calculate_f1_score(y_test, test_pred, threshold=threshold, pos_label=0)
        
        # save the f1-scores
        tr_fpos_list.append(f1_train_pos); tr_fneg_list.append(f1_trian_neg)
        te_fpos_list.append(f1_test_pos); te_fneg_list.append(f1_test_neg)
        
        tr_loss_list.append(tr_score[0]); tr_auc_list.append(tr_score[1]);  tr_acc_list.append(tr_score[2]); tr_rec_list.append(tr_score[3]); tr_prec_list.append(tr_score[4])
        te_loss_list.append(te_score[0]); te_auc_list.append(te_score[1]);  te_acc_list.append(te_score[2]); te_rec_list.append(te_score[3]); te_prec_list.append(te_score[4])
        
    train_dict['loss'] = tr_loss_list; train_dict['auc'] = tr_auc_list; train_dict['prec'] = tr_prec_list; train_dict['acc'] = tr_acc_list; train_dict['rec'] = tr_rec_list; train_dict['f1pos'] = tr_fpos_list; train_dict['f1neg'] = tr_fneg_list
    test_dict['loss'] = te_loss_list; test_dict['auc'] = te_auc_list; test_dict['prec'] = te_prec_list; test_dict['acc'] = te_acc_list; test_dict['rec'] = te_rec_list; test_dict['f1pos'] = te_fpos_list; test_dict['f1neg'] = te_fneg_list
    

    temp_dict['train'] = train_dict
    temp_dict['test'] = test_dict

    
        
    return temp_dict

In [42]:
# filtering, to get the best among the stores "best_trials"

for i in study.best_trials:
    #print(i.params, ',')
    if i.values[0] > 0.80:
        if i.values[1] > 0.80:
            if i.values[2] > 0.80:
                if i.values[3] > 0.80:
                    #print(i.values)
                    print(i.params, ',')

{'n_hidden': 3, 'n_units': 34, 'kernel_regularizer': 1.7151027836617935e-05, 'learning_rate': 0.0007970799285309584, 'dropout': 0.20896209008607705, 'threshold': 0.3544065415833555} ,
{'n_hidden': 1, 'n_units': 44, 'kernel_regularizer': 0.0005489530832949244, 'learning_rate': 0.0005834382440750414, 'dropout': 0.48787589688314237, 'threshold': 0.7622802703643394} ,
{'n_hidden': 10, 'n_units': 19, 'kernel_regularizer': 4.8737081393919183e-05, 'learning_rate': 5.56666629147225e-05, 'dropout': 0.6376447344086704, 'threshold': 0.2292333004984978} ,
{'n_hidden': 1, 'n_units': 37, 'kernel_regularizer': 0.0005489530832949244, 'learning_rate': 0.0006521686023472134, 'dropout': 0.2862399358316623, 'threshold': 0.8171312422835556} ,
{'n_hidden': 7, 'n_units': 6, 'kernel_regularizer': 1.7151027836617935e-05, 'learning_rate': 0.0013414232848841775, 'dropout': 0.6061345542965496, 'threshold': 0.3544065415833555} ,
{'n_hidden': 9, 'n_units': 44, 'kernel_regularizer': 0.00032005478454991034, 'learning

In [73]:
hyper_list = [{'n_hidden': 3, 'n_units': 34, 'kernel_regularizer': 1.7151027836617935e-05, 'learning_rate': 0.0007970799285309584, 'dropout': 0.20896209008607705, 'threshold': 0.3544065415833555} ,
{'n_hidden': 1, 'n_units': 44, 'kernel_regularizer': 0.0005489530832949244, 'learning_rate': 0.0005834382440750414, 'dropout': 0.48787589688314237, 'threshold': 0.7622802703643394} ,
{'n_hidden': 10, 'n_units': 19, 'kernel_regularizer': 4.8737081393919183e-05, 'learning_rate': 5.56666629147225e-05, 'dropout': 0.6376447344086704, 'threshold': 0.2292333004984978} ,
{'n_hidden': 1, 'n_units': 37, 'kernel_regularizer': 0.0005489530832949244, 'learning_rate': 0.0006521686023472134, 'dropout': 0.2862399358316623, 'threshold': 0.8171312422835556} ,
{'n_hidden': 7, 'n_units': 6, 'kernel_regularizer': 1.7151027836617935e-05, 'learning_rate': 0.0013414232848841775, 'dropout': 0.6061345542965496, 'threshold': 0.3544065415833555} ,
{'n_hidden': 9, 'n_units': 44, 'kernel_regularizer': 0.00032005478454991034, 'learning_rate': 8.446138571374139e-05, 'dropout': 0.8630165121896143, 'threshold': 0.52263525360727} ,
{'n_hidden': 1, 'n_units': 34, 'kernel_regularizer': 1.7151027836617935e-05, 'learning_rate': 0.0007970799285309584, 'dropout': 0.3395203701483684, 'threshold': 0.3544065415833555} ,
{'n_hidden': 9, 'n_units': 54, 'kernel_regularizer': 2.0678394943630145e-05, 'learning_rate': 6.444708842918975e-05, 'dropout': 0.48726918436401034, 'threshold': 0.7581930049210841} ,
{'n_hidden': 3, 'n_units': 44, 'kernel_regularizer': 0.0005489530832949244, 'learning_rate': 0.0005834382440750414, 'dropout': 0.8662788901341136, 'threshold': 0.3056244326515286} ,
{'n_hidden': 1, 'n_units': 19, 'kernel_regularizer': 4.8737081393919183e-05, 'learning_rate': 0.12198972576711176, 'dropout': 0.6376447344086704, 'threshold': 0.3318972015983154} ,
{'n_hidden': 1, 'n_units': 33, 'kernel_regularizer': 1.214740272740579e-05, 'learning_rate': 7.785232427941312e-05, 'dropout': 0.1697866286480649, 'threshold': 0.48870052508813955} ,
{'n_hidden': 1, 'n_units': 34, 'kernel_regularizer': 0.019189432566192953, 'learning_rate': 0.0007970799285309584, 'dropout': 0.29992301899310914, 'threshold': 0.25478675673102114} ,
{'n_hidden': 3, 'n_units': 18, 'kernel_regularizer': 0.00023198013313174801, 'learning_rate': 7.015653993460166e-05, 'dropout': 0.11174948366441749, 'threshold': 0.31665717730358856} ,
{'n_hidden': 1, 'n_units': 37, 'kernel_regularizer': 0.0005489530832949244, 'learning_rate': 0.0006521686023472134, 'dropout': 0.048525896101412724, 'threshold': 0.27265874826006264} ,
{'n_hidden': 1, 'n_units': 16, 'kernel_regularizer': 1.214740272740579e-05, 'learning_rate': 0.0007582681194411686, 'dropout': 0.37889332143468546, 'threshold': 0.3923850775943305} ,
{'n_hidden': 2, 'n_units': 11, 'kernel_regularizer': 0.007872536731619325, 'learning_rate': 0.00039626071240343367, 'dropout': 0.8568014240464407, 'threshold': 0.10873515186137227} ,
{'n_hidden': 3, 'n_units': 34, 'kernel_regularizer': 1.7151027836617935e-05, 'learning_rate': 0.0007970799285309584, 'dropout': 0.20896209008607705, 'threshold': 0.3544065415833555} ,
{'n_hidden': 1, 'n_units': 6, 'kernel_regularizer': 5.525232660110897e-05, 'learning_rate': 0.6699399471845631, 'dropout': 0.6061345542965496, 'threshold': 0.3318972015983154} ,
{'n_hidden': 3, 'n_units': 34, 'kernel_regularizer': 1.7151027836617935e-05, 'learning_rate': 7.785232427941312e-05, 'dropout': 0.20896209008607705, 'threshold': 0.48870052508813955} ,
{'n_hidden': 1, 'n_units': 34, 'kernel_regularizer': 0.0005069808946385528, 'learning_rate': 0.021296428950781383, 'dropout': 0.8662788901341136, 'threshold': 0.3544065415833555} ,
{'n_hidden': 9, 'n_units': 8, 'kernel_regularizer': 0.0004416315616331735, 'learning_rate': 0.00020655775452336643, 'dropout': 0.5039281376641517, 'threshold': 0.4195383350653711} ,
{'n_hidden': 3, 'n_units': 18, 'kernel_regularizer': 6.428253014937363e-05, 'learning_rate': 7.015653993460166e-05, 'dropout': 0.7860688156703391, 'threshold': 0.1325102969894017} ,
{'n_hidden': 9, 'n_units': 44, 'kernel_regularizer': 1.8148712421091602e-05, 'learning_rate': 0.002389600411589469, 'dropout': 0.48787589688314237, 'threshold': 0.3814482156016251} ,
{'n_hidden': 1, 'n_units': 25, 'kernel_regularizer': 4.8737081393919183e-05, 'learning_rate': 9.988480575396382e-05, 'dropout': 0.5499841382958144, 'threshold': 0.27265874826006264} ,
{'n_hidden': 1, 'n_units': 25, 'kernel_regularizer': 0.019189432566192953, 'learning_rate': 0.0007970799285309584, 'dropout': 0.29992301899310914, 'threshold': 0.25478675673102114} ,
{'n_hidden': 1, 'n_units': 64, 'kernel_regularizer': 0.0007793010643204281, 'learning_rate': 0.0003795051194324455, 'dropout': 0.7010918993652716, 'threshold': 0.3056244326515286} ,
{'n_hidden': 1, 'n_units': 44, 'kernel_regularizer': 0.0005489530832949244, 'learning_rate': 0.0007582681194411686, 'dropout': 0.37889332143468546, 'threshold': 0.1325102969894017} ,
{'n_hidden': 7, 'n_units': 25, 'kernel_regularizer': 4.8737081393919183e-05, 'learning_rate': 9.988480575396382e-05, 'dropout': 0.5499841382958144, 'threshold': 0.27265874826006264} ,
{'n_hidden': 1, 'n_units': 16, 'kernel_regularizer': 0.0005489530832949244, 'learning_rate': 0.0007582681194411686, 'dropout': 0.37889332143468546, 'threshold': 0.3923850775943305} ,
{'n_hidden': 3, 'n_units': 44, 'kernel_regularizer': 0.0005489530832949244, 'learning_rate': 5.3577811386515144e-05, 'dropout': 0.6571075663804247, 'threshold': 0.3056244326515286} ,
{'n_hidden': 7, 'n_units': 14, 'kernel_regularizer': 1.7151027836617935e-05, 'learning_rate': 0.0006955292277375732, 'dropout': 0.8959560597731027, 'threshold': 0.3544065415833555} ,
{'n_hidden': 3, 'n_units': 56, 'kernel_regularizer': 2.1928352979948724e-05, 'learning_rate': 0.0012602416730026099, 'dropout': 0.0686413637905968, 'threshold': 0.4195383350653711} ,
{'n_hidden': 1, 'n_units': 16, 'kernel_regularizer': 0.0017899630867169573, 'learning_rate': 0.0005834382440750414, 'dropout': 0.7860688156703391, 'threshold': 0.7347123009931876} ,
{'n_hidden': 1, 'n_units': 16, 'kernel_regularizer': 5.525232660110897e-05, 'learning_rate': 0.6699399471845631, 'dropout': 0.6061345542965496, 'threshold': 0.1325102969894017} ,
{'n_hidden': 5, 'n_units': 33, 'kernel_regularizer': 1.7151027836617935e-05, 'learning_rate': 0.001598284971321218, 'dropout': 0.20902702870023096, 'threshold': 0.3544065415833555} ,
{'n_hidden': 10, 'n_units': 44, 'kernel_regularizer': 0.00017468471331461992, 'learning_rate': 0.00012248524389842045, 'dropout': 0.3395203701483684, 'threshold': 0.8929033947032199} ,
{'n_hidden': 2, 'n_units': 42, 'kernel_regularizer': 0.007872536731619325, 'learning_rate': 4.9567167614517136e-05, 'dropout': 0.08891052102456992, 'threshold': 0.2292333004984978} ,
{'n_hidden': 2, 'n_units': 44, 'kernel_regularizer': 1.1874693702272719e-05, 'learning_rate': 0.0007970799285309584, 'dropout': 0.20896209008607705, 'threshold': 0.8929033947032199} ,
{'n_hidden': 1, 'n_units': 25, 'kernel_regularizer': 0.12254567223517063, 'learning_rate': 0.003763855754353628, 'dropout': 0.8662788901341136, 'threshold': 0.3544065415833555} ,
{'n_hidden': 1, 'n_units': 58, 'kernel_regularizer': 1.7151027836617935e-05, 'learning_rate': 0.0003795051194324455, 'dropout': 0.20896209008607705, 'threshold': 0.383494481615293} ,
{'n_hidden': 7, 'n_units': 14, 'kernel_regularizer': 4.8737081393919183e-05, 'learning_rate': 4.9567167614517136e-05, 'dropout': 0.545499225702559, 'threshold': 0.2292333004984978}]

adjusting_best = [{'n_hidden': 3, 'n_units': 34, 'kernel_regularizer': 1.7151027836617935e-05, 'learning_rate': 7.785232427941312e-05, 'dropout': 0.20896209008607705, 'threshold': 0.58},
    {'n_hidden': 3, 'n_units': 34, 'kernel_regularizer': 1.7151027836617935e-06, 'learning_rate': 7.785232427941312e-05, 'dropout': 0.20896209008607705, 'threshold': 0.58} ,
{'n_hidden': 3, 'n_units': 34, 'kernel_regularizer': 1.7151027836617935e-07, 'learning_rate': 7.785232427941312e-05, 'dropout': 0.20896209008607705, 'threshold': 0.58} ,
{'n_hidden': 3, 'n_units': 34, 'kernel_regularizer': 1.7151027836617935e-08, 'learning_rate': 7.785232427941312e-05, 'dropout': 0.20896209008607705, 'threshold': 0.58} ,
{'n_hidden': 3, 'n_units': 34, 'kernel_regularizer': 1.7151027836617935e-09, 'learning_rate': 7.785232427941312e-05, 'dropout': 0.20896209008607705, 'threshold': 0.58},
{'n_hidden': 3, 'n_units': 34, 'kernel_regularizer': 1.7151027836617935e-10, 'learning_rate': 7.785232427941312e-05, 'dropout': 0.20896209008607705, 'threshold': 0.58}
]

hyper_dict2 = {}


for index, h in enumerate(adjusting_best):
    h['activation'] = 'relu'
    print('current_para_set', h)
    hyper_dict2['set_' + str(index)] = test_models(n_hidden = h['n_hidden'],
                                                   n_units = h['n_units'],
                                                   activation = h['activation'],
                                                   kernel_regularizer = h['kernel_regularizer'],
                                                   learning_rate=h['learning_rate'],
                                                   dropout=h['dropout'],
                                                   threshold=h['threshold'],
                                                   )


    print(hyper_dict2['set_' + str(index)])

# selected
# {'n_hidden': 5, 'n_units': 33, 'kernel_regularizer': 1.7151027836617935e-05, 'learning_rate': 0.001598284971321218, 'dropout': 0.20902702870023096, 'threshold': 0.3544065415833555} ,

# set_18 mbs

current_para_set {'n_hidden': 3, 'n_units': 34, 'kernel_regularizer': 1.7151027836617935e-05, 'learning_rate': 7.785232427941312e-05, 'dropout': 0.20896209008607705, 'threshold': 0.58, 'activation': 'relu'}
{'train': {'loss': [0.18023273348808289, 0.2054823637008667, 0.1593029499053955, 0.1882626861333847, 0.17200042307376862, 0.18379145860671997, 0.17650265991687775, 0.16214197874069214, 0.20090630650520325, 0.1741788238286972], 'auc': [0.9895238876342773, 0.9814726114273071, 0.9923569560050964, 0.9848579168319702, 0.9895398616790771, 0.9850980043411255, 0.9871628880500793, 0.9904442429542542, 0.9867626428604126, 0.9893397092819214], 'prec': [0.9702127575874329, 0.949999988079071, 0.9714285731315613, 0.9666666388511658, 0.9628099203109741, 0.9539749026298523, 0.9708333611488342, 0.9435483813285828, 0.9502074718475342, 0.9629629850387573], 'acc': [0.9520000219345093, 0.9419999718666077, 0.972000002861023, 0.9580000042915344, 0.9580000042915344, 0.9440000057220459, 0.9620000123977661, 0

In [75]:
for key, value in hyper_dict2.items():
    print(key)
    for tt, value2 in value.items():
        print(tt)
        for metric, value3 in value2.items():
            print(metric, np.array(value3).mean(), '+-',sem(np.array(value3)))

set_0
train
loss 0.18028023838996887 +- 0.004744269054919387
auc 0.9876558721065521 +- 0.0010186824484891702
prec 0.9602644979953766 +- 0.0031894709254382606
acc 0.9541999995708466 +- 0.0030177264587789247
rec 0.9457142889499665 +- 0.004347356610199673
f1pos 0.9528881426194976 +- 0.003131360908950238
f1neg 0.9554368479658146 +- 0.0029151112008980852
test
loss 0.3377810657024384 +- 0.012021977738088352
auc 0.9414365589618683 +- 0.005732373852178217
prec 0.895765197277069 +- 0.011098424390781791
acc 0.8769999980926514 +- 0.006506408018812931
rec 0.8607843220233917 +- 0.00944890943340301
f1pos 0.8771939436774773 +- 0.006136925300779733
f1neg 0.8765700939458714 +- 0.007203247441562328
set_1
train
loss 0.17324524223804474 +- 0.008209747993733361
auc 0.9894917666912079 +- 0.0012522933638226788
prec 0.960617071390152 +- 0.002616009408631397
acc 0.9535999953746795 +- 0.001995553322881881
rec 0.9440816342830658 +- 0.0033904553966774406
f1pos 0.9522315757830476 +- 0.0020802958520988015
f1neg 0.9