In [2]:
# data
import pandas as pd
import numpy as np
import csv

# misc
import os
import re
import time
import ast
import warnings
import math
import copy
import matplotlib.pyplot as plt

# ML
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn import metrics
import scikitplot as skplt

from sklearn.linear_model import LogisticRegressionCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost
from sklearn.neural_network import MLPClassifier
from keras.models import Sequential
from tensorflow.keras import layers

In [None]:
# Colab only: mount Google Drive, then make the dataset folder the working
# directory so later relative paths resolve against it.
from google.colab import drive

drive_mount_point = '/content/drive'
datasets_dir = '/content/drive/My Drive/Datasets'

drive.mount(drive_mount_point)
os.chdir(datasets_dir)

In [None]:
# Read the feature table from the current working directory.
df = pd.read_excel(io="df_final.xlsx")

In [None]:
# Restrict the frame to the columns used downstream, in a fixed order:
#   1. 'Headline' (not a feature for training) and 'Bias',
#   2. two columns without a "_context" counterpart,
#   3. the base feature columns,
#   4. the "<name>_context" twin of every base feature column.
id_cols = ['Headline', 'Bias']
standalone_cols = ['glove_vec300_norm', 'tf_idf']

lexicon_cols = [
    'negative', 'positive', 'bias_lex_h', 'bias_lex_r', 'assertives',
    'factives', 'report_verbs', 'implicatives', 'hedges',
]

category_cols = [
    'affect (Affect)', 'posemo (Positive Emotions)', 'negemo (Negative Emotions)',
    'anx (Anx)', 'anger (Anger)', 'sad (Sad)',
    'social (Social)', 'family (Family)', 'friend (Friends)',
    'female (Female)', 'male (Male)',
    'cogproc (Cognitive Processes)', 'insight (Insight)', 'cause (Causal)',
    'discrep (Discrepancies)', 'tentat (Tentative)', 'certain (Certainty)',
    'differ (Differentiation)',
    'percept (Perceptual Processes)', 'see (See)', 'hear (Hear)', 'feel (Feel)',
    'bio (Biological Processes)', 'body (Body)', 'health (Health)',
    'sexual (Sexual)', 'ingest (Ingest)',
    'drives (Drives)', 'affiliation (Affiliation)', 'achieve (Achievement)',
    'power (Power)', 'reward (Reward)', 'risk (Risk)',
    'focuspast (Past Focus)', 'focuspresent (Present Focus)',
    'focusfuture (Future Focus)',
    'relativ (Relativity)', 'motion (Motion)', 'space (Space)', 'time (Time)',
    'work (Work)', 'leisure (Leisure)', 'home (Home)', 'money (Money)',
    'relig (Religion)', 'death (Death)',
    'informal (Informal Language)', 'swear (Swear)', 'netspeak (Netspeak)',
    'assent (Assent)', 'nonflu (Nonfluencies)', 'filler (Filler Words)',
]

pos_cols = [
    'pos_ADJ', 'pos_ADP', 'pos_ADV', 'pos_AUX', 'pos_DET', 'pos_INTJ',
    'pos_NOUN', 'pos_PRON', 'pos_PROPN', 'pos_SCONJ', 'pos_VERB', 'pos_X',
]

dep_cols = [
    'dep_ROOT', 'dep_acl', 'dep_acomp', 'dep_advcl', 'dep_advmod', 'dep_agent',
    'dep_amod', 'dep_appos', 'dep_attr', 'dep_aux', 'dep_auxpass', 'dep_cc',
    'dep_ccomp', 'dep_compound', 'dep_conj', 'dep_csubj', 'dep_dative',
    'dep_dep', 'dep_det', 'dep_dobj', 'dep_expl', 'dep_intj', 'dep_mark',
    'dep_neg', 'dep_nmod', 'dep_npadvmod', 'dep_nsubj', 'dep_nsubjpass',
    'dep_nummod', 'dep_oprd', 'dep_parataxis', 'dep_pcomp', 'dep_pobj',
    'dep_poss', 'dep_preconj', 'dep_predet', 'dep_prep', 'dep_prt',
    'dep_punct', 'dep_quantmod', 'dep_relcl', 'dep_xcomp',
]

ne_cols = [
    'ne_CARDINAL', 'ne_DATE', 'ne_EVENT', 'ne_FAC', 'ne_GPE', 'ne_LANGUAGE',
    'ne_LAW', 'ne_LOC', 'ne_MONEY', 'ne_NORP', 'ne_ORDINAL', 'ne_ORG',
    'ne_PERCENT', 'ne_PERSON', 'ne_PRODUCT', 'ne_QUANTITY', 'ne_TIME',
    'ne_WORK_OF_ART',
]

base_cols = lexicon_cols + category_cols + pos_cols + dep_cols + ne_cols

# The context columns follow the base order, except that
# 'ne_LANGUAGE_context' sits at the very end of the selection.
context_cols = [name + '_context' for name in base_cols if name != 'ne_LANGUAGE']
context_cols.append('ne_LANGUAGE_context')

df = df[id_cols + standalone_cols + base_cols + context_cols]

In [None]:
# Binary target derived from the AllSides rating stored in `Bias`:
#   Left / Right -> 1, Center -> 0, anything else (including missing) -> -99.
# The original line used `==` (a comparison), so no "Label" column was ever
# created. `assign` returns a new frame, so re-running the cell is safe.
bias_to_label = {
    "AllSides Media Bias Rating: Left": 1,
    "AllSides Media Bias Rating: Right": 1,
    "AllSides Media Bias Rating: Center": 0,
}
df = df.assign(Label=df["Bias"].map(bias_to_label).fillna(-99).astype(int))

In [None]:
# Keep only rows whose Label is above -50 (this removes the -99 sentinel rows).
has_valid_label = df["Label"].gt(-50)
df = df.loc[has_valid_label]

In [None]:
# Feature matrix: everything except the target and the two string columns.
# 'Headline' is raw text (not a feature for training) and 'Bias' is the rating
# that 'Label' is derived from, so keeping it would leak the target.
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
non_feature_cols = ['Label', 'Headline', 'Bias']
x = df.drop(columns=non_feature_cols)

# Target kept as a one-column DataFrame because later cells index it
# with y['Label'].
y = df[['Label']]

In [None]:

# Work on an independent copy so `x` keeps the unscaled values.
x_scaled = x.copy(deep=True)

# Rescale the two listed columns to [0, 1]. The same scaler object is refit
# for each column, so after this cell it holds the fit for 'tf_idf'.
scaler = MinMaxScaler(feature_range=(0, 1))
for column_name in ('glove_vec300_norm', 'tf_idf'):
    x_scaled[column_name] = scaler.fit_transform(x_scaled[[column_name]])

In [None]:
# One 80/20 split applied to the raw features, the scaled features and the
# labels together. Passing all three to a single call uses the same row
# indices for each of them.
(
    train_features, test_features,
    train_features_sc, test_features_sc,
    train_labels, test_labels,
) = train_test_split(x, x_scaled, y, test_size=0.20, random_state=42)

In [None]:
# Show every scorer name accepted by `scoring=`.
# `metrics.SCORERS` was removed from recent scikit-learn releases in favour of
# `get_scorer_names()`; fall back to the old dict on versions that lack it.
if hasattr(metrics, "get_scorer_names"):
    scorer_names = sorted(metrics.get_scorer_names())
else:
    scorer_names = sorted(metrics.SCORERS.keys())
scorer_names

In [None]:
# Metrics requested from every cross_validate call below.
scoring = [
    'accuracy',
    'f1',
    'roc_auc',
    'precision',
    'recall',
]

In [None]:
# Class balance of the target, as percentages of all rows.
# The original printed both shares under the same "Biased words" label; the
# second line is the Label == 0 share and is now labelled accordingly.
n_rows = len(y)
n_biased = len(y[y['Label'] == 1])
n_non_biased = len(y[y['Label'] == 0])
print('Biased words:', round(n_biased / n_rows * 100, 0), '%')
print('Non-biased words:', round(n_non_biased / n_rows * 100, 0), '%')

In [None]:
# LogisticRegressionCV with class weights 10 (class 0) : 90 (class 1),
# evaluated with 10-fold cross-validation on the scaled features.
logreg_w = LogisticRegressionCV(class_weight={0: 10, 1: 90}, random_state=42)
scores_logreg_w = cross_validate(
    estimator=logreg_w,
    X=x_scaled,
    y=y,
    scoring=scoring,
    cv=10,
)

In [None]:
# Mean and twice the standard deviation across folds for each metric
# of the weighted logistic regression.
report_templates = {
    'test_f1': "F1: %0.2f (+/- %0.2f)",
    'test_precision': "Precision: %0.2f (+/- %0.2f)",
    'test_recall': "Recall: %0.2f (+/- %0.2f)",
    'test_roc_auc': "AUC: %0.2f (+/- %0.2f)",
    'test_accuracy': "Accuracy: %0.2f (+/- %0.2f)",
}
for score_key, template in report_templates.items():
    fold_scores = scores_logreg_w[score_key]
    print(template % (fold_scores.mean(), fold_scores.std() * 2))

In [None]:
# Fit on the scaled training split, predict the scaled test split, and print
# the confusion matrix transposed (rows = predicted, columns = true).
logreg_w.fit(train_features_sc, train_labels)
pred_logreg_w_cm = logreg_w.predict(test_features_sc)
pred_binary_logreg_w_cm = pred_logreg_w_cm.round()
cm_logreg_w = metrics.confusion_matrix(test_labels, pred_binary_logreg_w_cm)
print('Confusion matrix:\n', cm_logreg_w.T)

In [None]:
# Linear discriminant analysis with default settings,
# 10-fold cross-validation on the scaled features.
lda = LinearDiscriminantAnalysis()
scores_lda = cross_validate(
    estimator=lda,
    X=x_scaled,
    y=y,
    scoring=scoring,
    cv=10,
)

In [None]:
# Mean and twice the standard deviation across folds for each LDA metric.
report_templates = {
    'test_f1': "F1: %0.2f (+/- %0.2f)",
    'test_precision': "Precision: %0.2f (+/- %0.2f)",
    'test_recall': "Recall: %0.2f (+/- %0.2f)",
    'test_roc_auc': "AUC: %0.2f (+/- %0.2f)",
    'test_accuracy': "Accuracy: %0.2f (+/- %0.2f)",
}
for score_key, template in report_templates.items():
    fold_scores = scores_lda[score_key]
    print(template % (fold_scores.mean(), fold_scores.std() * 2))

In [None]:
# Hold-out confusion matrix for LDA, transposed (rows = predicted, columns = true).
lda.fit(train_features_sc, train_labels)
pred_lda_cm = lda.predict(test_features_sc)
pred_binary_lda_cm = pred_lda_cm.round()
cm_lda = metrics.confusion_matrix(test_labels, pred_binary_lda_cm)
print('Confusion matrix:\n', cm_lda.T)

In [None]:
# Quadratic discriminant analysis with default settings,
# 10-fold cross-validation on the scaled features.
qda = QuadraticDiscriminantAnalysis()
scores_qda = cross_validate(
    estimator=qda,
    X=x_scaled,
    y=y,
    scoring=scoring,
    cv=10,
)

In [None]:
# Mean and twice the standard deviation across folds for each QDA metric.
report_templates = {
    'test_f1': "F1: %0.2f (+/- %0.2f)",
    'test_precision': "Precision: %0.2f (+/- %0.2f)",
    'test_recall': "Recall: %0.2f (+/- %0.2f)",
    'test_roc_auc': "AUC: %0.2f (+/- %0.2f)",
    'test_accuracy': "Accuracy: %0.2f (+/- %0.2f)",
}
for score_key, template in report_templates.items():
    fold_scores = scores_qda[score_key]
    print(template % (fold_scores.mean(), fold_scores.std() * 2))

In [None]:
# Hold-out confusion matrix for QDA, transposed (rows = predicted, columns = true).
qda.fit(train_features_sc, train_labels)
pred_qda_cm = qda.predict(test_features_sc)
pred_binary_qda_cm = pred_qda_cm.round()
cm_qda = metrics.confusion_matrix(test_labels, pred_binary_qda_cm)
print('Confusion matrix:\n', cm_qda.T)

In [None]:
# Gaussian naive Bayes with default settings,
# 10-fold cross-validation on the scaled features.
nb = GaussianNB()
scores_nb = cross_validate(
    estimator=nb,
    X=x_scaled,
    y=y,
    scoring=scoring,
    cv=10,
)

In [None]:
# Mean and twice the standard deviation across folds for each GaussianNB metric.
report_templates = {
    'test_f1': "F1: %0.2f (+/- %0.2f)",
    'test_precision': "Precision: %0.2f (+/- %0.2f)",
    'test_recall': "Recall: %0.2f (+/- %0.2f)",
    'test_roc_auc': "AUC: %0.2f (+/- %0.2f)",
    'test_accuracy': "Accuracy: %0.2f (+/- %0.2f)",
}
for score_key, template in report_templates.items():
    fold_scores = scores_nb[score_key]
    print(template % (fold_scores.mean(), fold_scores.std() * 2))

In [None]:
# Hold-out confusion matrix for GaussianNB, transposed (rows = predicted, columns = true).
nb.fit(train_features_sc, train_labels)
pred_nb_cm = nb.predict(test_features_sc)
pred_binary_nb_cm = pred_nb_cm.round()
cm_nb = metrics.confusion_matrix(test_labels, pred_binary_nb_cm)
print('Confusion matrix:\n', cm_nb.T)

In [None]:
# Complement naive Bayes with default settings,
# 10-fold cross-validation on the scaled features.
nb_imb = ComplementNB()
scores_nb_imb = cross_validate(
    estimator=nb_imb,
    X=x_scaled,
    y=y,
    scoring=scoring,
    cv=10,
)

In [None]:
# Mean and twice the standard deviation across folds for each ComplementNB metric.
report_templates = {
    'test_f1': "F1: %0.2f (+/- %0.2f)",
    'test_precision': "Precision: %0.2f (+/- %0.2f)",
    'test_recall': "Recall: %0.2f (+/- %0.2f)",
    'test_roc_auc': "AUC: %0.2f (+/- %0.2f)",
    'test_accuracy': "Accuracy: %0.2f (+/- %0.2f)",
}
for score_key, template in report_templates.items():
    fold_scores = scores_nb_imb[score_key]
    print(template % (fold_scores.mean(), fold_scores.std() * 2))

In [None]:
# Hold-out confusion matrix for ComplementNB, transposed (rows = predicted, columns = true).
nb_imb.fit(train_features_sc, train_labels)
pred_nb_imb_cm = nb_imb.predict(test_features_sc)
pred_binary_nb_imb_cm = pred_nb_imb_cm.round()
cm_nb_imb = metrics.confusion_matrix(test_labels, pred_binary_nb_imb_cm)
print('Confusion matrix:\n', cm_nb_imb.T)

In [None]:
# k-nearest-neighbours classifier with default settings,
# 10-fold cross-validation on the scaled features.
knn = KNeighborsClassifier()
scores_knn = cross_validate(
    estimator=knn,
    X=x_scaled,
    y=y,
    scoring=scoring,
    cv=10,
)

In [None]:
# Mean and twice the standard deviation across folds for each KNN metric.
report_templates = {
    'test_f1': "F1: %0.2f (+/- %0.2f)",
    'test_precision': "Precision: %0.2f (+/- %0.2f)",
    'test_recall': "Recall: %0.2f (+/- %0.2f)",
    'test_roc_auc': "AUC: %0.2f (+/- %0.2f)",
    'test_accuracy': "Accuracy: %0.2f (+/- %0.2f)",
}
for score_key, template in report_templates.items():
    fold_scores = scores_knn[score_key]
    print(template % (fold_scores.mean(), fold_scores.std() * 2))

In [None]:
# Hold-out confusion matrix for KNN, transposed (rows = predicted, columns = true).
knn.fit(train_features_sc, train_labels)
pred_knn_cm = knn.predict(test_features_sc)
pred_binary_knn_cm = pred_knn_cm.round()
cm_knn = metrics.confusion_matrix(test_labels, pred_binary_knn_cm)
print('Confusion matrix:\n', cm_knn.T)

In [None]:
# Unweighted decision tree, 10-fold cross-validation on the unscaled features.
dt = DecisionTreeClassifier(random_state=42)
scores_dt = cross_validate(
    estimator=dt,
    X=x,
    y=y,
    scoring=scoring,
    cv=10,
)

In [None]:
# Mean and twice the standard deviation across folds for each decision-tree metric.
report_templates = {
    'test_f1': "F1: %0.2f (+/- %0.2f)",
    'test_precision': "Precision: %0.2f (+/- %0.2f)",
    'test_recall': "Recall: %0.2f (+/- %0.2f)",
    'test_roc_auc': "AUC: %0.2f (+/- %0.2f)",
    'test_accuracy': "Accuracy: %0.2f (+/- %0.2f)",
}
for score_key, template in report_templates.items():
    fold_scores = scores_dt[score_key]
    print(template % (fold_scores.mean(), fold_scores.std() * 2))

In [None]:
# Hold-out confusion matrix for the decision tree on the unscaled split,
# transposed (rows = predicted, columns = true).
dt.fit(train_features, train_labels)
pred_dt_cm = dt.predict(test_features)
pred_binary_dt_cm = pred_dt_cm.round()
cm_dt = metrics.confusion_matrix(test_labels, pred_binary_dt_cm)
print('Confusion matrix:\n', cm_dt.T)

In [None]:
# Decision tree with class weights 10 (class 0) : 90 (class 1),
# 10-fold cross-validation on the unscaled features.
dt_w = DecisionTreeClassifier(class_weight={0: 10, 1: 90}, random_state=42)
scores_dt_w = cross_validate(
    estimator=dt_w,
    X=x,
    y=y,
    scoring=scoring,
    cv=10,
)

In [None]:
# Mean and twice the standard deviation across folds for each metric
# of the weighted decision tree.
report_templates = {
    'test_f1': "F1: %0.2f (+/- %0.2f)",
    'test_precision': "Precision: %0.2f (+/- %0.2f)",
    'test_recall': "Recall: %0.2f (+/- %0.2f)",
    'test_roc_auc': "AUC: %0.2f (+/- %0.2f)",
    'test_accuracy': "Accuracy: %0.2f (+/- %0.2f)",
}
for score_key, template in report_templates.items():
    fold_scores = scores_dt_w[score_key]
    print(template % (fold_scores.mean(), fold_scores.std() * 2))

In [None]:
# Hold-out confusion matrix for the weighted decision tree,
# transposed (rows = predicted, columns = true).
dt_w.fit(train_features, train_labels)
pred_dt_w_cm = dt_w.predict(test_features)
pred_binary_dt_w_cm = pred_dt_w_cm.round()
cm_dt_w = metrics.confusion_matrix(test_labels, pred_binary_dt_w_cm)
print('Confusion matrix:\n', cm_dt_w.T)

In [None]:
# Unweighted random forest (100 trees), 10-fold cross-validation on the
# unscaled features.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
scores_rf = cross_validate(
    estimator=rf,
    X=x,
    y=y,
    scoring=scoring,
    cv=10,
)

In [None]:
# Mean and twice the standard deviation across folds for each random-forest metric.
report_templates = {
    'test_f1': "F1: %0.2f (+/- %0.2f)",
    'test_precision': "Precision: %0.2f (+/- %0.2f)",
    'test_recall': "Recall: %0.2f (+/- %0.2f)",
    'test_roc_auc': "AUC: %0.2f (+/- %0.2f)",
    'test_accuracy': "Accuracy: %0.2f (+/- %0.2f)",
}
for score_key, template in report_templates.items():
    fold_scores = scores_rf[score_key]
    print(template % (fold_scores.mean(), fold_scores.std() * 2))

In [None]:
# Hold-out confusion matrix for the random forest,
# transposed (rows = predicted, columns = true).
rf.fit(train_features, train_labels)
pred_rf_cm = rf.predict(test_features)
pred_binary_rf_cm = pred_rf_cm.round()
cm_rf = metrics.confusion_matrix(test_labels, pred_binary_rf_cm)
print('Confusion matrix:\n', cm_rf.T)

In [None]:
# Random forest (100 trees) with class weights 10 (class 0) : 90 (class 1),
# 10-fold cross-validation on the unscaled features.
rf_w = RandomForestClassifier(
    n_estimators=100,
    class_weight={0: 10, 1: 90},
    random_state=42,
)
scores_rf_w = cross_validate(
    estimator=rf_w,
    X=x,
    y=y,
    scoring=scoring,
    cv=10,
)

In [None]:
# Mean and twice the standard deviation across folds for each metric
# of the weighted random forest.
report_templates = {
    'test_f1': "F1: %0.2f (+/- %0.2f)",
    'test_precision': "Precision: %0.2f (+/- %0.2f)",
    'test_recall': "Recall: %0.2f (+/- %0.2f)",
    'test_roc_auc': "AUC: %0.2f (+/- %0.2f)",
    'test_accuracy': "Accuracy: %0.2f (+/- %0.2f)",
}
for score_key, template in report_templates.items():
    fold_scores = scores_rf_w[score_key]
    print(template % (fold_scores.mean(), fold_scores.std() * 2))

In [None]:
# Hold-out confusion matrix for the weighted random forest,
# transposed (rows = predicted, columns = true).
rf_w.fit(train_features, train_labels)
pred_rf_w_cm = rf_w.predict(test_features)
pred_binary_rf_w_cm = pred_rf_w_cm.round()
cm_rf_w = metrics.confusion_matrix(test_labels, pred_binary_rf_w_cm)
print('Confusion matrix:\n', cm_rf_w.T)

In [None]:
# Unweighted linear-kernel SVM, 10-fold cross-validation on the scaled features.
svm = SVC(kernel='linear', random_state=42)
scores_svm = cross_validate(
    estimator=svm,
    X=x_scaled,
    y=y,
    scoring=scoring,
    cv=10,
)

In [None]:
# Mean and twice the standard deviation across folds for each linear-SVM metric.
report_templates = {
    'test_f1': "F1: %0.2f (+/- %0.2f)",
    'test_precision': "Precision: %0.2f (+/- %0.2f)",
    'test_recall': "Recall: %0.2f (+/- %0.2f)",
    'test_roc_auc': "AUC: %0.2f (+/- %0.2f)",
    'test_accuracy': "Accuracy: %0.2f (+/- %0.2f)",
}
for score_key, template in report_templates.items():
    fold_scores = scores_svm[score_key]
    print(template % (fold_scores.mean(), fold_scores.std() * 2))

In [None]:
# Hold-out confusion matrix for the linear SVM,
# transposed (rows = predicted, columns = true).
svm.fit(train_features_sc, train_labels)
pred_svm_cm = svm.predict(test_features_sc)
pred_binary_svm_cm = pred_svm_cm.round()
cm_svm = metrics.confusion_matrix(test_labels, pred_binary_svm_cm)
print('Confusion matrix:\n', cm_svm.T)

In [None]:
# Class-weighted linear-kernel SVM, cross-validated on the scaled features.
# This model uses 5 folds (not 10 like the other models); the fold count is
# held in one variable so the timing message cannot disagree with the code
# (the original message claimed "10-fold CV").
n_folds_svm_w = 5

start_time = time.time()
svm_w = SVC(kernel='linear', random_state=42, class_weight={0: 10, 1: 90})
scores_svm_w = cross_validate(svm_w, x_scaled, y, cv=n_folds_svm_w, scoring=scoring)
end_time = time.time()
print('Time to train weighted linear SVM on all features, %d-fold CV:' % n_folds_svm_w,
      round((end_time - start_time), 2), 'seconds')

In [None]:
# Mean and twice the standard deviation across folds for each metric
# currently stored in scores_svm_w.
report_templates = {
    'test_f1': "F1: %0.2f (+/- %0.2f)",
    'test_precision': "Precision: %0.2f (+/- %0.2f)",
    'test_recall': "Recall: %0.2f (+/- %0.2f)",
    'test_roc_auc': "AUC: %0.2f (+/- %0.2f)",
    'test_accuracy': "Accuracy: %0.2f (+/- %0.2f)",
}
for score_key, template in report_templates.items():
    fold_scores = scores_svm_w[score_key]
    print(template % (fold_scores.mean(), fold_scores.std() * 2))

In [None]:
# Single hold-out fit of the current `svm_w`: train on the scaled training
# split, predict the scaled test split, and time only those two steps.
# The original message described this as "10-fold CV", which it is not; the
# kernel name is read from the estimator so the message matches whichever
# `svm_w` is live in the kernel.
start_time = time.time()
pred_svm_w_cm = svm_w.fit(train_features_sc, train_labels).predict(test_features_sc)
end_time = time.time()

pred_binary_svm_w_cm = pred_svm_w_cm.round()
# Transposed so rows are predicted labels and columns are true labels.
print('Confusion matrix:\n', metrics.confusion_matrix(test_labels, pred_binary_svm_w_cm).transpose())
print('Time to fit weighted %s SVM on the training split and predict the test split:' % svm_w.kernel,
      round((end_time - start_time), 2), 'seconds')

In [None]:
# Class-weighted SVM with SVC's default kernel (no `kernel=` argument),
# cross-validated with 5 folds on the scaled features.
# NOTE: this rebinds `svm_w` and `scores_svm_w`, replacing the linear-kernel
# model and scores from the earlier cells.
# The original message said "linear SVM ... 10-fold CV"; both parts were wrong
# for this cell, so kernel and fold count are now taken from the code itself.
n_folds_svm_w = 5

start_time = time.time()
svm_w = SVC(random_state=42, class_weight={0: 10, 1: 90})
scores_svm_w = cross_validate(svm_w, x_scaled, y, cv=n_folds_svm_w, scoring=scoring)
end_time = time.time()
print('Time to train weighted %s SVM on all features, %d-fold CV:' % (svm_w.kernel, n_folds_svm_w),
      round((end_time - start_time), 2), 'seconds')

In [None]:
# Mean and twice the standard deviation across folds for each metric
# currently stored in scores_svm_w.
report_templates = {
    'test_f1': "F1: %0.2f (+/- %0.2f)",
    'test_precision': "Precision: %0.2f (+/- %0.2f)",
    'test_recall': "Recall: %0.2f (+/- %0.2f)",
    'test_roc_auc': "AUC: %0.2f (+/- %0.2f)",
    'test_accuracy': "Accuracy: %0.2f (+/- %0.2f)",
}
for score_key, template in report_templates.items():
    fold_scores = scores_svm_w[score_key]
    print(template % (fold_scores.mean(), fold_scores.std() * 2))

In [None]:
# Single hold-out fit of the current `svm_w`: train on the scaled training
# split, predict the scaled test split, and time only those two steps.
# The original message said "linear SVM ... 10-fold CV", but this is one
# fit/predict and `svm_w` may no longer be the linear model; the kernel name
# is therefore read from the estimator.
start_time = time.time()
pred_svm_w_cm = svm_w.fit(train_features_sc, train_labels).predict(test_features_sc)
end_time = time.time()

pred_binary_svm_w_cm = pred_svm_w_cm.round()
# Transposed so rows are predicted labels and columns are true labels.
print('Confusion matrix:\n', metrics.confusion_matrix(test_labels, pred_binary_svm_w_cm).transpose())
print('Time to fit weighted %s SVM on the training split and predict the test split:' % svm_w.kernel,
      round((end_time - start_time), 2), 'seconds')

In [None]:
# Unweighted XGBoost classifier, 10-fold cross-validation on the unscaled features.
xgb = xgboost.XGBClassifier(random_state=42, objective="binary:logistic")
scores_xgb = cross_validate(
    estimator=xgb,
    X=x,
    y=y,
    scoring=scoring,
    cv=10,
)

In [None]:
# Mean and twice the standard deviation across folds for each XGBoost metric.
report_templates = {
    'test_f1': "F1: %0.2f (+/- %0.2f)",
    'test_precision': "Precision: %0.2f (+/- %0.2f)",
    'test_recall': "Recall: %0.2f (+/- %0.2f)",
    'test_roc_auc': "AUC: %0.2f (+/- %0.2f)",
    'test_accuracy': "Accuracy: %0.2f (+/- %0.2f)",
}
for score_key, template in report_templates.items():
    fold_scores = scores_xgb[score_key]
    print(template % (fold_scores.mean(), fold_scores.std() * 2))

In [None]:
# Hold-out confusion matrix for XGBoost,
# transposed (rows = predicted, columns = true).
xgb.fit(train_features, train_labels)
pred_xgb_cm = xgb.predict(test_features)
pred_binary_xgb_cm = pred_xgb_cm.round()
cm_xgb = metrics.confusion_matrix(test_labels, pred_binary_xgb_cm)
print('Confusion matrix:\n', cm_xgb.T)

In [None]:

# Ratio of negative (Label == 0) to positive (Label == 1) rows, passed to
# XGBoost as `scale_pos_weight`.
# The original numerator indexed y['label4'], a column that does not exist in
# `y` (its only column is 'Label'), so the cell raised KeyError.
n_negative = len(y[y['Label'] == 0])
n_positive = len(y[y['Label'] == 1])
scale_pos_weight = n_negative / n_positive
scale_pos_weight

In [None]:
# XGBoost with the positive class re-weighted by `scale_pos_weight`,
# 10-fold cross-validation on the unscaled features.
xgb_w = xgboost.XGBClassifier(
    objective="binary:logistic",
    random_state=42,
    scale_pos_weight=scale_pos_weight,
)
scores_xgb_w = cross_validate(
    estimator=xgb_w,
    X=x,
    y=y,
    scoring=scoring,
    cv=10,
)

In [None]:

# Mean and twice the standard deviation across folds for each metric
# of the weighted XGBoost model.
report_templates = {
    'test_f1': "F1: %0.2f (+/- %0.2f)",
    'test_precision': "Precision: %0.2f (+/- %0.2f)",
    'test_recall': "Recall: %0.2f (+/- %0.2f)",
    'test_roc_auc': "AUC: %0.2f (+/- %0.2f)",
    'test_accuracy': "Accuracy: %0.2f (+/- %0.2f)",
}
for score_key, template in report_templates.items():
    fold_scores = scores_xgb_w[score_key]
    print(template % (fold_scores.mean(), fold_scores.std() * 2))

In [None]:
# Hold-out confusion matrix for the weighted XGBoost model,
# transposed (rows = predicted, columns = true).
xgb_w.fit(train_features, train_labels)
pred_xgb_w_cm = xgb_w.predict(test_features)
pred_binary_xgb_w_cm = pred_xgb_w_cm.round()
cm_xgb_w = metrics.confusion_matrix(test_labels, pred_binary_xgb_w_cm)
print('Confusion matrix:\n', cm_xgb_w.T)

In [None]:
# scikit-learn MLP with default architecture, 10-fold cross-validation on the
# scaled features.
# The original passed `y4`, a name never defined in this notebook; the target
# used by every other model is `y`.
mlp = MLPClassifier(random_state=42)
scores_mlp = cross_validate(mlp, x_scaled, y, cv=10, scoring=scoring)

In [None]:
# Mean and twice the standard deviation across folds for each MLP metric.
report_templates = {
    'test_f1': "F1: %0.2f (+/- %0.2f)",
    'test_precision': "Precision: %0.2f (+/- %0.2f)",
    'test_recall': "Recall: %0.2f (+/- %0.2f)",
    'test_roc_auc': "AUC: %0.2f (+/- %0.2f)",
    'test_accuracy': "Accuracy: %0.2f (+/- %0.2f)",
}
for score_key, template in report_templates.items():
    fold_scores = scores_mlp[score_key]
    print(template % (fold_scores.mean(), fold_scores.std() * 2))

In [None]:
# Hold-out confusion matrix for the MLP,
# transposed (rows = predicted, columns = true).
mlp.fit(train_features_sc, train_labels)
pred_mlp_cm = mlp.predict(test_features_sc)
pred_binary_mlp_cm = pred_mlp_cm.round()
cm_mlp = metrics.confusion_matrix(test_labels, pred_binary_mlp_cm)
print('Confusion matrix:\n', cm_mlp.T)

In [None]:
def define_model(n_input):
    """Build and compile a small Keras binary classifier.

    Architecture: one hidden Dense layer (100 units, ReLU) fed by `n_input`
    features, followed by a single sigmoid output unit.

    Parameters
    ----------
    n_input : int
        Number of input features (passed as `input_dim` of the first layer).

    Returns
    -------
    A compiled `Sequential` model using binary cross-entropy loss, the Adam
    optimizer, and the AUC / Precision / Recall training metrics.
    """
    network = Sequential([
        # hidden layer; `input_dim` also declares the input width
        layers.Dense(100, input_dim=n_input, activation='relu'),
        # single-probability output
        layers.Dense(1, activation='sigmoid'),
    ])
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['AUC', 'Precision', 'Recall'],
    )
    return network

In [1]:
# K-fold cross-validation of the class-weighted Keras MLP.
#
# Changes against the original cell:
#   * the input width comes from the data (`X_train.shape[1]`) instead of the
#     undefined name `feature_list`;
#   * `num_folds` is actually passed to KFold instead of a second literal 10;
#   * AUC is computed from the predicted probabilities rather than from the
#     rounded 0/1 predictions, matching how the 'roc_auc' scorer ranks the
#     other models;
#   * labels and predictions are flattened to 1-D before scoring.
# NOTE: this cell needs the import cell to have run first (it uses `time`,
# `KFold` and `metrics`).
start_time = time.time()

num_folds = 10
F1_per_fold = []
precision_per_fold = []
recall_per_fold = []
AUC_per_fold = []

kf = KFold(n_splits=num_folds, random_state=42, shuffle=True)

x_scaled_np = x_scaled.to_numpy()
y_np = y.to_numpy().ravel()

for fold_no, (train_index, test_index) in enumerate(kf.split(x_scaled_np), start=1):
    X_train, X_test = x_scaled_np[train_index], x_scaled_np[test_index]
    y_train, y_test = y_np[train_index], y_np[test_index]

    print(f'Training for fold {fold_no} ...')

    # Fresh, untrained network for every fold.
    mlp_w = define_model(X_train.shape[1])

    # Class 1 is weighted 90 vs. 10 for class 0, as in the other weighted models.
    mlp_w.fit(X_train, y_train, epochs=200, class_weight={0: 10, 1: 90})

    # Sigmoid output = predicted probability of class 1; round for hard labels.
    mlp_w_pred = mlp_w.predict(X_test).ravel()
    mlp_w_pred_bin = mlp_w_pred.round()

    F1_per_fold.append(metrics.f1_score(y_test, mlp_w_pred_bin, pos_label=1))
    precision_per_fold.append(metrics.precision_score(y_test, mlp_w_pred_bin, pos_label=1))
    recall_per_fold.append(metrics.recall_score(y_test, mlp_w_pred_bin, pos_label=1))
    AUC_per_fold.append(metrics.roc_auc_score(y_test, mlp_w_pred))
    # Transposed so rows are predicted labels and columns are true labels.
    print('CF:', metrics.confusion_matrix(y_test, mlp_w_pred_bin).transpose())

end_time = time.time()
print('Time to train weighted MLP on all features, %d-fold CV:' % num_folds,
      round((end_time - start_time), 2), 'seconds')


NameError: name 'time' is not defined

In [None]:
# Mean and twice the population standard deviation of each per-fold metric of
# the weighted Keras MLP.
# The original used `statistics.mean` / `statistics.pstdev`, but `statistics`
# is never imported in this notebook. numpy is imported, and `np.std` with its
# default ddof=0 is the same population standard deviation.
per_fold_metrics = (
    ("F1", F1_per_fold),
    ("Precision", precision_per_fold),
    ("Recall", recall_per_fold),
    ("AUC", AUC_per_fold),
)
for metric_label, fold_values in per_fold_metrics:
    print("%s: %0.2f (+/- %0.2f)" % (metric_label, np.mean(fold_values), np.std(fold_values) * 2))