In [20]:
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import LocallyLinearEmbedding
from xgboost import XGBClassifier, plot_importance
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

patient_id = 1

val_csv_path = "D:\\Faculdade\\TCC\\dados\\epilepsy_ecosystem\\extracted_features_val.csv"

df_val = pd.read_csv(val_csv_path)

if patient_id:
    train_csv_path = "D:\\Faculdade\\TCC\\dados\\epilepsy_ecosystem\\selected_features_Pat{}Train.csv".format(patient_id)
    df_val = df_val.loc[df_val['patient'] == patient_id]
else:
    train_csv_path = "D:\\Faculdade\\TCC\\dados\\epilepsy_ecosystem\\selected_features.csv"

df_train = pd.read_csv(train_csv_path)
df_train = df_train[df_train.columns[1:]]

df_val = df_val[df_train.columns] # ignoring index column

print(df_train, df_val)
df_train = df_train.drop(columns=['patient','segment_id'])


X_train = df_train.drop(columns=['class'])
y_train = df_train['class']


       ch_0_skewness  ch_0_delta_skewness  ch_0_delta2_skewness  \
0          -0.623373            -0.227471             -2.042072   
1          -0.230622            -0.480491             -3.000496   
2          -0.529573            -0.093873             -3.163419   
3          -0.120593            -0.051202             -0.527124   
4          -0.343849            -0.196993             -0.712003   
...              ...                  ...                   ...   
14875      -0.018994             0.009518             -2.368109   
14876       0.002107             0.163481             -2.699552   
14877       0.104941            -0.073037             -2.366628   
14878      -0.086292            -0.060643             -3.405448   
14879      -0.236192            -0.103450             -2.366006   

       ch_0_peak_to_peak  ch_0_delta_peak_to_peak  ch_0_delta2_peak_to_peak  \
0             258.957642               124.947021                173.354980   
1             237.943115             

In [23]:
print(X_train)
print(y_train)

print(df_val)

       ch_0_skewness  ch_0_delta_skewness  ch_0_delta2_skewness  \
0          -0.623373            -0.227471             -2.042072   
1          -0.230622            -0.480491             -3.000496   
2          -0.529573            -0.093873             -3.163419   
3          -0.120593            -0.051202             -0.527124   
4          -0.343849            -0.196993             -0.712003   
...              ...                  ...                   ...   
14875      -0.018994             0.009518             -2.368109   
14876       0.002107             0.163481             -2.699552   
14877       0.104941            -0.073037             -2.366628   
14878      -0.086292            -0.060643             -3.405448   
14879      -0.236192            -0.103450             -2.366006   

       ch_0_peak_to_peak  ch_0_delta_peak_to_peak  ch_0_delta2_peak_to_peak  \
0             258.957642               124.947021                173.354980   
1             237.943115             

# Applying PCA in order to remove potentially correlated features

In [24]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from joblib import dump

apply_PCA = False
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

if apply_PCA:
    pca = PCA(n_components = 0.99)
    X_train_final = pca.fit_transform(X_train_scaled)
    print("Remaining features : {}".format(X_train_final.shape[1]))
    print("Explained variance ratio : {}".format(sum(pca.explained_variance_ratio_)))
else:
    pca = None
    X_train_final = X_train_scaled
    

<h1> Training - Random Forest and ExtraTrees </h1>

In [25]:
# training
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

params = dict(
            n_estimators=150,
            class_weight='balanced',
            max_depth=8,
            bootstrap=True,
            oob_score=True,
            n_jobs=-1
        )

rdf = RandomForestClassifier(**params)
xt_trees = ExtraTreesClassifier(**params)

print('Fitting...')
rdf.fit(X_train_final, y_train)
xt_trees.fit(X_train_final, y_train)

print("OOB Score - Random Forest: {}".format(rdf.oob_score_))
print("OOB Score - Extra Trees: {}".format(xt_trees.oob_score_))






Fitting...
OOB Score - Random Forest: 0.8460349462365592
OOB Score - Extra Trees: 0.7747983870967742


In [26]:
from evaluation import evaluate_from_df

evaluate_from_df(rdf, scaler, pca, df_val, agg_method='max_proba')
evaluate_from_df(xt_trees, scaler, pca, df_val, agg_method='max_proba')

evaluate_from_df(rdf, scaler, pca, df_val, agg_method='avg_proba')
evaluate_from_df(xt_trees, scaler, pca, df_val, agg_method='avg_proba')

[[62  9]
 [ 3 31]]
              precision    recall  f1-score   support

           0       0.95      0.87      0.91        71
           1       0.78      0.91      0.84        34

    accuracy                           0.89       105
   macro avg       0.86      0.89      0.87       105
weighted avg       0.90      0.89      0.89       105

ROC AUC Score : 0.8925020712510355
[[56 15]
 [ 5 29]]
              precision    recall  f1-score   support

           0       0.92      0.79      0.85        71
           1       0.66      0.85      0.74        34

    accuracy                           0.81       105
   macro avg       0.79      0.82      0.80       105
weighted avg       0.83      0.81      0.81       105

ROC AUC Score : 0.8208367854183928
[[60 11]
 [ 1 33]]
              precision    recall  f1-score   support

           0       0.98      0.85      0.91        71
           1       0.75      0.97      0.85        34

    accuracy                           0.89       105
 

# XGBoost

In [31]:
params = dict(
        class_weight='balanced',
        max_depth=9,
        subsample=0.5,
        n_jobs=-1
    )


xgb = XGBClassifier(**params)
xgb.fit(X_train_final, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', class_weight='balanced',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=9,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.5, verbosity=1)

In [33]:
evaluate_from_df(xgb, scaler, pca, df_val, agg_method='max_proba')
evaluate_from_df(xgb, scaler, pca, df_val, agg_method='avg_proba')

[[69  2]
 [ 6 28]]
              precision    recall  f1-score   support

           0       0.92      0.97      0.95        71
           1       0.93      0.82      0.87        34

    accuracy                           0.92       105
   macro avg       0.93      0.90      0.91       105
weighted avg       0.92      0.92      0.92       105

ROC AUC Score : 0.8976801988400994
[[69  2]
 [ 3 31]]
              precision    recall  f1-score   support

           0       0.96      0.97      0.97        71
           1       0.94      0.91      0.93        34

    accuracy                           0.95       105
   macro avg       0.95      0.94      0.95       105
weighted avg       0.95      0.95      0.95       105

ROC AUC Score : 0.941797845898923


# Logistic Regression



In [29]:
lr_clf = LogisticRegression(class_weight='balanced')
lr_clf2 = LogisticRegression(class_weight={0: 0.2, 1:0.8})
lr_clf3 = LogisticRegression(class_weight={0: 0.1, 1:0.9})

lr_clf.fit(X_train_final, y_train)
lr_clf2.fit(X_train_final, y_train)
lr_clf3.fit(X_train_final, y_train)



LogisticRegression(C=1.0, class_weight={0: 0.1, 1: 0.9}, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
from evaluation import evaluate_from_df

evaluate_from_df(lr_clf, scaler, pca, df_val, agg_method='max_proba')
evaluate_from_df(lr_clf2, scaler, pca, df_val, agg_method='max_proba')
evaluate_from_df(lr_clf3, scaler, pca, df_val, agg_method='max_proba')

evaluate_from_df(lr_clf, scaler, pca, df_val, agg_method='avg_proba')
evaluate_from_df(lr_clf2, scaler, pca, df_val, agg_method='avg_proba')
evaluate_from_df(lr_clf3, scaler, pca, df_val, agg_method='avg_proba')

[[64  7]
 [ 2 32]]
              precision    recall  f1-score   support

           0       0.97      0.90      0.93        71
           1       0.82      0.94      0.88        34

    accuracy                           0.91       105
   macro avg       0.90      0.92      0.91       105
weighted avg       0.92      0.91      0.92       105

ROC AUC Score : 0.9212924606462304
[[60 11]
 [ 2 32]]
              precision    recall  f1-score   support

           0       0.97      0.85      0.90        71
           1       0.74      0.94      0.83        34

    accuracy                           0.88       105
   macro avg       0.86      0.89      0.87       105
weighted avg       0.90      0.88      0.88       105

ROC AUC Score : 0.8931234465617233
[[56 15]
 [ 2 32]]
              precision    recall  f1-score   support

           0       0.97      0.79      0.87        71
           1       0.68      0.94      0.79        34

    accuracy                           0.84       105
 