In [19]:
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import LocallyLinearEmbedding
from xgboost import XGBClassifier, plot_importance
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

patient_id = 3

val_csv_path = "D:\\Faculdade\\TCC\\dados\\epilepsy_ecosystem\\extracted_features_val.csv"

df_val = pd.read_csv(val_csv_path)

if patient_id:
    train_csv_path = "D:\\Faculdade\\TCC\\dados\\epilepsy_ecosystem\\selected_features_Pat{}Train.csv".format(patient_id)
    df_val = df_val.loc[df_val['patient'] == patient_id]
else:
    train_csv_path = "D:\\Faculdade\\TCC\\dados\\epilepsy_ecosystem\\selected_features.csv"

df_train = pd.read_csv(train_csv_path)
df_train = df_train[df_train.columns[1:]]

df_val = df_val[df_train.columns] # ignoring index column

print(df_train, df_val)
df_train = df_train.drop(columns=['patient','segment_id'])


X_train = df_train.drop(columns=['class'])
y_train = df_train['class']


       ch_0_delta2_skewness  ch_0_peak_to_peak  ch_0_delta2_peak_to_peak  \
0                 -5.493382         289.909180                189.065430   
1                 -5.078931         236.936035                194.974365   
2                 -4.426151         365.978516                242.039062   
3                 -3.405505         268.961670                247.947266   
4                 -4.140038         278.965454                240.897705   
...                     ...                ...                       ...   
48775             -3.642526         355.907593                238.010742   
48776             -2.930309         532.954956                214.310303   
48777             -3.764823         355.907471                193.228516   
48778             -3.124127         610.971191                254.056885   
48779             -2.902852         797.955322                248.014404   

       ch_0_kurtosis   ch_0_rms  ch_0_delta_rms  ch_0_delta2_rms  \
0           0.74800

In [20]:
print(X_train)
print(y_train)

print(df_val)

       ch_0_delta2_skewness  ch_0_peak_to_peak  ch_0_delta2_peak_to_peak  \
0                 -5.493382         289.909180                189.065430   
1                 -5.078931         236.936035                194.974365   
2                 -4.426151         365.978516                242.039062   
3                 -3.405505         268.961670                247.947266   
4                 -4.140038         278.965454                240.897705   
...                     ...                ...                       ...   
48775             -3.642526         355.907593                238.010742   
48776             -2.930309         532.954956                214.310303   
48777             -3.764823         355.907471                193.228516   
48778             -3.124127         610.971191                254.056885   
48779             -2.902852         797.955322                248.014404   

       ch_0_kurtosis   ch_0_rms  ch_0_delta_rms  ch_0_delta2_rms  \
0           0.74800

# Applying PCA in order to remove potentially correlated features

In [21]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from joblib import dump

apply_PCA = False
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

if apply_PCA:
    pca = PCA(n_components = 0.99)
    X_train_final = pca.fit_transform(X_train_scaled)
    print("Remaining features : {}".format(X_train_final.shape[1]))
    print("Explained variance ratio : {}".format(sum(pca.explained_variance_ratio_)))
else:
    pca = None
    X_train_final = X_train_scaled
    

<h1> Training - Random Forest and ExtraTrees </h1>

In [22]:
# training
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

params = dict(
            n_estimators=150,
            class_weight='balanced',
            max_depth=10,
            bootstrap=True,
            oob_score=True,
            n_jobs=-1
        )

rdf = RandomForestClassifier(**params)
xt_trees = ExtraTreesClassifier(**params)

print('Fitting...')
rdf.fit(X_train_final, y_train)
xt_trees.fit(X_train_final, y_train)

print("OOB Score - Random Forest: {}".format(rdf.oob_score_))
print("OOB Score - Extra Trees: {}".format(xt_trees.oob_score_))






Fitting...
OOB Score - Random Forest: 0.866379663796638
OOB Score - Extra Trees: 0.7886223862238623


In [23]:
from evaluation import evaluate_from_df

evaluate_from_df(rdf, scaler, pca, df_val, agg_method='max_proba')
evaluate_from_df(xt_trees, scaler, pca, df_val, agg_method='max_proba')

evaluate_from_df(rdf, scaler, pca, df_val, agg_method='avg_proba')
evaluate_from_df(xt_trees, scaler, pca, df_val, agg_method='avg_proba')

[[338  27]
 [ 17  33]]
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       365
           1       0.55      0.66      0.60        50

    accuracy                           0.89       415
   macro avg       0.75      0.79      0.77       415
weighted avg       0.90      0.89      0.90       415

ROC AUC Score : 0.793013698630137
[[306  59]
 [ 14  36]]
              precision    recall  f1-score   support

           0       0.96      0.84      0.89       365
           1       0.38      0.72      0.50        50

    accuracy                           0.82       415
   macro avg       0.67      0.78      0.69       415
weighted avg       0.89      0.82      0.85       415

ROC AUC Score : 0.7791780821917808
[[337  28]
 [ 15  35]]
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       365
           1       0.56      0.70      0.62        50

    accuracy                           0.90 

# XGBoost

In [24]:
params = dict(
        class_weight='balanced',
        max_depth=10,
        subsample=0.5,
        n_jobs=-1
    )


xgb = XGBClassifier(**params)
xgb.fit(X_train_final, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', class_weight='balanced',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.5, verbosity=1)

In [25]:
evaluate_from_df(xgb, scaler, pca, df_val, agg_method='max_proba')
evaluate_from_df(xgb, scaler, pca, df_val, agg_method='avg_proba')

[[365   0]
 [ 30  20]]
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       365
           1       1.00      0.40      0.57        50

    accuracy                           0.93       415
   macro avg       0.96      0.70      0.77       415
weighted avg       0.93      0.93      0.91       415

ROC AUC Score : 0.7
[[365   0]
 [ 29  21]]
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       365
           1       1.00      0.42      0.59        50

    accuracy                           0.93       415
   macro avg       0.96      0.71      0.78       415
weighted avg       0.94      0.93      0.92       415

ROC AUC Score : 0.71


# Logistic Regression



In [26]:
lr_clf = LogisticRegression(class_weight='balanced')
lr_clf2 = LogisticRegression(class_weight={0: 0.2, 1:0.8})
lr_clf3 = LogisticRegression(class_weight={0: 0.1, 1:0.9})

lr_clf.fit(X_train_final, y_train)
lr_clf2.fit(X_train_final, y_train)
lr_clf3.fit(X_train_final, y_train)



LogisticRegression(C=1.0, class_weight={0: 0.1, 1: 0.9}, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
from evaluation import evaluate_from_df

evaluate_from_df(lr_clf, scaler, pca, df_val, agg_method='max_proba')
evaluate_from_df(lr_clf2, scaler, pca, df_val, agg_method='max_proba')
evaluate_from_df(lr_clf3, scaler, pca, df_val, agg_method='max_proba')

evaluate_from_df(lr_clf, scaler, pca, df_val, agg_method='avg_proba')
evaluate_from_df(lr_clf2, scaler, pca, df_val, agg_method='avg_proba')
evaluate_from_df(lr_clf3, scaler, pca, df_val, agg_method='avg_proba')

[[320  45]
 [  7  43]]
              precision    recall  f1-score   support

           0       0.98      0.88      0.92       365
           1       0.49      0.86      0.62        50

    accuracy                           0.87       415
   macro avg       0.73      0.87      0.77       415
weighted avg       0.92      0.87      0.89       415

ROC AUC Score : 0.8683561643835614
[[346  19]
 [ 14  36]]
              precision    recall  f1-score   support

           0       0.96      0.95      0.95       365
           1       0.65      0.72      0.69        50

    accuracy                           0.92       415
   macro avg       0.81      0.83      0.82       415
weighted avg       0.92      0.92      0.92       415

ROC AUC Score : 0.8339726027397261
[[313  52]
 [  5  45]]
              precision    recall  f1-score   support

           0       0.98      0.86      0.92       365
           1       0.46      0.90      0.61        50

    accuracy                           0.86