# Next steps



# <font color="#8FBC8F">**Initialization**</font>


In [None]:
#@title <font color="\#8FBC8F">Google Drive mount

# Colab-only step: mount Google Drive at /content/drive. The data paths used by
# the loading cells below all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#@title <font color="\#8FBC8F">Imports
import os
import pdb
from datetime import datetime as dt

from tqdm import tqdm
from pprint import pprint as pp

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split as Split

print('[imports successfully loaded]')

[imports successfully loaded]


In [None]:

#@title <font color="\#8FBC8F">File loading utility code
# Root folder that holds one sub-folder per subject.
DATA_DIR = r'/content/drive/MyDrive/Colab Notebooks/Project Domino/new Macros/'

# Subject folders are recognised by the substring 'sub' in their name; sorting
# gives `subject_index` a stable meaning for a given folder content.
subject_list = sorted([f for f in os.listdir(DATA_DIR) if 'sub' in f])
if not subject_list:
    raise FileNotFoundError(f'No subject folders found in {DATA_DIR}')

subject_index = 0#@param 

# Name of the selected subject; later cells read it as `sub`.
sub = subject_list[subject_index]

# The empty last component keeps the trailing '/' on the folder path.
subject_path = os.path.join(DATA_DIR, sub, '')
subject_files = os.listdir(subject_path)

# Channel names are stored next to the subject's data as <sub>_channel_names.npy.
ch_name_file = os.path.join(subject_path, f'{sub}_channel_names.npy')
channel_names = np.load(ch_name_file)

print(f'[Working on {sub}]')

[Working on sub-015]



---

# <font color="#8FBC8F">**Loading Data**</font>





In [None]:
#@title <font color='darkgreen'>Unpickle X and y 
flm_path = '/content/drive/My Drive/Colab Notebooks/Project Domino/Feature-Label matrices/'
sub_folder_names = os.listdir(flm_path + sub + '/')
if not sub_folder_names:
    raise FileNotFoundError(f'No feature-label folders found for {sub} in {flm_path}')

# Each folder is named after a timestamp; pick the most recent one by comparing
# the parsed datetimes. The winner is kept as the original folder-name string,
# so a datetime is never compared against a str and the path below uses the
# name exactly as it exists on disk.
last_date = max(sub_folder_names,
                key=lambda name: dt.strptime(name, '%Y-%m-%d_%H:%M:%S'))

pickle_path = f'{flm_path}{sub}/{last_date}'

# NOTE: unpickling can execute arbitrary code - only load files you produced yourself.
_X = pd.read_pickle(pickle_path +'/X.pickle')
_y = pd.read_pickle(pickle_path +'/y.pickle')

# `__X` is the full feature matrix sliced per channel by later cells; `y` holds the labels.
__X, y = _X, _y

print('[Successfully loaded X and y]')


[Successfully loaded X and y]


In [None]:
#@title Function and dictionaries declaration

from sklearn.preprocessing import StandardScaler as NRM

from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import plot_confusion_matrix, confusion_matrix

from sklearn.metrics import classification_report as CR

# `seed` is a notebook-level variable; split_norm looks it up at call time when
# no explicit `random_state` is given (no `global` declaration is needed for that).

def split_norm(X, labels=None, random_state=None, test_size=0.2):
    """Split features/labels into train and test parts and standardise the features.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column labels are reused for the scaled frames.
    labels : optional
        Targets to split alongside ``X``. Defaults to the notebook-level ``y``.
    random_state : optional
        Seed forwarded to the splitter. When omitted, the notebook-level
        ``seed`` is used if it exists; otherwise the split is unseeded.
    test_size : float, default 0.2
        Fraction of the rows assigned to the test part.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The feature frames are scaled with statistics fitted on the training
        part only and keep the row index produced by the split, so they stay
        aligned with ``y_train`` / ``y_test``.
    """
    if labels is None:
        labels = y
    if random_state is None:
        # Fall back to the notebook-level seed without failing when it is not defined yet.
        random_state = globals().get('seed')

    X_train, X_test, y_train, y_test = Split(
        X, labels, test_size=test_size, random_state=random_state)

    # Fit the scaler on the training rows only, then apply it to both parts.
    nrm = NRM().fit(X_train)
    X_train = pd.DataFrame(nrm.transform(X_train), columns=X.columns, index=X_train.index)
    X_test = pd.DataFrame(nrm.transform(X_test), columns=X.columns, index=X_test.index)
    return X_train, X_test, y_train, y_test

def train_models(X_train, X_test, y_train, y_test, random_state=None):
    """Fit the four baseline classifiers on one training split.

    Parameters
    ----------
    X_train
        Training features handed to every classifier's ``fit``.
    X_test, y_test
        Not used here; accepted so that existing call sites keep working.
    y_train
        Training labels exposing ``to_numpy()``; flattened to a 1-D array.
    random_state : optional
        Seed forwarded to the classifiers that take one (LinearSVC,
        LogisticRegressionCV, GradientBoostingClassifier). ``None`` keeps the
        library default.

    Returns
    -------
    tuple
        ``(svc, knn, lr, gb)`` - the fitted estimators, in that order.
    """
    # Flatten the labels once instead of once per classifier.
    labels = y_train.to_numpy().ravel()

    svc = LinearSVC(random_state=random_state).fit(X_train, labels)
    knn = KNeighborsClassifier().fit(X_train, labels)
    lr = LogisticRegressionCV(random_state=random_state).fit(X_train, labels)
    gb = GradientBoostingClassifier(random_state=random_state).fit(X_train, labels)
    return svc, knn, lr, gb



def plot_conf_mxs(models, X_test, y_test, plot=False):
    """Compute (and optionally draw) the confusion matrix of every model.

    Parameters
    ----------
    models : iterable
        Fitted classifiers.
    X_test, y_test
        Held-out features and labels the matrices are computed on.
    plot : bool, default False
        When True each matrix is also drawn as a 3x3-inch figure titled with
        the classifier name.

    Returns
    -------
    list of tuple
        One ``(channel name, classifier name, confusion matrix)`` entry per
        model, in input order. The channel name is looked up from the
        notebook-level ``channel_names`` / ``channel_number``.
    """
    results = []
    for mdl in models:
        # The text before the first '(' of the estimator's string form is used as its name.
        mdl_name = str(mdl).split('(')[0]

        if plot:
            # NOTE(review): plot_confusion_matrix is no longer available in recent
            # scikit-learn releases - confirm the installed version still provides it.
            disp = plot_confusion_matrix(mdl, X_test, y_test)
            disp.figure_.set_size_inches([3,3])
            disp.ax_.set_title(mdl_name)
            cnf = disp.confusion_matrix
        else:
            cnf = confusion_matrix(y_test,mdl.predict(X_test))

        # Collect instead of returning here, so every model is processed
        # rather than only the first one.
        results.append((channel_names[channel_number], mdl_name, cnf))

    return results



def report_models(models, X_test, y_test, prnt=True):
    """Build a classification report per model and keep its macro average.

    Parameters
    ----------
    models : iterable
        Fitted classifiers exposing ``predict``.
    X_test, y_test
        Held-out features and labels the reports are computed on.
    prnt : bool, default True
        When True the full report dictionary of every model is printed
        between two rows of asterisks.

    Returns
    -------
    list of tuple
        One ``(channel name, classifier name, macro-average dict)`` entry per
        model; the channel name comes from the notebook-level
        ``channel_names`` / ``channel_number``.
    """
    banner = ' - classification report '

    def _describe(estimator):
        # Name = text before the first '(' of the estimator's string form.
        name = str(estimator).split('(')[0]
        full_report = CR(y_test, estimator.predict(X_test), output_dict=True)
        if prnt:
            header = name + banner + '*' * (60 - len(name)) + '\n'
            footer = '*' * (60 + len(banner)) + '\n'
            print(header)
            print(full_report)
            print(footer)
        return (channel_names[channel_number], name, full_report['macro avg'])

    return [_describe(estimator) for estimator in models]

# Determinism check

In [None]:
# select data for model creation
channel_number =  6#@param {type:'integer'}
# split_norm reads the notebook-level `seed`; define it here so this cell also
# runs on a fresh kernel, before the main processing loop has assigned one.
seed = 0#@param {type:'integer'}
X = __X.xs(key=channel_number, axis=1, level='Channel_Number')
X_train, X_test, y_train, y_test = split_norm(X)    

# Re-train the models several times on the SAME split: any difference between
# the printed reports then comes from the classifiers, not from the data.
for i in range(5):
    print(f'Run number {i+1} :')
    models = train_models(X_train, X_test, y_train, y_test)
    # plot_conf_mxs(models, X_test, y_test, plot=True)
    report_models(models, X_test, y_test, prnt=True)

# <font color='cyan'>Run analysis</font>

In [None]:
#@title <font color='cyan'>Main processing loop
N_RUNS_PER_MODEL =  20#@param {type:'integer'}

# One split seed per repetition, drawn once: every electrode is evaluated on
# the same sequence of seeds.
seeds = np.random.randint(0,10000,N_RUNS_PER_MODEL)

# Notebook-level result containers, keyed by (channel name, classifier name).
# Confusion-matrix collection is currently switched off, so that dict stays empty.
confusion_matrices = {}
classification_reports = {}

metric_names = ['precision', 'recall', 'f1-score']

# NOTE(review): range() stops one short of `__X.columns[-1][1]`; if that value
# is the highest channel number, that channel is never processed - confirm.
for channel_number in tqdm(range(__X.columns[-1][1]), desc='Electrode'):
    # Features belonging to the current electrode only.
    X = __X.xs(key=channel_number, axis=1, level='Channel_Number')

    # --- repeated train/test runs; split_norm picks up the notebook-level `seed` ---
    cm, cr = [], []
    for seed in seeds:
        X_train, X_test, y_train, y_test = split_norm(X)
        models = train_models(X_train, X_test, y_train, y_test)
        # cm.append(plot_conf_mxs(models, X_test, y_test, plot=False))
        cr.append(report_models(models, X_test, y_test, prnt=False))

    # --- group the macro-average dicts of all runs by (channel name, classifier name) ---
    reports_data = {}
    for run_reports in cr:
        for channel_name, model_name, macro_avg in run_reports:
            reports_data.setdefault((channel_name, model_name), []).append(macro_avg)

    # --- mean / std of each metric over the runs, plus the summed support ---
    for model_key, run_averages in reports_data.items():
        summary = {}
        for metric in metric_names:
            values = np.array([avg[metric] for avg in run_averages])
            summary[metric + '_mean'] = values.mean()
            summary[metric + '_std'] = values.std()
        summary['support'] = sum(avg['support'] for avg in run_averages)

        classification_reports[model_key] = summary
        # confusion_matrices[model_key] = cm


Electrode: 100%|██████████| 46/46 [56:06<00:00, 73.19s/it]


In [None]:
#@title
# classification_report

In [None]:
#@title
#-@title <font color='cyan'>Confusion matrices
# pp(confusion_matrices)


In [None]:
#@title <font color='cyan'>Classification reports
# Re-key every aggregated report as
# (channel name, classifier name, mean F1, F1 std) so that a report can be
# ranked from its key alone. Despite its name, the dict keeps the insertion
# order of `classification_reports`; nothing is sorted here.
sorted_classification_reports = {}
f1s, keys = [], []

for k, v in classification_reports.items():
    avg_f1_mean, avg_f1_std = v['f1-score_mean'], v['f1-score_std']
    keys.append(k)
    f1s.append(avg_f1_mean)
    sorted_classification_reports[k + (avg_f1_mean, avg_f1_std)] = v

# keys[np.argmax(f1s)]

In [None]:
#@title <font color='cyan'>Best 20 Models (Avg F1-Score)
import collections

# Keys are (channel name, classifier name, mean F1, F1 std).
# Order them by mean F1, best first, and keep the top 20.
_scr = sorted(
    sorted_classification_reports,
    key=lambda entry: entry[2],
    reverse=True,
)
_scr = _scr[:20]

scr = [tuple(entry[:4]) for entry in _scr]

for rank, (channel, classifier, f1_mean, f1_std) in enumerate(scr, start=1):
    print(f'{rank}.\nChannel name: {channel}\tClassifier Name: {classifier}')
    print(f'F1-Score Mean: {f1_mean:.3f}\tstd: {f1_std:.3f}')
    # print('Confusion Matrix:')
    # print(confusion_matrices[(channel, classifier)])
# pp(scr)

1.
Channel name: LHC_4-3	Classifier Name: GradientBoostingClassifier
F1-Score Mean: 0.590	std: 0.105
2.
Channel name: ROF_7-6	Classifier Name: LinearSVC
F1-Score Mean: 0.587	std: 0.114
3.
Channel name: ROF_7-6	Classifier Name: GradientBoostingClassifier
F1-Score Mean: 0.582	std: 0.108
4.
Channel name: ROF_6-5	Classifier Name: LinearSVC
F1-Score Mean: 0.581	std: 0.094
5.
Channel name: RASF_6-5	Classifier Name: LinearSVC
F1-Score Mean: 0.575	std: 0.102
6.
Channel name: RMF_3-2	Classifier Name: LinearSVC
F1-Score Mean: 0.572	std: 0.078
7.
Channel name: RHC_8-7	Classifier Name: LinearSVC
F1-Score Mean: 0.572	std: 0.094
8.
Channel name: ROF_8-7	Classifier Name: GradientBoostingClassifier
F1-Score Mean: 0.568	std: 0.104
9.
Channel name: ROPR_8-7	Classifier Name: GradientBoostingClassifier
F1-Score Mean: 0.567	std: 0.120
10.
Channel name: LHC_5-4	Classifier Name: LinearSVC
F1-Score Mean: 0.566	std: 0.084
11.
Channel name: ROF_6-5	Classifier Name: LogisticRegressionCV
F1-Score Mean: 0.562	std: