# <font color = "\#8FBC8F">**Initialization**


In [2]:
#@title <font color="\#8FBC8F">Google Drive mount
# Mount the user's Google Drive inside the Colab VM; every data path used by
# the following cells lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
#@title <font color="\#8FBC8F">Imports
import os
import pdb
from datetime import datetime as dt

from tqdm import tqdm
from pprint import pprint as pp

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import csv
import pickle

import warnings
warnings.filterwarnings("ignore")

print('[imports successfully loaded]')

[imports successfully loaded]


In [4]:
#@title <font color="\#8FBC8F">ML Imports 
# from sklearn.preprocessing import StandardScaler as NRM

from sklearn.preprocessing import StandardScaler as NRM

from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb


from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, roc_curve, roc_auc_score, f1_score, precision_recall_curve
from sklearn.metrics import classification_report as CR

# from scipy.stats import uniform, randint

# <font color=cyan>Subject number

In [5]:

#@title <font color="\#8FBC8F">File loading utility code
# Root folder holding one sub-folder per subject.
DATA_DIR = r'/content/drive/MyDrive/Colab Notebooks/Project Domino/new Macros/'
# Every directory entry whose name contains 'sub', in sorted order, so that
# `subject_index` refers to the same entry on every run.
subject_list = sorted(entry for entry in os.listdir(DATA_DIR) if 'sub' in entry)

subject_index = 8#@param {type:'integer'} 

# Name of the selected subject; used by later cells to build paths.
sub = subject_list[subject_index]

subject_path = DATA_DIR + f'{sub}/'
subject_files = os.listdir(subject_path)

# os.path.join avoids the doubled '/' that plain concatenation produced,
# because DATA_DIR already ends with a separator.
ch_name_file = os.path.join(DATA_DIR, sub, f'{sub}_channel_names.npy')
channel_names = np.load(ch_name_file)

print(f'[Working on {sub}]')


[Working on sub-023]





# <font color = "\#8FBC8F">**Loading Data**</font> 





In [6]:
#@title <font color='darkgreen'>Unpickle X and y - Utility
# Folder with the pickled feature/label matrices of the selected subject.
pickle_path = f'/content/drive/MyDrive/Colab Notebooks/Project Domino/Feature-Label matrices/{sub}/'

# A file whose name contains 'last' stores the name of the results folder to
# use; fail with an explicit message when it is missing.
last_markers = [f for f in os.listdir(pickle_path) if 'last' in f]
if not last_markers:
    raise FileNotFoundError(f'no "last" marker file found in {pickle_path}')
f_last = pickle_path + last_markers[0]

# Strip surrounding whitespace so a trailing newline in the marker file does
# not end up inside the results path.
with open(f_last, 'r') as f:
    last_date = f.read().strip()
results_path = pickle_path + last_date

# Make sure the output folder for the tuned hyper-parameters exists.
if 'best params' not in os.listdir(results_path):
    os.mkdir(results_path + '/best params')
    print('[\"best params\" Folder Created]')
else:
    print('[\"best params\" Folder Exists]')

# NOTE: unpickling executes arbitrary code stored in the file; only load
# pickles that this project produced itself.
_X = pd.read_pickle(results_path +'/X.pickle')
_y = pd.read_pickle(results_path +'/y.pickle')
# `X` / `y` are aliases of the loaded objects, not copies.
X, y = _X, _y

print('[Data loaded successfully]')


["best params" Folder Exists]
[Data loaded successfully]


In [7]:
#@title Function and dictionaries declaration
# NOTE(review): a `global` statement at module (cell) level has no effect in
# Python, and no visible cell assigns `seed`. Assign it explicitly
# (e.g. `seed = <int>`) before relying on a reproducible split in `split_norm`.
global seed

def split_norm(X, y=None, test_size=0.2, random_state=None):
    """Split features and labels into train/test parts and scale the features.

    The scaler (``NRM``) is fitted on the training rows only and then applied
    to both partitions, so nothing from the test rows enters the fit.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Its column labels are reused for the scaled frames.
    y : optional
        Labels to split alongside ``X``. When omitted, the notebook-level
        variable ``y`` is used (the original behaviour).
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : optional
        Forwarded to ``train_test_split``. When omitted, the notebook-level
        ``seed`` is used if it has been assigned, otherwise ``None``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the two feature frames are
        scaled DataFrames that keep the row index produced by the split.
    """
    notebook_ns = globals()
    if y is None:
        y = notebook_ns['y']
    if random_state is None:
        # `seed` may never have been defined; fall back to an unseeded split
        # instead of raising NameError.
        random_state = notebook_ns.get('seed')

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    nrm = NRM().fit(X_train)
    # Keep the original row labels: without `index=` the frames were
    # renumbered from 0 and no longer lined up with y_train / y_test.
    X_train = pd.DataFrame(nrm.transform(X_train), columns=X.columns, index=X_train.index)
    X_test = pd.DataFrame(nrm.transform(X_test), columns=X.columns, index=X_test.index)
    return X_train, X_test, y_train, y_test

def train_models(X_train, X_test, y_train, y_test):
    """Fit four default-configured classifiers on the training data.

    Parameters
    ----------
    X_train : array-like
        Training features, passed unchanged to every ``fit`` call.
    X_test, y_test :
        Unused; kept so the signature matches the other helpers.
    y_train :
        Training labels; must provide ``to_numpy()`` (e.g. a pandas
        Series/DataFrame). They are flattened to 1-D before fitting.

    Returns
    -------
    tuple
        ``(svc, knn, lr, gb)``: the fitted LinearSVC, KNeighborsClassifier,
        LogisticRegressionCV and GradientBoostingClassifier, in that order.
    """
    # Flatten the labels once instead of once per model.
    labels = y_train.to_numpy().ravel()

    svc = LinearSVC().fit(X_train, labels)
    knn = KNeighborsClassifier().fit(X_train, labels)
    lr = LogisticRegressionCV().fit(X_train, labels)
    gb = GradientBoostingClassifier().fit(X_train, labels)
    return svc, knn, lr, gb
    # return [svc]
    # return [knn]
    # return [lr]
    # return [gb]



def plot_conf_mxs(models, X_test, y_test, plot=False):
    """Compute (and optionally plot) a confusion matrix for every model.

    Parameters
    ----------
    models : iterable
        Fitted estimators exposing ``predict``.
    X_test, y_test :
        Evaluation features and true labels.
    plot : bool, default False
        When True each matrix is drawn with ``plot_confusion_matrix`` on a
        3x3-inch figure titled with the model name.

    Returns
    -------
    list of tuple
        One ``(channel_name, model_name, confusion_matrix)`` entry per model,
        in input order; an empty list when ``models`` is empty. The channel
        name is read from the notebook-level ``channel_names`` and
        ``channel_number``.

    Notes
    -----
    The previous version returned from inside the loop, so only the first
    model was ever evaluated and a bare tuple was returned.
    """
    confusion_matrices = []
    for mdl in models:
        # Class name = text before the first '(' of the estimator's repr.
        mdl_name = mdl.__str__().split('(')[0]

        if plot:
            # NOTE(review): plot_confusion_matrix was removed from recent
            # scikit-learn releases; confirm the installed version provides it.
            disp = plot_confusion_matrix(mdl, X_test, y_test)
            disp.figure_.set_size_inches([3,3])
            disp.ax_.set_title(mdl_name)
            cnf = disp.confusion_matrix
        else:
            cnf = confusion_matrix(y_test,mdl.predict(X_test))

        confusion_matrices.append((channel_names[channel_number], mdl_name, cnf))

    return confusion_matrices



def report_models(models, X_test, y_test, prnt=True):
    """Collect the macro-average classification metrics of each model.

    For every estimator in ``models`` a classification report (``CR`` with
    ``output_dict=True``) is computed on ``X_test`` / ``y_test``. With
    ``prnt`` enabled the full report is printed between two rows of
    asterisks.

    Returns a list with one ``(channel_name, model_name, macro_avg)`` tuple
    per model, where ``channel_name`` comes from the notebook-level
    ``channel_names[channel_number]``.
    """
    closing_rule = '*'*(60+len(' - classification report '))+'\n'
    collected = []

    for estimator in models:
        # Class name = text before the first '(' of the estimator's repr.
        label = estimator.__str__().split('(')[0]
        report = CR(y_test, estimator.predict(X_test), output_dict=True)

        if prnt:
            padding = '*'*(60-len(label))
            print(f'{label} - classification report '+padding+'\n')
            print(report)
            print(closing_rule)

        entry = (channel_names[channel_number], label, report['macro avg'])
        collected.append(entry)

    return collected

In [8]:
split = False #@param {type:'boolean'}
test_size = 0.2 #@param
if split:
    # Hold out `test_size` of the rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(_X, _y, test_size=test_size)
else:
    # No hold-out: tune on the full data set.
    X_train, y_train = _X, _y
    # Clear any hold-out left over from an earlier run with `split` enabled,
    # so later cells cannot silently use a stale test set.
    X_test = y_test = None

# Choose Model Type for RandomizedSearchCV

In [9]:
# Selects which of the "model & distribution declaration" cells below takes
# effect; each of those cells is guarded by `if model_type == ...`.
model_type = 'SVC'#@param ['XGBoost', 'DecisionTree', 'LogisticRegression', 'SVC']

# RandomizedSearch

## Model & Distribution Declarations

In [10]:
#@title XGBoost model & distribution declaration
# - <font color=Blue>Done

if model_type == 'XGBoost':
    # Estimator to tune, and the multiplier applied to the base number of
    # sampled candidates (n_iter = 100 * factor in the search cell).
    model = xgb.XGBClassifier()
    factor = 1 #@param

    # Candidate values sampled by the randomized search.
    distribution = dict(
        max_depth=list(range(1, 6)),
        learning_rate=[0.05, 0.1, 0.25, 0.5, 1.0],
        n_estimators=[100, 250, 500, 750, 1000, 1250, 1500],
        subsample=[0.6, 0.7, 0.8, 0.9, 1.0],
        colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0],
        colsample_bynode=[0.6, 0.7, 0.8, 0.9, 1.0],
        reg_alpha=[0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
    )

In [11]:
#@title DecisionTree model & distribution declaration
# - <font color=Blue>Done 
if model_type == 'DecisionTree':
    # Candidate values sampled by the randomized search.
    distribution = {
        'max_depth': list(range(1, 16)),
        # An integer min_samples_split must be at least 2 in scikit-learn;
        # the former candidate 1 made every fit that sampled it fail, and the
        # failure was hidden because warnings are suppressed in this notebook.
        'min_samples_split': [2, 4, 6, 8, 10],
        'min_samples_leaf': [1, 2, 3, 4],
    }

    model = DecisionTreeClassifier()
    # Multiplier for the number of sampled candidates (n_iter = 100 * factor).
    factor = 10 #@param

In [12]:
#@title LogisticRegression model & distribution declaration
# - <font color=Blue>Done
if model_type == 'LogisticRegression':
    # Elastic-net logistic regression fitted with the saga solver;
    # `l1_ratio` below is one of the tuned values.
    model = LogisticRegression(penalty='elasticnet', solver='saga')
    factor = 1 #@param

    # Candidate values sampled by the randomized search.
    distribution = dict(
        tol=[1e-3, 5e-4, 2e-4, 1e-4, 5e-5, 2e-5],
        C=[0.1, 0.2, 0.5, 1, 2, 5, 10],
        max_iter=[100, 200, 400, 750, 1000],
        l1_ratio=[0, 0.2, 0.4, 0.6, 0.8, 1],
    )

In [13]:
#@title SVC model & distribution declaration
# - <font color=Blue>Done
if model_type == 'SVC':
    # Linear support-vector classifier and the multiplier for the number of
    # sampled candidates (n_iter = 100 * factor in the search cell).
    model = LinearSVC()
    factor =  10#@param

    # Candidate values sampled by the randomized search.
    distribution = dict(
        tol=[5e-3, 2e-3, 1e-3, 5e-4, 2e-4, 1e-4, 5e-5, 2e-5, 1e-5],
        C=[0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10],
        max_iter=[100, 200, 500, 1000, 2000],
    )

## RandomizedSearchCV

In [14]:
#@title 
# Echo the selected model type before the (long-running) search starts.
print(f'running -{model_type}- RandomSearchCV')

running -SVC- RandomSearchCV


In [15]:
#@title RandSearchCV declaration
# Randomized hyper-parameter search over `distribution` for the selected
# `model`, scored by F1; `factor` scales the base budget of 100 candidates.
rscv = RandomizedSearchCV(
    estimator=model,
    param_distributions=distribution,
    n_iter=100*factor,
    scoring='f1',
    verbose=4,
)

# Filled by the training cell: channel number -> best parameters + 'f1-score'.
best_estimator_params = dict()

In [None]:
#@title Train
# Run the randomized search separately on the features of each channel.
# Iterate over the channel numbers actually present in the column index:
# `range(X_train.columns[-1][1])` stopped one short of the last column's
# channel and therefore never tuned it.
channel_ids = sorted(X_train.columns.get_level_values('Channel_Number').unique().tolist())

for channel_number in tqdm(channel_ids):
    # Columns of this channel only. A dedicated name is used so the
    # notebook-level `X` (the full feature matrix) is not overwritten.
    X_channel = X_train.xs(key=channel_number, axis=1, level='Channel_Number')
    rscv.fit(X_channel, y_train)

    param = rscv.best_params_
    score = rscv.best_score_

    print("*"*80)
    print(param)
    print("*"*80)

    # Replace (rather than update) the entry so that re-running this cell with
    # a different model type leaves no parameters of the previous model behind.
    best_estimator_params[channel_number] = {**param, 'f1-score': score}

In [None]:
# best_estimator_params

## Summary utility

In [None]:
# Best cross-validated F1 score per channel, keyed by channel name and by
# channel number respectively.
f1_by_ch_name = {channel_names[ch]: best['f1-score'] for ch, best in best_estimator_params.items()}
f1_by_ch_number = {ch: best['f1-score'] for ch, best in best_estimator_params.items()}
# Class name of the tuned estimator (e.g. 'LinearSVC'); used in output file
# names. `__name__` replaces the string parsing of `str(type(model))`.
model_name = type(model).__name__

In [None]:
# (channel_number, f1-score) pairs, best score first. Kept as a plain list:
# wrapping it in np.array coerced the channel numbers to floats, so they were
# written to the CSV as e.g. 3.0.
srt_ch = sorted(f1_by_ch_number.items(), key=lambda item: item[1], reverse=True)

# One row per channel: channel number followed by its best parameters and score.
summary = [[ch, *best_estimator_params[ch].values()] for ch, _ in srt_ch]
# Column names come from the first stored entry instead of assuming that
# channel 0 exists (iterating a dict yields its keys).
header = ['ch_number', *next(iter(best_estimator_params.values()), {})]



In [None]:
# summary

In [None]:
#@title plot f1 per param
# f1_scores = [v['f1-score'] for k,v in best_estimator_params.items()]

# csmpl_n, csmpl_t, lr, m_dpt, n_est, alhp, ssmpl = [], [], [], [], [], [], []
# for d in list(best_estimator_params.values()):

#     csmpl_n.append((d['colsample_bynode'], d['f1-score']))
#     csmpl_t.append((d['colsample_bytree'], d['f1-score']))
#     lr.append((d['learning_rate'], d['f1-score']))
#     m_dpt.append((d['max_depth'], d['f1-score']))
#     n_est.append((d['n_estimators'], d['f1-score']))
#     alhp.append((d['reg_alpha'], d['f1-score']))
#     ssmpl.append((d['subsample'], d['f1-score']))

# th = 0.6 #@param

# def f1_per_param(param=None, name=''):
#     '''
#     param is a list of tuples := (param_value, f1_score)
#     name is str
#     '''
#     param = np.array(sorted(param, key=lambda item: item[0]))

#     f1_sums = {p[0]:0 for p in param}
#     for p in param: 
#         if p[1] > th: f1_sums[p[0]] += p[1] 
    
    
#     plt.figure()
#     # plt.plot(param[:, 0], param[:, 1])
#     # plt.axhline(th, color='r')
#     # plt.title(f'{name}\'s f1-score per parameter value')

#     # plt.hist(param[:, 0])
#     # plt.title(f'{name}\'s parameter histogram')

#     plt.plot(list(f1_sums.keys()), list(f1_sums.values()))
#     plt.title(f'{name}\'s cumulative f1-score plot')


# f1_per_param(csmpl_n, 'colsample_bynode'), f1_per_param(csmpl_t, 'colsample_bytree')
# f1_per_param(lr, 'learning_rate'), f1_per_param(m_dpt, 'max_depth')
# f1_per_param(n_est, 'n_estimators'), f1_per_param(alhp, 'reg_alpha')
# f1_per_param(ssmpl, 'subsample')


## CSV and Pickling


In [None]:
#@title Save params and f1 scores in CSV format
# Write the header followed by the per-channel summary rows to
# "<results_path>/best params/<model_name>_summary.csv".
with open(results_path + f"/best params/{model_name}_summary.csv", mode="w", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows([header, *summary])

In [None]:
#@title Pickle best estimator params
# Persist the per-channel best parameters next to the CSV summary, using the
# highest pickle protocol available.
filename = results_path + f'/best params/best_{model_name}_params.pickle'

with open(filename, mode='wb') as pickle_out:
    pickle.dump(
        best_estimator_params,
        pickle_out,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

# <font color=black>Old Code

## Model Examination

In [None]:
# model = xgb.XGBClassifier(
#     colsample_bynode = 0.6,
#     colsample_bytree = 0.9,
#     learning_rate = 1,
#     max_depth = 5,
#     n_estimators = 100,
#     reg_alpha = 0.25,
#     subsample = 0.7,
# )


In [None]:
#@title Train
# ch_f1_scores = {}

# for channel_number in range(X_train.columns[-1][1]):
#     X = X_train.xs(key=channel_number, axis=1, level='Channel_Number')
#     model.fit(X, y_train)

#     cr = CR(y_train, model.predict(X), output_dict=1)
#     cvs = cross_val_score(model, X, y_train, scoring='f1')
#     mean_cvs = np.mean(cvs)
    
#     ch_f1_scores[channel_number] = mean_cvs
#     print(f'avg cross validation f1-score for channel {channel_number}: {mean_cvs:.3f}')
#     # pdb.set_trace()


In [None]:
# import pickle
# filename = pickle_path + last_date + '/best_estimator_params_2.pickle'

# with open(filename, 'rb') as handle:
#     best_estimator_params = pickle.load(handle) # , protocol=pickle.HIGHEST_PROTOCOL

# f1_by_ch_number = { k:v['f1-score'] for k,v in best_estimator_params.items()}

In [None]:
# srt_ch_idx = np.argsort(list(f1_by_ch_number.values()))[::-1]
# f1s = np.array(list(f1_by_ch_number.values()))
# print(f1s[srt_ch_idx])

In [None]:
# import csv
# with open(pickle_path + last_date +"/special param f1-scores per channel name.csv", "w", newline="") as f:
#     writer = csv.writer(f)
#     writer.writerow(('ch_num','f1-score'))
#     for i in srt_ch_idx:
#         writer.writerow((channel_names[i],np.array(list(ch_f1_scores.values()))[i]))

    

## GridSearch

## GridSearchCV

In [None]:
#@title Model declaration

# model = xgb.XGBClassifier(
#     # max_depth=2,          # with cumulative f1-score of 15.705871905871904
#     # learning_rate=0.5,    # with cumulative f1-score of 8.81094128094128
#     n_estimators=1_000,     # with cumulative f1-score of 
#     subsample=0.8,          # with cumulative f1-score of 
# colsample_bytree=0.6,       # with cumulative f1-score of 
#     colsample_bynode=0.6,   # with cumulative f1-score of 
#     reg_alpha=0.1,          # with cumulative f1-score of 
#     scale_pos_weight=1      # with cumulative f1-score of 
#     )

# cv_params = {
#              'max_depth': [1, 2, 3, 4],
#              'learning_rate': [0.2, 0.35, 0.5, 0.75, 1.0],
#             #  'n_estimators': [100, 500, 1_000, 2_000, 5_000]
#             #  'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
#             #  'colsample_bytree': [0.2, 0.4, 0.6, 0.8, 1.0],
#             #  'colsample_bynode': [0.2, 0.4, 0.6, 0.8, 1.0],
#             #  'reg_alpha': [0.01, 0.05, 0.1, 0.2, 0.5],
#             }
  

# csv = GridSearchCV(model, cv_params,
#                    scoring='f1', cv=5, verbose=4)

# best_estimator_params = {}

# # model.fit(_X, _y)

In [None]:
#@title Train model
# for channel_number in range(X_train.columns[-1][1]):
#     X = X_train.xs(key=channel_number, axis=1, level='Channel_Number')
#     csv.fit(X, y_train)

#     param = csv.best_params_
#     score = csv.best_score_

#     print("*"*80)
#     print(param)
#     print("*"*80)

#     if channel_number not in best_estimator_params.keys(): 
#         best_estimator_params[channel_number] = {}

#     for k,v in param.items():
#         best_estimator_params[channel_number][k] = v
#     best_estimator_params[channel_number]['f1-score'] = score

In [None]:
# best_estimator_params

## GridSearchCV Results

In [None]:
# dpt, lrs = [], []
# for d in list(best_estimator_params.values()):
#     lrs.append((d['learning_rate'], d['f1-score']))
#     dpt.append((d['max_depth'], d['f1-score']))

# def f1_per_param(param):
#     '''
#     param is a list of tuples := (param_value, f1_score)
#     '''
#     param = np.array(param)
#     param_results = {k:0 for k in np.unique(param[:,0])}
    
#     for t in param: param_results[t[0]]  += t[1]

#     counts = pd.value_counts(np.array(param)[:,0])
#     print('Value Counts:')
#     _ = [pp((k,v)) for k, v in pd.value_counts(np.array(param)[:,0]).items()]

#     print('\nCumulative f1-score:')
#     _ = [pp((k, v)) for k, v in sorted(param_results.items(), key=lambda item: item[1], reverse=True)]

#     print('\nNormalized f1-score:')
#     _ = [pp((k, v/counts[k])) for k, v in sorted(param_results.items(), key=lambda item: item[1], reverse=True)]
    

# # f1_per_param(dpt)
# f1_per_param(lrs)


##  <font color=black>Deprecated model declarations

In [None]:
#@title GradientBoosting model & distribution declaration
# distribution = {
#                 'max_depth': [1,2,3,4,5],
#                 'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0],
#                 'n_estimators': [100, 250, 500, 750, 1_000, 1_250, 1_500],
#                 'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
#                 'min_samples_split': [1,2,4,6,8,10],
#                 'min_samples_leaf': [1,2,3,4],
#                 }
# model = GradientBoostingClassifier()
# factor = 1 #@param

In [None]:
#@title KNN model & distribution declaration
# distribution = {
#                 'n_neighbors': [1,3,5,7,9],
#                 'leaf_size': [1,2,4,6,8],
#                 'p': [1,2],
#                 }
# model = KNeighborsClassifier()
# factor =  50#@param