In [1]:
# ! git clone https://github.com/Xtra-Computing/thundersvm.git
# ! cd thundersvm && mkdir build && cd build && cmake .. && make -j
# ! python /content/thundersvm/python/setup.py install

# #-- Building for: Visual Studio 16 2019

In [1]:
# Load the ThunderSVM Python wrapper straight from the local source checkout
# (relative, Windows-style path) instead of from an installed package, then
# expose its SVC class.
# NOTE(review): SourceFileLoader.load_module() is a deprecated importlib API -
# confirm it still exists on the Python version used to run this notebook.
from importlib.machinery import SourceFileLoader
thundersvm = SourceFileLoader("thundersvm", r"thundersvm\python\thundersvm\thundersvm.py").load_module()
from thundersvm import SVC

In [2]:
# Module-level ThunderSVM classifier with default parameters.
# NOTE(review): presumably a smoke test that the wrapper loaded; the evaluation
# code below builds its own local `svm` inside subj_dept_target.
svm = SVC()

In [3]:
import pandas as pd
import numpy as np
import pickle as pickle
import os
import time
from pywt import wavedec
import pyeeg
import scipy.io as sio

from pathlib import Path
# Working directory of the notebook kernel and its parent directory.
# NOTE(review): neither name is referenced in the visible part of the notebook.
cwd = os.getcwd()
parent = Path(cwd).parent

In [4]:
from pywt import Wavelet
from math import floor, ceil
from numpy import concatenate, flipud, zeros, convolve, array

def padding_symmetric(signal, size=8):
    '''
    Applies a symmetric (mirror) padding of the specified size to the input signal.
    The first `size` samples are reversed and prepended, the last `size` samples
    are reversed and appended.
    Parameters
    ----------
    signal : ndarray or list
        The signal to be padded.
    size : int, optional
        The size of the padding which corresponds to the size of the filter. The default is 8.
        If the signal is shorter than `size`, each side is padded with the whole
        (reversed) signal, so fewer than `size` samples are added per side.
    Returns
    -------
    padded_signal : ndarray
        Padded signal.
    Raises
    ------
    ValueError
        If `size` is negative.
    '''
    if size < 0:
        raise ValueError(f"'size' must be non-negative, received: {size}")

    samples = np.asarray(signal)
    head = samples[:size]
    # `samples[-0:]` is the WHOLE signal, so a zero-sized padding needs an
    # explicit empty tail instead of the negative-index slice.
    tail = samples[-size:] if size > 0 else samples[:0]
    padded_signal = concatenate([flipud(head), samples, flipud(tail)])
    return padded_signal


def restore_signal(signal, reconstruction_filter, real_len):
    '''
    Upsamples a coefficient sequence by two, filters it with the reconstruction
    filter and trims the result symmetrically to the requested length.
    Parameters
    ----------
    signal : ndarray
        The signal to be restored.
    reconstruction_filter : list
        The reconstruction filter to be used for restoring the signal.
    real_len : int
        Real length of the signal.
    Returns
    -------
    restored_signal : ndarray
        Restored signal of the specified length.
    '''
    n_samples = len(signal)

    # Zero-stuffing: the samples land on the odd positions 1, 3, ..., 2n-1.
    upsampled = zeros(2 * n_samples + 1)
    upsampled[1:2 * n_samples:2] = signal

    filtered = convolve(upsampled, reconstruction_filter)
    filtered_len = len(filtered)

    # Drop the surplus evenly; an odd surplus loses the extra sample at the end.
    surplus = (filtered_len - real_len) / 2
    first_kept = int(floor(surplus))
    end_kept = filtered_len - int(ceil(surplus))
    return filtered[first_kept:end_kept]

def DWTfn(signal, level=3, mother_wavelet='db4'):
    '''
    Decomposes a signal with a multi-level Discrete Wavelet Transform and
    reconstructs the approximation and detail of every level back to the
    original signal length.
    Parameters
    ----------
    signal : ndarray or list
        The signal on which the DWT will be applied.
    level : int, optional
        The decomposition levels for the DWT. The default is 3.
    mother_wavelet : str, optional
        The mother wavelet that it is going to be used in the DWT. The default is "db4".
    Returns
    -------
    restored_approx_coeff : list
        Restored approximations coefficients, one entry per level; index 0 is
        level 1 (the first decomposition step).
    restored_detail_coeff : list
        Restored detail coefficients, in the same level order.
    Raises
    ------
    TypeError
        If `signal` is neither an ndarray nor a list, is not numeric, if `level`
        is not an int greater than 0, or if `mother_wavelet` is not in the
        accepted list.
    '''
    accepted_wavelets = (
        'haar',
        'db1', 'db2', 'db3', 'db4', 'db5', 'db6', 'db7', 'db8', 'db9', 'db10',
        'db11', 'db12', 'db13', 'db14', 'db15', 'db16', 'db17', 'db18', 'db19', 'db20',
        'db21', 'db22', 'db23', 'db24', 'db25', 'db26', 'db27', 'db28', 'db29', 'db30',
        'db31', 'db32', 'db33', 'db34', 'db35', 'db36', 'db37', 'db38',
        'sym2', 'sym3', 'sym4', 'sym5', 'sym6', 'sym7', 'sym8', 'sym9', 'sym10',
        'sym11', 'sym12', 'sym13', 'sym14', 'sym15', 'sym16', 'sym17', 'sym18', 'sym19', 'sym20',
        'coif1', 'coif2', 'coif3', 'coif4', 'coif5', 'coif6', 'coif7', 'coif8', 'coif9',
        'coif10', 'coif11', 'coif12', 'coif13', 'coif14', 'coif15', 'coif16', 'coif17',
        'bior1.1', 'bior1.3', 'bior1.5', 'bior2.2', 'bior2.4', 'bior2.6', 'bior2.8',
        'bior3.1', 'bior3.3', 'bior3.5', 'bior3.7', 'bior3.9', 'bior4.4', 'bior5.5', 'bior6.8',
        'rbio1.1', 'rbio1.3', 'rbio1.5', 'rbio2.2', 'rbio2.4', 'rbio2.6', 'rbio2.8',
        'rbio3.1', 'rbio3.3', 'rbio3.5', 'rbio3.7', 'rbio3.9', 'rbio4.4', 'rbio5.5', 'rbio6.8',
        'dmey',
        'gaus1', 'gaus2', 'gaus3', 'gaus4', 'gaus5', 'gaus6', 'gaus7', 'gaus8',
        'mexh', 'morl',
        'cgau1', 'cgau2', 'cgau3', 'cgau4', 'cgau5', 'cgau6', 'cgau7', 'cgau8',
        'shan', 'fbsp', 'cmor',
    )

    # ---- argument validation -------------------------------------------
    received_list = type(signal) == list
    if type(signal).__name__ != "ndarray" and not received_list:
        raise TypeError(f"'signal' must be 'ndarray', received: '{type(signal).__name__}'")
    if received_list:
        signal = array(signal)
    dtype_name = signal.dtype.name
    if not ("float" in dtype_name or "int" in dtype_name):
        raise TypeError("All elements of 'signal' must be numbers")

    if type(level) != int:
        raise TypeError(f"'level' must be 'int', received: '{type(level).__name__}'")
    if level < 1:
        raise TypeError(f"'level' must be greater than 0, received: {level}")

    if mother_wavelet not in accepted_wavelets:
        quoted = [f"'{wavelet_name}'" for wavelet_name in accepted_wavelets]
        listed = ", ".join(quoted[:-1])
        raise TypeError(f"Invalid 'mother_wavelet' must be {listed}, or {quoted[-1]}, received: '{mother_wavelet}'")

    # ---- decomposition -------------------------------------------------
    original_len = len(signal)
    wavelet = pywt.Wavelet(mother_wavelet)
    dec_lo, dec_hi = wavelet.dec_lo, wavelet.dec_hi
    taps = len(dec_lo)

    approx_coeff, detail_coeff = [], []
    for _ in range(level):
        padded = padding_symmetric(signal, taps)
        stop = 2 * taps + len(signal) - 1
        # Filter, cut away the padding region, then keep every second sample.
        low_band = convolve(padded, dec_lo)[taps:stop][1::2]
        high_band = convolve(padded, dec_hi)[taps:stop][1::2]
        approx_coeff.append(low_band)
        detail_coeff.append(high_band)
        # The next level decomposes the approximation of this one.
        signal = low_band

    # ---- reconstruction ------------------------------------------------
    # Target lengths ordered from the deepest restoration step up to the
    # original signal: [len(A[level-2]), ..., len(A[0]), original_len].
    real_lengths = [len(approx_coeff[depth]) for depth in range(level - 2, -1, -1)]
    real_lengths.append(original_len)

    def restore_all(coefficients, reconstruction_filter):
        # Every coefficient set is upsampled once per level it sits below the
        # original signal, always with the same reconstruction filter.
        # NOTE(review): the detail branch therefore applies rec_hi at every
        # step, not only the first - confirm this is the intended reconstruction.
        restored = []
        for depth, band in enumerate(coefficients):
            current = restore_signal(band, reconstruction_filter, real_lengths[level - 1 - depth])
            for step in range(depth):
                current = restore_signal(current, reconstruction_filter, real_lengths[level - depth + step])
            restored.append(current)
        return restored

    restored_approx_coeff = restore_all(approx_coeff, wavelet.rec_lo)
    restored_detail_coeff = restore_all(detail_coeff, wavelet.rec_hi)
    return restored_approx_coeff, restored_detail_coeff

def entropy_fn(signal):
    '''
    Computes sum(x**2 * log2(x**2)) over the samples of the signal.
    Samples equal to zero contribute nothing (the 0*log(0) := 0 convention);
    without this a single zero sample turns the whole result into NaN.
    Parameters
    ----------
    signal : ndarray or list
        Real-valued samples.
    Returns
    -------
    entropy_val : float
        The accumulated value; 0 for an empty or all-zero signal.
    '''
    # NOTE(review): Shannon-style entropies are usually defined with a leading
    # minus sign; the sign of the original implementation is kept so existing
    # feature files stay comparable - confirm which definition is wanted.
    squared = np.asarray(signal, dtype=float) ** 2
    nonzero = squared[squared > 0]
    return np.sum(nonzero * np.log2(nonzero))

def energy_fn(signal):
    '''
    Returns the energy of the signal, i.e. the sum of its squared samples.
    '''
    samples = np.array(signal)
    return np.sum(np.square(samples))
        
import pywt
def dwt_fn(signal):
    '''
    Extracts wavelet-band features from one signal window.
    The window is decomposed with a 4-level 'db4' DWT (see DWTfn) and, for each
    of the four restored detail signals, the entropy (entropy_fn) and energy
    (energy_fn) are computed.
    Parameters
    ----------
    signal : ndarray or list
        The window to analyse.
    Returns
    -------
    band_instance : dict
        Keys "<band>_entropy" and "<band>_energy" for the bands theta, alpha,
        beta and gamma, in that order.
    '''
    _, restored_detail_coeff = DWTfn(signal, 4, 'db4')

    # DWTfn appends one entry per decomposition step, so index 0 is level 1
    # (the finest detail) and index 3 is level 4. Unlike pywt.wavedec, the list
    # is NOT ordered from the deepest level downwards.
    d1, d2, d3, d4 = restored_detail_coeff

    # Assuming 128 Hz sampling (the default sample_rate of feature_extraction),
    # level 1 covers roughly 32-64 Hz, level 2 16-32 Hz, level 3 8-16 Hz and
    # level 4 4-8 Hz - TODO confirm against the recording setup.
    bands = {'theta': d4, 'alpha': d3, 'beta': d2, 'gamma': d1}

    band_instance = {}
    for band_name, band in bands.items():
        band_instance[f"{band_name}_entropy"] = entropy_fn(band)
        band_instance[f"{band_name}_energy"] = energy_fn(band)

    return band_instance

from scipy.stats import kurtosis, skew, entropy
def extract_time_domain_features(signal, verbose=False):
    '''
    Computes the time-domain descriptors of one signal window.
    Parameters
    ----------
    signal : ndarray
        The window to describe.
    verbose : bool, optional
        When True the resulting dictionary is printed. The default is False.
    Returns
    -------
    feature_vector_dict : dict
        Mean, standard deviation, range (max - min), skewness, kurtosis and the
        three Hjorth parameters; activity is the squared standard deviation,
        mobility and complexity come from pyeeg.hjorth.
    '''
    feature_vector_dict = {}
    feature_vector_dict["mean"] = np.mean(signal)

    spread = np.std(signal)
    feature_vector_dict["std"] = spread
    feature_vector_dict["range"] = np.max(signal) - np.min(signal)
    feature_vector_dict["skewness"] = skew(signal)
    feature_vector_dict["kurtosis"] = kurtosis(signal)

    feature_vector_dict["hjorth_param_activity"] = spread**2
    mobility, complexity = pyeeg.hjorth(signal)
    feature_vector_dict["hjorth_param_mobility"] = mobility
    feature_vector_dict["hjorth_param_complexity"] = complexity

    if verbose:
        print(feature_vector_dict)
    return feature_vector_dict

# DWT Feature Extraction

In [5]:
from scipy import stats
def _prefixed_window_features(window, prefix, timedomain, timefreq):
    '''Computes the enabled feature groups of one window, keyed as "<prefix>_<feature>".'''
    features = {}
    if timedomain:
        for feature_name, value in extract_time_domain_features(window).items():
            features[f"{prefix}_{feature_name}"] = value
    if timefreq:
        for feature_name, value in dwt_fn(window).items():
            features[f"{prefix}_{feature_name}"] = value
    return features


def _baseline_window_features(baseline_slice, prefix, window_size, timedomain, timefreq):
    '''Computes one channel's baseline features for a 3 s (384) or 1 s (128) analysis window.'''
    if window_size == 384:
        return _prefixed_window_features(baseline_slice, prefix, timedomain, timefreq)
    if window_size == 128:
        # Average the features of the three consecutive one-second slices.
        totals = {}
        for begin in (0, 128, 256):
            one_second = _prefixed_window_features(baseline_slice[begin:begin + 128], prefix, timedomain, timefreq)
            for key, value in one_second.items():
                totals[key] = totals[key] + value if key in totals else value
        return {key: total / 3 for key, total in totals.items()}
    raise Exception("Window size must be either 1 or 3 seconds long to use baseline.")


def feature_extraction(subjects, channel=[1,7,15,17,25], window_size=640, step_size=320, sample_rate=128, timedomain=True, timefreq=True, baseline=False, directory='data_python'):
    '''
    Builds (or reuses) the windowed feature table for a list of subjects and
    stores it as a CSV file.
    For every trial of every subject, the first 384 samples are treated as the
    baseline period; the remainder is cut into windows of `window_size` samples
    advanced by `step_size`. Each window yields one row holding the trial
    labels plus the features of every requested channel.
    Parameters
    ----------
    subjects : iterable
        Subject identifiers used in the file name "s<sub>.<ext>", e.g. '01'.
    channel : list, optional
        Row indices of the EEG channels to use. The default is [1,7,15,17,25].
    window_size : int, optional
        Window length in samples. The default is 640.
    step_size : int, optional
        Distance between window starts in samples. The default is 320.
    sample_rate : int, optional
        Currently unused: the code assumes 128 samples per second throughout.
    timedomain : bool, optional
        Include the features of extract_time_domain_features. The default is True.
    timefreq : bool, optional
        Include the features of dwt_fn. The default is True.
    baseline : bool, optional
        Subtract the baseline-period features from every window. Requires a
        window of 128 or 384 samples. The default is False.
    directory : str, optional
        Folder (relative to the parent directory) holding the subject files.
        'data_python' selects pickled ".dat" files, anything else ".mat".
    Returns
    -------
    result : dict
        {"csv_path": path, "data": DataFrame}; "data" is None when the CSV
        already existed and nothing was recomputed.
    Raises
    ------
    ValueError
        If both `timedomain` and `timefreq` are False, or `step_size` is not positive.
    Exception
        If `baseline` is requested with a window that is not 128 or 384 samples.
    '''
    if not (timedomain or timefreq):
        # Without this check the output file name below would never be defined.
        raise ValueError("At least one of 'timedomain' and 'timefreq' must be True.")
    if step_size <= 0:
        raise ValueError(f"'step_size' must be positive, received: {step_size}")

    info_keys = ["Sub", "Trial", "Valence", "Arousal", "Emotion"]
    three_sec = 128*3

    # Optional electrode labels for the two known 5-channel layouts. Columns are
    # prefixed with the channel index unless `use_channel_names` is switched on.
    use_channel_names = False
    if channel == [1,7,15,17,25]:
        channel_names = {1: "AF3", 7: "T7", 15: "Pz", 17: "AF4", 25: "T8"}
    elif channel == [0,1,2,3,4]:
        channel_names = {0: "AF3", 1: "T7", 2: "Pz", 3: "AF4", 4: "T8"}
    else:
        channel_names = {}

    def column_prefix(chan):
        return channel_names.get(chan, chan) if use_channel_names else chan

    # ---- output location -----------------------------------------------
    feature = "dwt_baseline" if baseline else "dwt"
    if directory != "data_python":
        tag_name, extension = "custom", "mat"
    else:
        tag_name, extension = "", "dat"

    if timedomain and timefreq:
        domain = "time_timefreq"
    elif timedomain:
        domain = "time"
    else:
        domain = "timefreq"
    chan_title = str(len(channel))+"chan"
    csv_filename = f'data{tag_name}/{feature}/{chan_title}_{domain}_{int(window_size/128)}s-{step_size/128}step.csv'
    print(csv_filename)
    if os.path.exists(csv_filename):
        print(f"{csv_filename} already exists.")
        return {"csv_path":csv_filename, "data":None}

    # When the table without baseline correction is already on disk, its rows
    # are reused and only the baseline features are computed.
    cached_trials = None
    if baseline:
        csv_filename_without = csv_filename.replace("dwt_baseline","dwt")
        if os.path.exists(csv_filename_without):
            print(f"{csv_filename_without} already exists. Will use as trial data.")
            cached_trials = pd.read_csv(csv_filename_without)

    meta = []
    for sub in subjects:
        subject_time = time.time()
        subject_file = f'../{directory}/s{sub}.{extension}'
        try:
            # SECURITY: pickle.load can execute arbitrary code - only load
            # subject files that come from a trusted source.
            with open(subject_file, 'rb') as file:
                subject = pickle.load(file, encoding='latin1') #resolve the python 2 data problem by encoding : latin1
        except Exception:
            # Not a pickle: fall back to the MATLAB reader.
            subject = sio.loadmat(subject_file)

        # The trials are processed for both loaders (previously this code was
        # nested in the fallback branch and skipped after a successful unpickle).
        for trial in range(len(subject["data"])):
            eeg = subject["data"][trial]

            val = 1 if subject["labels"][trial][0] >= 5 else 0
            aro = 1 if subject["labels"][trial][1] >= 5 else 0
            emotion = 2*val + aro  # 0 LALV, 1 HALV, 2 LAHV, 3 HAHV
            info = {"Sub":sub, "Trial":trial, "Valence":val, "Arousal":aro, "Emotion":emotion}

            baseline_features = {}
            if baseline:
                for chan in channel:
                    baseline_features.update(
                        _baseline_window_features(eeg[chan][0 : three_sec], column_prefix(chan), window_size, timedomain, timefreq)
                    )

            if cached_trials is None:
                start = three_sec
                # `<=` keeps the final window when it ends exactly on the last sample.
                while start + window_size <= eeg.shape[1]:
                    instance = dict(info)
                    for chan in channel:
                        # Features are computed on the raw (not z-scored) window.
                        eeg_slice = eeg[chan][start : start + window_size]
                        instance.update(_prefixed_window_features(eeg_slice, column_prefix(chan), timedomain, timefreq))

                    if baseline:
                        for key in instance:
                            if key not in info_keys:
                                instance[key] = instance[key] - baseline_features[key]
                    meta.append(instance)
                    start = start + step_size
            else:
                # int(sub) keeps '10', '20' and '30' intact (stripping every
                # '0' character mapped them to subjects 1, 2 and 3).
                subject_num = int(sub)
                rows = cached_trials.loc[
                    (cached_trials['Sub']==subject_num) & (cached_trials['Trial']==trial)
                ].drop(info_keys, axis=1).to_dict(orient='records')
                for row in rows:
                    instance = dict(info)
                    for key, value in row.items():
                        instance[key] = value - baseline_features[key]
                    meta.append(instance)
        print(f"Completed subject {sub} in {round(time.time()-subject_time,2)}s")

    df = pd.DataFrame(meta)

    os.makedirs(os.path.dirname(csv_filename), exist_ok=True)
    df.to_csv(csv_filename,index=False)

    return {"csv_path":csv_filename, "data":df}

In [6]:
# sample_rate = 128 

# subject_list = ['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31','32']
# #subject_list = ['01']
# channels = list(range(0,32)) #32chan
# #channels = [1,7,15,17,25] #5chan
# # list(range(0,32)) 32chan
# # [0,16,2,19]

# window_in_sec = 5
# window_size = window_in_sec*sample_rate
# #step_size = int(window_size/2)
# step_size = window_size

# timedomain=False
# timefreq=True

# start_time = time.time()
# data = feature_extraction(subjects=subject_list, channel=channels, window_size=window_size, step_size=step_size, timedomain=timedomain, timefreq=timefreq)
# print(f"Size of generated data: {data.shape}")
# print(f"Time taken to process dataset: {round(time.time()-start_time,2)}s.")

In [27]:
def find_best_params(target, df, random_state=1):
    '''
    Grid-searches the SVC hyperparameters for one classification target.
    One third of the rows (the first test fold of a stratified 3-fold split) is
    set aside and not used; the remaining rows are standardised and searched
    with a stratified 7-fold cross-validation scored by accuracy.
    Parameters
    ----------
    target : str
        "val" or "aro" predict Valence / Arousal from the features only;
        "valaro" predicts Valence with Arousal kept as a feature and "aroval"
        predicts Arousal with Valence kept as a feature.
    df : DataFrame
        Feature table containing the columns Sub, Emotion, Trial, Valence and Arousal.
    random_state : int, optional
        Seed of both stratified splits. The default is 1.
    Returns
    -------
    best_parameters : dict
        The `best_params_` of the fitted grid search.
    Raises
    ------
    ValueError
        If `target` is not one of the four supported names.
    '''
    if target not in ("val", "aro", "valaro", "aroval"):
        raise ValueError(f"'target' must be 'val', 'aro', 'valaro' or 'aroval', received: '{target}'")

    label = "Valence" if target in ("val", "valaro") else "Arousal"
    dropped = ['Sub','Emotion','Trial']
    dropped += ['Valence','Arousal'] if target in ("val", "aro") else [label]
    y = df[label]
    x = df.drop(dropped, axis=1)

    # The split yields POSITIONS, so rows are selected with iloc; matching them
    # against index labels is only correct for a freshly reset index.
    outer = StratifiedKFold(n_splits = 3, shuffle=True, random_state = random_state)
    search_positions, _held_out_positions = next(outer.split(x, y))
    x = x.iloc[search_positions].reset_index(drop=True)
    y = y.iloc[search_positions].reset_index(drop=True)

    skf = StratifiedKFold(n_splits = 7, shuffle=True, random_state = random_state)
    param_grid = {'C':[1, 50, 100, 200, 300],'gamma':[0.00001,0.001,1, 50, 100], 'kernel':['rbf']}
    grid = GridSearchCV(SVC(), param_grid, refit = True, verbose=0, n_jobs=1, cv=skf, scoring = 'accuracy')

    # NOTE(review): the scaler is fitted on all search rows before the
    # cross-validation, so validation folds influence the scaling - confirm
    # this is acceptable.
    x = StandardScaler().fit_transform(x)
    grid.fit(x, y)

    return grid.best_params_
    

In [41]:
def convert_dict_to_string(dictionary):
    '''
    Serialises a dictionary as "key:value" pairs separated by ", ".
    An empty dictionary gives an empty string.
    '''
    return ", ".join(f"{key}:{value}" for key, value in dictionary.items())

from sklearn.metrics import recall_score, precision_score, f1_score
def subj_dept_target(target = None, df = None, hyperparameters=None, random_state=1, verbose=True):
    '''
    Runs the subject-dependent evaluation of one target for subjects 1 to 32.
    For every subject the hyperparameters are tuned with find_best_params, the
    features are standardised and an SVC is scored with a stratified 8-fold
    cross-validation; the subject's accuracy is the mean over the folds.
    Parameters
    ----------
    target : str
        "val", "aro", "valaro" or "aroval" (see find_best_params).
    df : DataFrame
        Feature table of all subjects with the columns Sub, Emotion, Trial,
        Valence and Arousal.
    hyperparameters : dict, optional
        Accepted for interface compatibility but not used; parameters are
        searched per subject instead.
    random_state : int, optional
        Seed of the 8-fold split. The default is 1.
    verbose : bool, optional
        Print the total run time at the end. The default is True.
    Returns
    -------
    result : tuple
        (mean accuracy, standard deviation, list of per-subject accuracies,
        dict "Subject<n>_<target>" -> hyperparameter string).
    '''
    subject_dfs = [df[df['Sub'].isin([subject])].reset_index(drop=True) for subject in range(1,33)]

    full_start_time = time.time()
    acc = []
    param_subs = {}
    print(f"{target}")

    for num, subject_df in enumerate(subject_dfs, start=1):
        start_time = time.time()
        params = find_best_params(target,subject_df)
        param_subs[f"Subject{num}_{target}"] = convert_dict_to_string(params)

        if target == "val" or target == "valaro": y = subject_df.Valence
        if target == "aro" or target == "aroval": y = subject_df.Arousal

        if target == "val" or target == "aro": x = subject_df.drop(['Sub','Emotion','Trial','Valence','Arousal'],axis=1)
        if target == "valaro": x = subject_df.drop(['Sub','Emotion','Trial','Valence'],axis=1)
        if target == "aroval": x = subject_df.drop(['Sub','Emotion','Trial','Arousal'],axis=1)

        # NOTE(review): the scaler is fitted on the subject's complete data
        # before the folds are formed - confirm this is acceptable.
        x = StandardScaler().fit_transform(x)

        model = SVC()
        model.set_params(**params)
        skf = StratifiedKFold(n_splits = 8, shuffle=True, random_state = random_state)

        subj_acc = []
        for train_index, test_index in skf.split(x,y):
            # The split yields positions, hence iloc for the label Series.
            model.fit(x[train_index], y.iloc[train_index])
            subj_acc.append(model.score(x[test_index], y.iloc[test_index]))

        acc.append(np.mean(subj_acc))

        # Extrapolate the total duration from the first subject.
        if num == 1: print(f"[Estimated time for all subs: ~{round(time.time()-start_time,2) * 32}s]")

    if verbose: print(f"{target} completed - {time.time()-full_start_time}s")
    return (np.mean(acc), np.std(acc), acc, param_subs)

In [42]:
from sklearn.preprocessing import StandardScaler
#from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.model_selection import GridSearchCV
import time    

def hasNumbers(inputString):
    '''Returns True when at least one character of the string is a digit.'''
    for char in inputString:
        if char.isdigit():
            return True
    return False

def _parse_hyperparameter_value(text):
    '''Converts the text of one hyperparameter value back into None, int, float or str.'''
    if text == "None":
        return None
    if any(char.isdigit() for char in text):
        # Try int first, then float, so "200" stays an int while "0.001" and
        # scientific notation such as "1e-05" become floats.
        for cast in (int, float):
            try:
                return cast(text)
            except ValueError:
                continue
    return text


def get_params_from_indept(dependency, model_name, domain, channels, window_size, step_size):
    '''
    Looks up the hyperparameters stored for a configuration in
    'Results/Results - Indept - CUSTOM.csv'.
    The first row matching `domain`, `window_size` and `channels` is used; its
    HP_VAL, HP_ARO, HP_VALARO and HP_AROVAL cells hold strings of the form
    "name:value, name:value". When no row matches, or a column is missing,
    every target falls back to {'C': 200}.
    Parameters
    ----------
    dependency, model_name, step_size
        Accepted for interface compatibility but not used for the lookup.
    domain : str
        Value compared with the 'Domain' column.
    channels : int
        Value compared with the 'Channels' column.
    window_size : int
        Value compared with the 'Window Size' column.
    Returns
    -------
    params : tuple of dict
        The parameter dictionaries for val, aro, valaro and aroval, in that order.
    '''
    results_indept = pd.read_csv('Results/Results - Indept - CUSTOM.csv')

    matching = results_indept.loc[
        (results_indept['Domain'] == domain)
        & (results_indept['Window Size'] == window_size)
        & (results_indept['Channels'] == channels)
    ]

    try:
        params = [matching[column].iloc[0] for column in ('HP_VAL', 'HP_ARO', 'HP_VALARO', 'HP_AROVAL')]
    except (IndexError, KeyError):
        # No matching row or missing column: use the default for all targets.
        params = ['C:200', 'C:200', 'C:200', 'C:200']

    params_dicts = []
    for param in params:
        current_dict = {}
        for pair in param.split(", "):
            name, raw_value = pair.split(":", 1)
            current_dict[name] = _parse_hyperparameter_value(raw_value)
        params_dicts.append(current_dict)

    return params_dicts[0], params_dicts[1], params_dicts[2], params_dicts[3]

def convert_dict_to_string(dictionary):
    '''
    Renders a dictionary as a single "key:value, key:value" string.
    Note: a function with the same name and behaviour is defined in an earlier
    cell; this definition shadows it once the cell has run.
    '''
    pairs = [f"{key}:{value}" for key, value in dictionary.items()]
    hyperstring = ", ".join(pairs)
    return hyperstring

def _parse_feature_csv_path(csv_path):
    '''
    Extracts the experiment configuration encoded in a feature CSV path such as
    'datacustom/dwt/5chan_timefreq_1s-1.0step.csv'.
    Parameters
    ----------
    csv_path : str
        Path of the feature CSV, using '/' as the separator.
    Returns
    -------
    tuple
        (domain, channels, window_size, step_size) where domain is 'T-TF', 'TF'
        or 'T', channels and window_size are ints and step_size is a string.
    Raises
    ------
    ValueError
        If the path contains none of the known domain markers.
    '''
    # The domain markers are searched in the whole path, not only the file name.
    if 'time_timefreq' in csv_path:
        domain = "T-TF"
    elif 'timefreq_' in csv_path:
        domain = "TF"
    elif 'time_' in csv_path:
        domain = "T"
    else:
        raise ValueError(f"Cannot determine the feature domain from csv_path {csv_path!r}.")

    name_parts = csv_path.split("/")[-1].split("_")
    channels = int(name_parts[0].split("chan")[0])

    # The last '_' field looks like '<window>s-<step>step.csv'.
    window_and_step = name_parts[-1].split("-")
    window_size = int(window_and_step[0].split("s")[0])
    step_size = window_and_step[1].split("step")[0]

    return domain, channels, window_size, step_size


def subj_dept(csv_path=None, verbose=False, results_csv="results/Results - Dept - Subject - CUSTOM - Hyper.csv", baseline=False, custom=False):
    '''
    Runs the subject-dependent evaluation for the four targets ('val', 'aro',
    'valaro', 'aroval') on one feature CSV and appends one row per target to
    the results CSV.
    Parameters
    ----------
    csv_path : str
        Path of the feature CSV. The domain, channel count, window size and
        step size are parsed from it. Required despite the None default.
    verbose : bool, optional
        If True, prints the mean accuracy of each target and their pairwise
        products. The default is False.
    results_csv : str, optional
        Existing CSV that is read, extended with the new rows and written back.
    baseline : bool, optional
        If True, '-Baseline' is added to the recorded model name. The default is False.
    custom : bool, optional
        If True, '-CUSTOM' is added to the recorded model name. The default is False.
    Returns
    -------
    None
    Raises
    ------
    ValueError
        If csv_path is None or its domain cannot be determined.
    '''
    if csv_path is None:
        raise ValueError("csv_path is required.")

    dependency = "Dependent"
    model_name = "SVM"
    domain, channels, window_size, step_size = _parse_feature_csv_path(csv_path)

    #----------------------------------------------------------------

    start_time = time.time()

    #get hyperparams for each case from independent
    val_params, aro_params, valaro_params, aroval_params = get_params_from_indept(dependency, model_name, domain, channels, window_size, step_size)

    df = pd.read_csv(csv_path)
    val, valstd, val_sub_list, val_param_subs = subj_dept_target(target="val", df=df, hyperparameters=val_params)
    aro, arostd, aro_sub_list, aro_param_subs = subj_dept_target(target="aro", df=df, hyperparameters=aro_params)
    valaro, valarostd, valaro_sub_list, valaro_param_subs = subj_dept_target(target="valaro", df=df, hyperparameters=valaro_params)
    aroval, arovalstd, aroval_sub_list, aroval_param_subs = subj_dept_target(target="aroval", df=df, hyperparameters=aroval_params)

    time_taken = round(time.time() - start_time, 2)

    if verbose:
        print(f"VAL: {val}")
        print(f"ARO: {aro}")
        print(f"VALARO: {valaro}")
        print(f"AROVAL: {aroval}")

        print(f"VAL x ARO: {val * aro}")
        print(f"VAL x AROVAL: {val * aroval}")
        print(f"ARO x VALARO: {aro * valaro}")

    results = pd.read_csv(results_csv)

    # The suffixes only affect the label written to the results file.
    if baseline: model_name += "-Baseline"
    if custom: model_name += "-CUSTOM"

    # Products of the mean accuracies (as fractions); the largest one is recorded.
    combo_dicts = {"VALxARO": val * aro, "VALxAROVAL": val * aroval, "AROxVALARO": aro * valaro}
    best_combination, best_acc = sorted(combo_dicts.items(), key=lambda item: item[1], reverse=True)[0]

    # Means and standard deviations are stored as percentages with two decimals.
    per_target = {
        "VAL": (round(val * 100, 2), round(valstd * 100, 2), val_sub_list),
        "ARO": (round(aro * 100, 2), round(arostd * 100, 2), aro_sub_list),
        "VALARO": (round(valaro * 100, 2), round(valarostd * 100, 2), valaro_sub_list),
        "AROVAL": (round(aroval * 100, 2), round(arovalstd * 100, 2), aroval_sub_list),
    }

    base_columns = ["Dependency", "Model Name", "Domain", "Channels", "Window Size", "Step Size",
                    "Hyperparams", "Time", "Target", "Mean", "Std", "Best Combination", "Best Accuracy"]
    param_columns = (list(val_param_subs.keys()) + list(aro_param_subs.keys())
                     + list(valaro_param_subs.keys()) + list(aroval_param_subs.keys()))

    for tar, (mean, std, subs) in per_target.items():
        sub_results = {f"Subject {position}": round(accuracy * 100, 2)
                       for position, accuracy in enumerate(subs, start=1)}
        sub_names = list(sub_results)

        # NOTE(review): the "Hyperparams" label is hard-coded even though the
        # hyperparameters come from get_params_from_indept - confirm this is intended.
        new_result = {"Dependency": dependency, "Model Name": model_name, "Domain": domain, "Channels": channels,
                      "Window Size": window_size, "Step Size": step_size, "Hyperparams": "C:200 on each",
                      "Time": time_taken, "Target": tar, "Mean": mean, "Std": std, "Best Combination": best_combination,
                      "Best Accuracy": best_acc}

        # Every row carries the per-subject parameter columns of all four targets.
        new_result = {**new_result, **sub_results, **val_param_subs, **aro_param_subs, **valaro_param_subs, **aroval_param_subs}

        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        results = pd.concat([results, pd.DataFrame([new_result])], ignore_index=True)

        # Keep only the known columns, in a fixed order.
        results = results[base_columns + sub_names + param_columns]

    results.to_csv(results_csv, index=False)
    print(f"Completed. - {dependency}, {domain}, {channels}, {window_size}-{step_size} in {time_taken}s")

In [44]:
sample_rate = 128
# Zero-padded subject identifiers '01' .. '32'.
subject_list = [f"{subject_number:02d}" for subject_number in range(1, 33)]

# One entry per experiment series:
# (channel indices, extra keyword arguments for feature_extraction, `custom` flag for subj_dept).
run_configs = [
    ([0, 1, 2, 3, 4], {"directory": 'DEAP_5chan_custom_preproc'}, True),
    (list(range(0, 32)), {}, False),
]

for channels, extraction_kwargs, custom in run_configs:
    # Only the time-frequency features are extracted in these runs.
    timefreq = True
    timedomain = False
    for window_in_sec in [1, 3, 5, 7, 9]:
        for baseline in [False, True]:
            # Baseline runs are limited to windows of at most 3 seconds.
            if baseline and window_in_sec > 3:
                continue
            window_size = window_in_sec * sample_rate
            step_size = window_size  # the step equals the window length

            start_time = time.time()
            data = feature_extraction(subjects=subject_list, channel=channels, window_size=window_size,
                                      step_size=step_size, timedomain=timedomain, timefreq=timefreq,
                                      baseline=baseline, **extraction_kwargs)
            print(f"Time taken to process dataset: {round(time.time()-start_time,2)}s.")
            subj_dept(csv_path=data['csv_path'], verbose=True, baseline=baseline, custom=custom)

datacustom/dwt/5chan_timefreq_1s-1.0step.csv
datacustom/dwt/5chan_timefreq_1s-1.0step.csv already exists.
Time taken to process dataset: 0.0s.
val
[Estimated time for all subs: ~949.44s]
val completed - 967.8207166194916s
aro
[Estimated time for all subs: ~870.4s]
aro completed - 1059.895129442215s
valaro
[Estimated time for all subs: ~983.04s]
valaro completed - 929.1666326522827s
aroval
[Estimated time for all subs: ~1008.64s]
aroval completed - 991.968715429306s
VAL: 0.7111161507464786
ARO: 0.7032362167946139
VALARO: 0.7433473305544105
AROVAL: 0.7393364599139851
VAL x ARO: 0.500082631552502
VAL x AROVAL: 0.5257540974805612
ARO x VALARO: 0.522748764503459
Completed. - Dependent, TF, 5, 1-1.0 in 3949.67s
datacustom/dwt_baseline/5chan_timefreq_1s-1.0step.csv
datacustom/dwt_baseline/5chan_timefreq_1s-1.0step.csv already exists.
Time taken to process dataset: 0.0s.
val
[Estimated time for all subs: ~875.84s]
val completed - 861.5162041187286s
aro
[Estimated time for all subs: ~921.6s]
ar

[Estimated time for all subs: ~520.0s]
valaro completed - 486.5367522239685s
aroval
[Estimated time for all subs: ~505.6s]
aroval completed - 484.8875389099121s
VAL: 0.7014914772727272
ARO: 0.7064630681818181
VALARO: 0.7292613636363636
AROVAL: 0.7428977272727273
VAL x ARO: 0.495577821337487
VAL x AROVAL: 0.521136424167097
ARO x VALARO: 0.515196220461002
Completed. - Dependent, TF, 32, 5-5.0 in 1942.37s
data/dwt/32chan_timefreq_7s-7.0step.csv
data/dwt/32chan_timefreq_7s-7.0step.csv already exists.
Time taken to process dataset: 0.0s.
val
[Estimated time for all subs: ~371.2s]
val completed - 357.19659090042114s
aro
[Estimated time for all subs: ~365.76s]
aro completed - 356.30566143989563s
valaro
[Estimated time for all subs: ~370.88s]
valaro completed - 357.2316801548004s
aroval
[Estimated time for all subs: ~368.64s]
aroval completed - 356.9796669483185s
VAL: 0.7023437499999999
ARO: 0.705859375
VALARO: 0.73193359375
AROVAL: 0.73447265625
VAL x ARO: 0.49575592041015615
VAL x AROVAL: 0.