# MODEL TRAINING

## Imports

In [38]:
import pywt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob

from math import sqrt,log10
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import pickle
import joblib

## Denoising

In [2]:
def madev(d, axis=None):
    """ Median absolute deviation of a signal """
#     return np.mean(np.absolute(d - np.mean(d, axis)), axis)
    return np.median(np.absolute(d))

def wavelet_denoising(x):
    c = pywt.wavedec(x,"sym18", mode="per",level = 4)
    
    sigma = (1/0.6745) * madev(c[-1])
    
    univ_thresh = sigma * np.sqrt(2 * np.log(len(x)))
    
    c[1:] = (pywt.threshold(i, value=univ_thresh, mode='hard') for i in c[1:])
    
    return pywt.waverec(c, "sym18", mode='per')

## Data PreProcessing 

In [3]:
train_fnames =  glob('data/train/subj*_data.csv')
test_fnames =  glob('data/test/subj*_data.csv')

In [4]:
def preprocess_data(fname):
    datax = pd.read_csv(fname)
    datax.drop(['id'], axis = 1, inplace=True)
    cols = datax.columns

    datax = pd.DataFrame(wavelet_denoising(datax))
    datax.columns = cols
    # get filename of labels from data filename
    events = fname.replace('_data','_events') 
    datay = pd.read_csv(events)
#     print(len(datax.columns))

    datay.drop(['id'], axis = 1, inplace=True)
    
    datax.drop([x for x in datax.columns if 'C' not in x], axis = 1, inplace=True)

    for col in datay.columns:
        datax[f'{col}_output'] = datay[col]
    
    return datax
  
# preprocess_data(fnames[0])

In [5]:
train_data = pd.concat([preprocess_data(fname) for fname in train_fnames])

# train_Dic = {}

# for i in range(6):
#     train_Dic[f'train_data{i+1}'] = pd.concat([train_data.iloc[:,:11] , train_data.iloc[:,11+i]],axis=1, join='inner')



In [6]:
# train_Dic['train_data6']

In [7]:
test_data = pd.concat([preprocess_data(fname) for fname in test_fnames])

# test_Dic = {}

# for i in range(6):
#     test_Dic[f'test_data{i+1}'] = pd.concat([test_data.iloc[:,:11] , test_data.iloc[:,11+i]],axis=1, join='inner')



In [8]:
# test_Dic['test_data3']

In [9]:
test_data.iloc[1,:11]


FC5   -376.130104
FC1   -383.550627
FC2   -391.048872
FC6   -398.338283
C3    -411.192561
Cz    -416.260565
C4    -420.144883
CP5   -423.430891
CP1   -421.601333
CP2   -418.387447
CP6   -413.916256
Name: 1, dtype: float64

In [22]:
# logR1 = LogisticRegression()

# logR1.fit(train_data.iloc[:,:11] , train_data.iloc[:,11])

In [23]:
# list(logR1.predict(test_data.iloc[:,:11])).count(1)

In [24]:
# logR1.score(test_data.iloc[:10000,:11] , test_data.iloc[:10000,11])

In [25]:
# model = svm.SVC()
# model.fit(train_data.iloc[:,:11] , train_data.iloc[:,11])

In [26]:
# model.predict(test_data.iloc[1000:10000,:11])

In [27]:
from sklearn.tree import DecisionTreeClassifier

In [16]:
clf1 = DecisionTreeClassifier()
clf1.fit(train_data.iloc[:,:11] , train_data.iloc[:,11])
list(clf1.predict(test_data.iloc[:,:11])).count(1)

14858

In [17]:
clf2 = DecisionTreeClassifier()
clf2.fit(train_data.iloc[:,:11] , train_data.iloc[:,12])
list(clf2.predict(test_data.iloc[:,:11])).count(1)

16078

In [18]:
clf3 = DecisionTreeClassifier()
clf3.fit(train_data.iloc[:,:11] , train_data.iloc[:,13])
list(clf3.predict(test_data.iloc[:,:11])).count(1)

16029

In [19]:
clf4 = DecisionTreeClassifier()
clf4.fit(train_data.iloc[:,:11] , train_data.iloc[:,14])
list(clf4.predict(test_data.iloc[:,:11])).count(1)

16587

In [20]:
clf5 = DecisionTreeClassifier()
clf5.fit(train_data.iloc[:,:11] , train_data.iloc[:,15])
list(clf5.predict(test_data.iloc[:,:11])).count(1)

18463

In [21]:
clf6 = DecisionTreeClassifier()
clf6.fit(train_data.iloc[:,:11] , train_data.iloc[:,16])
list(clf6.predict(test_data.iloc[:,:11])).count(1)

17922

In [29]:
ab1 = list(clf1.predict(test_data.iloc[:,:11]))
print((ab1.count(1)/len(ab1))*100)
ab2 = list(clf2.predict(test_data.iloc[:,:11]))
print((ab1.count(1)/len(ab2))*100)
ab3 = list(clf3.predict(test_data.iloc[:,:11]))
print((ab1.count(1)/len(ab3))*100)
ab4 = list(clf4.predict(test_data.iloc[:,:11]))
print((ab1.count(1)/len(ab4))*100)
ab5 = list(clf5.predict(test_data.iloc[:,:11]))
print((ab1.count(1)/len(ab5))*100)
ab6 = list(clf6.predict(test_data.iloc[:,:11]))
print((ab1.count(1)/len(ab6))*100)

2.332825673957074
2.332825673957074
2.332825673957074
2.332825673957074
2.332825673957074
2.332825673957074


In [36]:
dbfile1 = open('pickle1.pkl', 'wb')
pickle.dump(clf1, dbfile1)                     
dbfile1.close()
dbfile1 = open('pickle2.pkl', 'wb')
pickle.dump(clf2, dbfile1)                     
dbfile1.close()
dbfile1 = open('pickle3.pkl', 'wb')
pickle.dump(clf3, dbfile1)                     
dbfile1.close()
dbfile1 = open('pickle4.pkl', 'wb')
pickle.dump(clf4, dbfile1)                     
dbfile1.close()
dbfile1 = open('pickle5.pkl', 'wb')
pickle.dump(clf5, dbfile1)                     
dbfile1.close()
dbfile1 = open('pickle6.pkl', 'wb')
pickle.dump(clf6, dbfile1)                     
dbfile1.close()

In [39]:
joblib.dump(clf1, 'pickle1.pkl')
joblib.dump(clf2, 'pickle2.pkl')
joblib.dump(clf3, 'pickle3.pkl')
joblib.dump(clf4, 'pickle4.pkl')
joblib.dump(clf5, 'pickle5.pkl')
joblib.dump(clf6, 'pickle6.pkl')

['pickle6.pkl']