In [1]:
import scipy.io as sio
import glob
import numpy
import time

In [2]:
def load_patient_train_data(paths):
    """Load every training file matched by a glob pattern.

    Matching files are visited in the order produced by ``numericalSort``.
    The label of a file is the integer value of the character at
    ``path[-5]``, i.e. the last character before a 4-character suffix such
    as ``.mat``.

    Parameters
    ----------
    paths : str
        Glob pattern selecting the files to read.

    Returns
    -------
    tuple
        ``(X, Y)`` where ``X`` is a list with one ``scipy.io.loadmat`` result
        per file and ``Y`` is a numpy array holding the matching labels.
    """
    print('...loading train data')
    start = time.time()

    X, labels = [], []
    for mat_path in sorted(glob.glob(paths), key=numericalSort):
        X.append(sio.loadmat(mat_path))
        labels.append(int(mat_path[-5]))
    Y = numpy.array(labels)

    # The timing is reported after the label array has been built.
    print('time elapsed: %s sec' %(time.time() - start))

    return(X, Y)

In [3]:
def load_patient_test_data(paths):
    """Load every test file matched by a glob pattern.

    Matching files are visited in the order produced by ``numericalSort``.
    Test files carry no label, so the bare file name of each one is
    returned instead.

    Parameters
    ----------
    paths : str
        Glob pattern selecting the files to read.

    Returns
    -------
    tuple
        ``(X, file_array)`` where ``X`` is a list with one
        ``scipy.io.loadmat`` result per file and ``file_array`` is the list
        of file names (directory part removed) in the same order.
    """
    # `os` is not imported by the notebook's import cell, so the original
    # raised NameError here; import it locally to keep this cell standalone.
    import os

    X = []
    file_array = []

    print('...loading test data')
    start = time.time()

    for path in sorted(glob.glob(paths), key=numericalSort):
        X.append(sio.loadmat(path))
        # basename(path) is the second element of os.path.split(path)
        file_array.append(os.path.basename(path))

    print('time elapsed: %s sec' %(time.time() - start))

    return(X, file_array)

In [4]:
def get_channel1_data(X):
    """Stack the first data column of every recording into one 2-D array.

    Parameters
    ----------
    X : sequence
        Recordings; each item must support
        ``item['dataStruct']['data'][0][0]`` and yield a 2-D array whose
        column 0 is extracted.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_rows, len(X))``; column ``i`` holds column
        0 of recording ``i``. ``n_rows`` is taken from the first recording
        instead of being fixed at 240000. For an empty ``X`` the previous
        result shape ``(240000, 0)`` is kept.

    Raises
    ------
    ValueError
        If a recording has a different number of rows than the first one
        (numpy cannot broadcast the column into the result).
    """
    if len(X) == 0:
        # Preserve what the hard-coded version returned for no recordings.
        return numpy.zeros([240000, 0])

    n_rows = X[0]['dataStruct']['data'][0][0].shape[0]
    data_channel1 = numpy.zeros([n_rows, len(X)])

    # enumerate works on both Python 2 and 3 (the original used xrange).
    for i, recording in enumerate(X):
        data_channel1[:, i] = recording['dataStruct']['data'][0][0][:, 0]

    return(data_channel1)

In [5]:
# Sort key for file names: the name is cut at every run of digits and each
# run is converted to an int, so that e.g. "f2" sorts before "f10".

import re
numbers = re.compile(r'(\d+)')
def numericalSort(value):
    """Return ``value`` split into alternating text and integer pieces.

    Because the pattern has one capturing group, ``split`` puts the text
    between digit runs at even positions and the digit runs themselves at
    odd positions; only the odd positions are converted to ``int``.
    """
    pieces = numbers.split(value)
    return [int(piece) if position % 2 else piece
            for position, piece in enumerate(pieces)]

In [6]:
# Read every .mat file of the train_1 folder together with its label
train1_pattern = 'F:/Kaggle/Seizure Prediction/train_1/*.mat'
X_train1, Y_train1 = load_patient_train_data(train1_pattern)

...loading train data
time elapsed: 117.410000086 sec


In [7]:
# Number of labels, i.e. one per training file that was loaded
len(Y_train1)

1300

In [9]:
# Shape of the data array held by the first recording
X_train1[0]['dataStruct']['data'][0][0].shape

(240000L, 16L)

In [10]:
# Mean of the raw values for every (recording, channel) pair
samples = len(Y_train1)
channels = 16
mean_train1 = numpy.zeros([samples, channels])

for row in xrange(samples):
    # Look the recording's data array up once, then walk its columns
    recording = X_train1[row]['dataStruct']['data'][0][0]
    for col in xrange(channels):
        mean_train1[row, col] = numpy.mean(recording[:, col])

In [11]:
# Allocated as (samples, channels): one row per recording, one column per channel
mean_train1.shape

(1300L, 16L)

In [13]:
# Spot-check: mean of column 0 of the first recording
numpy.mean(X_train1[0]['dataStruct']['data'][0][0][:, 0])

8.4431969e-07

In [14]:
# Plain sum of the same column, used to cross-check the mean by hand
numpy.sum(X_train1[0]['dataStruct']['data'][0][0][:, 0])

0.20263672

In [16]:
# Manual mean: the sum divided by 240000 (the row count reported by .shape for this recording)
numpy.sum(X_train1[0]['dataStruct']['data'][0][0][:, 0])/float(240000)

8.4431966145833331e-07

In [17]:
# Column 0 of mean_train1 for the first ten recordings
mean_train1[0:10,0]

array([  8.44319686e-07,  -8.06681328e-07,  -4.17073579e-07,
        -3.82486974e-07,   5.47281900e-07,  -1.10270184e-06,
        -1.11592612e-06,   1.23697919e-06,  -8.13802075e-08,
         8.91113302e-07])

In [18]:
# The raw values most likely oscillate around 0, so average abs(X) instead.
# This overwrites mean_train1 with the mean absolute value per channel.
samples = len(Y_train1)
channels = 16
mean_train1 = numpy.zeros([samples, channels])

for row in xrange(samples):
    recording = X_train1[row]['dataStruct']['data'][0][0]
    for col in xrange(channels):
        magnitudes = numpy.absolute(recording[:, col])
        mean_train1[row, col] = numpy.mean(magnitudes)

In [19]:
# Column 0 of the recomputed mean_train1 for the first ten recordings
mean_train1[0:10,0]

array([ 25.07297707,  29.28290176,  20.03593636,  27.96895599,
        25.06256676,  23.67510223,  26.57349777,  25.45344162,
        20.55765343,  24.8100853 ])

In [20]:
# Variance of the values for every (recording, channel) pair
samples = len(Y_train1)
channels = 16
variance_train1 = numpy.zeros([samples, channels])

for row in xrange(samples):
    recording = X_train1[row]['dataStruct']['data'][0][0]
    for col in xrange(channels):
        variance_train1[row, col] = numpy.var(recording[:, col])

In [21]:
# Column 0 of variance_train1 for the first ten recordings
variance_train1[0:10,0]

array([ 1102.56604004,  1469.39245605,   670.84771729,  1375.37243652,
        1094.80200195,  1006.19824219,  1212.58251953,  1138.57958984,
         982.41186523,  1067.45507812])

In [24]:
import matplotlib.pyplot as plt

In [27]:
# Column 0 only: mean_train1 on x, variance_train1 on y, one marker per recording
plt.plot(mean_train1[:, 0], variance_train1[:, 0], "o")
plt.show()

In [28]:
# Label array returned by load_patient_train_data
Y_train1

array([0, 1, 0, ..., 0, 0, 0])

In [34]:
# Boolean row masks for the two label values
is_positive = (Y_train1 == 1)
is_negative = (Y_train1 == 0)

# Split both feature tables into label-1 rows and label-0 rows
pos_mean_train1, neg_mean_train1 = mean_train1[is_positive], mean_train1[is_negative]
pos_variance_train1, neg_variance_train1 = variance_train1[is_positive], variance_train1[is_negative]

In [31]:
# Rows are the recordings whose label equals 1
pos_mean_train1.shape

(149L, 16L)

In [32]:
# Sum of the labels; with 0/1 labels this is the number of label-1 recordings
Y_train1.sum()

149

In [36]:
# Column 0 only, mean on x and variance on y: "o" = label 1, "x" = label 0
plt.plot(pos_mean_train1[:, 0], pos_variance_train1[:, 0], "o")
plt.plot(neg_mean_train1[:, 0], neg_variance_train1[:, 0], "x")
plt.show()

In [40]:
from scipy import stats

In [41]:
# Skewness of column 0 of the first recording.
# Use the `stats` name bound by `from scipy import stats`; the bare name
# `scipy` is never bound in this notebook (`import scipy.io as sio` only
# binds `sio`), so `scipy.stats.skew` raises NameError on a fresh kernel.
a = stats.skew(X_train1[0]['dataStruct']['data'][0][0][:, 0])

In [42]:
# Show the skewness value stored in `a`
a

-0.2616947293281555

In [44]:
import time

In [46]:
# Scratch check of the elapsed-time idiom: two back-to-back clock reads.
# Note that this rebinds the global `start`.
start = time.time()
end = time.time() - start
end

0.0

In [43]:
# compute skewness on dataset: one value per (recording, channel) pair
samples = len(Y_train1)
channels = 16
skew_train1 = numpy.zeros([samples, channels])

print('calculating ...')
start = time.time()


for sample in xrange(samples):
    # Data array of this recording; its columns are indexed as channels below
    recording = X_train1[sample]['dataStruct']['data'][0][0]
    for channel in xrange(channels):
        # `stats` comes from `from scipy import stats`; the bare name `scipy`
        # is never bound in this notebook, so `scipy.stats.skew` would raise
        # NameError on a fresh kernel.
        skew_train1[sample, channel] = stats.skew(recording[:, channel])


print('time elapsed: %s sec' %(time.time() - start))

In [47]:
# Allocated as (samples, channels): one row per recording, one column per channel
skew_train1.shape

(1300L, 16L)

In [48]:
# Split the skewness table into label-1 rows and label-0 rows
is_positive = (Y_train1 == 1)
is_negative = (Y_train1 == 0)
pos_skew_train1, neg_skew_train1 = skew_train1[is_positive], skew_train1[is_negative]

In [51]:
# Column 0 only, skewness on x and variance on y: "o" = label 1, "x" = label 0
plt.plot(pos_skew_train1[:, 0], pos_variance_train1[:, 0], "o")
plt.plot(neg_skew_train1[:, 0], neg_variance_train1[:, 0], "x")
# Restrict the x axis to [-5, 5]
plt.xlim([-5,5])
plt.show()

In [52]:
# compute kurtosis on dataset: one value per (recording, channel) pair
samples = len(Y_train1)
channels = 16
kurtosis_train1 = numpy.zeros([samples, channels])

print('calculating ...')
start = time.time()


for sample in xrange(samples):
    # Data array of this recording; its columns are indexed as channels below
    recording = X_train1[sample]['dataStruct']['data'][0][0]
    for channel in xrange(channels):
        # `stats` comes from `from scipy import stats`; the bare name `scipy`
        # is never bound in this notebook, so `scipy.stats.kurtosis` would
        # raise NameError on a fresh kernel.
        kurtosis_train1[sample, channel] = stats.kurtosis(recording[:, channel])


print('time elapsed: %s sec' %(time.time() - start))

calculating ...
time elapsed: 22.5439999104 sec


In [53]:
# Split the kurtosis table into label-1 rows and label-0 rows
is_positive = (Y_train1 == 1)
is_negative = (Y_train1 == 0)
pos_kurtosis_train1, neg_kurtosis_train1 = kurtosis_train1[is_positive], kurtosis_train1[is_negative]

In [55]:
# Column 0 only, kurtosis on x and variance on y: "o" = label 1, "x" = label 0
plt.plot(pos_kurtosis_train1[:, 0], pos_variance_train1[:, 0], "o")
plt.plot(neg_kurtosis_train1[:, 0], neg_variance_train1[:, 0], "x")
# Restrict the x axis to [-100, 100]
plt.xlim([-100,100])
plt.show()

In [57]:
import pyeeg

ImportError: No module named pyeeg