In [28]:
import multiprocessing as mp
import pandas as pd
import numpy as n
import matplotlib.pyplot as plt
import glob
from scipy.signal import butter, lfilter, freqz
from sklearn.decomposition import FastICA, PCA
from sklearn.lda import LDA
from scipy.stats import kurtosis, skew, stats
import pickle
%matplotlib inline

In [169]:
def preprocess(dfile):
    data_tbl = pd.read_csv(dfile)
    fdata= data_tbl.copy()
    ids= fdata['id'].values
    #get rid of pesky subject id that is repeated for N time points
    fdata.drop('id',axis=1,inplace=True) 
    #low pass filter
    fs = 500 #Hz
    lowcut = 2 #Hz
    highcut = 8 #Hz
    for k in fdata.keys():
        fdata[k] = butter_bandpass_filter(fdata[k], lowcut, highcut, fs, order=4)
    return (fdata,ids)    

#low band pass filter all channels
def butter_bandpass(lowcut, highcut, fs, order=5):
    nyq = .5 * fs
    low = lowcut/nyq
    high = highcut/nyq
    b,a = butter(order, [low,high], btype='band')
    return b, a

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    b,a = butter_bandpass(lowcut, highcut, fs, order=order)
    y=lfilter(b, a, data)
    return y

def get_features(bucket):
# feat[extra] = take std dev over some N pts > 30 then compute delta std / delta t
#     feat[extra]= sum_channels_over_all_series_for_same_event_type
#power spectrum of noise, first 2 dominant frequencies? -->need sample more than 50 time points
    feat=n.zeros(10)
    feat[0]= n.median(bucket)
    feat[1]= bucket.mean()
    feat[2]= bucket.min()
    feat[3]= bucket.max()
    feat[4]= n.std(bucket)
    feat[5]= skew(bucket)
    feat[6]= kurtosis(bucket)
    slope, intercept, r_value, p_value, std_err = \
        stats.linregress(n.arange(len(bucket)),bucket)
    feat[7]= slope
    feat[8]= r_value
    feat[9]= n.absolute(bucket).sum()
    return feat

def real_time_features(dfile):
    #get data
    (data_tbl,ids)= preprocess(dfile)
    n_points=50
    n_features= 10
    vec_feat= n.zeros((n_features,data_tbl.shape[0]))-1
    #get features
#     if tend == -1: tend= data_tbl.values.copy().shape[0]
    for t in range(50,data_tbl.values.copy().shape[0]):
        if t == 100 or t == 1000 or t == 10000 or t == 100000: print 't= ',t   
        vec_feat[:,t]= get_features( n.mean(data_tbl.values[t-n_points:t,:],axis=1) ) #50 time pts
    return (vec_feat,ids)

In [171]:
datafiles= glob.glob('../test/*_data.csv')
for dfile in datafiles:
    (feat,ids)= real_time_features(dfile)
    fname= dfile[:-4]+"_features.pickle"
    f= open(fname,"w")
    pickle.dump((feat,ids),f)
    f.close()


t=  100
t=  1000
t=  10000
t=  100000


In [180]:
dfile[:-4]+"_features.pickle",ids.shape,feat.shape

('../test/subj10_series10_data_features.pickle', (128906,), (10, 128906))

In [136]:
f=open('test.csv','a')
f.write('id,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17\n')
line='%s'
for i in range(n_features): line=line+ ',%.2f'
line=line+'\n'
a=feat[:,60]
print a.shape,a[0],line,ids[0],type(ids[0])
f.write(line % (ids[0],a[0],a[1],a[2],a[3],a[4],a[5],\
               a[6],a[7],a[8],a[9],a[10],a[11],\
               a[12],a[13],a[14],a[15],a[16]))
f.close()
# n.savetxt(f,range(10),delimiter=',')
# f.close()

(17,) -1.0 %s,%.2f,%.2f,%.2f,%.2f,%.2f,%.2f,%.2f,%.2f,%.2f,%.2f,%.2f,%.2f,%.2f,%.2f,%.2f,%.2f,%.2f
subj10_series10_0 <type 'str'>


In [137]:
mydata= pd.read_csv('test.csv')
# np.savetxt( fname.replace('csv', 'feats.csv'), allfeatures, delimiter=',', header=','.join(F.feature_names) )

In [139]:
# mydata.values[:,0]-feat[:,50]
mydata

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17
0,subj10_series10_0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,subj10_series10_0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,subj10_series10_0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,subj10_series10_0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [38]:
pool = mp.Pool(4)
pool.map( real_time_features,dfile)

ValueError: No columns to parse from file

In [22]:
feat[0,50:60]

array([ 160.66605994,  162.35086644,  163.70384041,  165.8717153 ,
        169.05707906,  172.64400541,  176.01915347,  176.76141942,
        175.01728959,  174.89888739])