In [1]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import preprocessing
from sklearn.svm import SVC, OneClassSVM
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.model_selection import StratifiedKFold, GridSearchCV, ParameterGrid
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA,TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from copy import deepcopy
import os

from sklearn.pipeline import Pipeline
%matplotlib inline 
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import matplotlib
from mpl_toolkits.mplot3d import Axes3D #, axes3d

import pywt
from scipy.stats import kstat,moment,kstatvar


In [2]:
def k_order_feats(data,order):
    ind = np.arange(0,data.shape[1]*(len(order)+2),(len(order)+2))
#     print(ind)
    k_order_feats_ = np.zeros((1,data.shape[1]*(len(order)+2)))
    for i in range(data.shape[1]): # for each prefeature
        kstat_feats = np.zeros((1,len(order)))
        kstatvar_feats = np.zeros((1,2))
        for j in range(len(order)): # for each order
            kstat_feats[:,j] = kstat(data[:,i],order[j])
            kstatvar_feats[:,0],kstatvar_feats[:,1] = kstatvar(data[:,i],1), kstatvar(data[:,i],2)
            kstats_feats = np.concatenate((kstat_feats, kstatvar_feats), axis = 1)
        k_order_feats_[:,ind[i]:ind[i]+6] = kstats_feats
#         print(range(ind[i], ind[i]+6,1))
    return k_order_feats_
    

In [3]:
# output will be calculated by a simple majority vote.
# ONLY THE LAST LABEL WILL BE KEPT 
def nodes_pred(X_test, nodes):
    Y_pred = [nodes[i].predict(X_test) for i in range(len(nodes))]
    Y_vote = np.zeros(X_test.shape[0])
    for i in range(X_test.shape[0]): # for every point in X_test 
        votes = np.zeros(2)
        for j in range(len(Y_pred)): # for every node 
            if (Y_pred[j][i]==1.):
                votes[1]+=1
            else:
                votes[0]+=1
            Y_vote[i] = np.argmax(votes)
    return Y_vote[-1]

In [4]:
datapath = 'tmp/features/1024_1/'
clfpath = 'tmp/'
forcesfile_sc = datapath + 'newprefeaturesfxyz_trans_1024_1_10_20000.npz'
newprefeat_sc = np.load(forcesfile_sc, encoding = 'latin1')['newprefeatf']
print (newprefeat_sc.shape,":",[fi.shape for fi in newprefeat_sc])

((49,), ':', [(2478, 4), (2478, 4), (2476, 4), (2464, 4), (1264, 4), (3889, 4), (2092, 4), (2019, 4), (3896, 4), (6116, 4), (6073, 4), (2447, 4), (2440, 4), (2440, 4), (2440, 4), (2440, 4), (2440, 4), (2440, 4), (2440, 4), (2440, 4), (2440, 4), (2440, 4), (2440, 4), (2440, 4), (2440, 4), (2440, 4), (2440, 4), (2440, 4), (39998, 4), (39974, 4), (39984, 4), (39832, 4), (39926, 4), (39876, 4), (23980, 4), (20980, 4), (20480, 4), (20480, 4), (23480, 4), (23980, 4), (23980, 4), (23980, 4), (23980, 4), (23980, 4), (23980, 4), (26980, 4), (27480, 4), (27480, 4), (27480, 4)])


In [5]:
# %%time
from_d = 28
window, shift = 1024,10
order = [1,2,3,4]
datasets = range(from_d,len(newprefeat_sc))
total_scales = np.arange(1,10)
features = []
cwtdatafile = datapath + 'cwt_162_features_dataset.npz'

if not os.path.isfile(cwtdatafile):    
    for d in datasets:
        print(d)
        rows,cols = newprefeat_sc[d].shape[0], newprefeat_sc[d][:,:3].shape[1]*len(total_scales)
        coef_tot = np.zeros((rows,cols))
        direction_ind = np.arange(0, cols,len(total_scales))
        for i in range(newprefeat_sc[d][:,:3].shape[1]): #for each direction
            coefs, freq = pywt.cwt(newprefeat_sc[d][:,i],total_scales,'gaus1')
            coef_tot[:,direction_ind[i]:direction_ind[i]+len(total_scales)] = np.transpose(coefs)
        range1 = range(0,coef_tot.shape[0]-window,shift)
        X_final = np.ones((1,162))
        y_final = []
        for k in range1:
            data = coef_tot[k:k+window,:]
            label = newprefeat_sc[d][k+window-1,3]
            stat_feats = k_order_feats(data,order)
            X_final = np.append(X_final,stat_feats,axis = 0)
            y_final.append(label)
        X_final = X_final[1:,:]
        label_final = np.array(y_final)
        label_final = label_final[:,np.newaxis]
        data_final = np.concatenate((X_final,label_final),axis = 1)
        features.append(data_final)
    np.savez(cwtdatafile, cwt_datasets=features)
else:
    features = np.load(cwtdatafile)['cwt_datasets']
    
print(len(features))
print([features[i].shape for i in range(len(features))])

21
[(3898, 163), (3895, 163), (3896, 163), (3881, 163), (3891, 163), (3886, 163), (2296, 163), (1996, 163), (1946, 163), (1946, 163), (2246, 163), (2296, 163), (2296, 163), (2296, 163), (2296, 163), (2296, 163), (2296, 163), (2596, 163), (2646, 163), (2646, 163), (2646, 163)]


In [12]:
%%time
# Fit the different nodes
# from_d = 0 if you're gonna be using it as a finished structure
# features = deepcopy(cwt_data)
cols = features[0].shape[1]-1
from_d = 0
from_r = [int(np.ceil(0.1*features[i].shape[0])) for i in range(from_d,len(features))]
to_r = [int(np.ceil(0.9*features[i].shape[0])) for i in range(from_d,len(features))]

X_train = [features[i+from_d][from_r[i]:to_r[i],:cols] for i in range(len(from_r))]
Y_train = [features[i+from_d][from_r[i]:to_r[i],cols] for i in range(len(from_r))]

# clf = [SVC(kernel = 'rbf', C = 100) for i in range(len(X_train))]
print len(X_train)
clf = [KNeighborsClassifier(n_neighbors=7) for i in range(len(X_train))]
nodes = [clf[i].fit(X_train[i],Y_train[i]) for i in range(len(X_train))]

21
CPU times: user 152 ms, sys: 0 ns, total: 152 ms
Wall time: 149 ms


In [13]:
clfdatafile = clfpath + 'cwt_162_classifiers.npz'
if not os.path.isfile(clfdatafile):
    np.savez(clfdatafile, nodes = nodes)
else:
    nodes = np.load(clfdatafile)['nodes']

In [14]:
# --------- Example
# simulating the online procedure with input of size 1024
X_test = features[0][:,:cols]
Y_test = features[0][:,cols]
wind = 128
test_batches = np.arange(0,X_test.shape[0],wind)
x_batch = [X_test[batch:batch+wind,:] for batch in test_batches]
y_batch = [Y_test[batch:batch+wind] for batch in test_batches]
average_score = np.zeros(len(test_batches))

for batch_id,batch in enumerate(test_batches):
    Y_vot = nodes_pred(X_test = x_batch[batch_id],nodes = nodes)
    average_score[batch_id] = np.abs(Y_vot-y_batch[batch_id][-1]) 
#     print('Batch no:%d out of %d ' %(batch_id,len(test_batches)-1), end = ' ')
    print '\r Batch no:%d out of %d ' %(batch_id,len(test_batches)-1),
print('\n Average error for %d batches = %f' %(len(test_batches),np.mean(average_score)))

 Batch no:30 out of 30                          
 Average error for 31 batches = 0.064516


In [17]:
# pipelined nodes, using StandardScaler
# %%time

# features = deepcopy(cwt_data)
cols = features[0].shape[1]-1
from_d = 0
from_r = [int(np.ceil(0.1*features[i].shape[0])) for i in range(from_d,len(features))]
to_r = [int(np.ceil(0.9*features[i].shape[0])) for i in range(from_d,len(features))]

X_train = [features[i+from_d][from_r[i]:to_r[i],:cols] for i in range(len(from_r))]
Y_train = [features[i+from_d][from_r[i]:to_r[i],cols] for i in range(len(from_r))]
 
# clf = [SVC(kernel = 'rbf', C = 100) for i in range(len(X_train))]
clf = [KNeighborsClassifier(n_neighbors=7) for i in range(len(X_train))]
sca = [StandardScaler() for i in range(len(X_train))]
pipe = [Pipeline([('scaler', sca[i]), ('clf', clf[i])]) for i in range(len(X_train))]
piped_nodes = [clf[i].fit(X_train[i],Y_train[i]) for i in range(len(X_train))]


In [18]:
pipedatafile = clfpath + 'cwt_162_piped_classifiers.npz'
if not os.path.isfile(pipedatafile):
    np.savez(pipedatafile, nodes = piped_nodes)
else:
    nodes = np.load(pipedatafile)['nodes']

In [19]:
# --------- Example
# simulating the online procedure with input of size 1024
X_test = features[0][:,:cols]
Y_test = features[0][:,cols]
wind = 128
test_batches = np.arange(0,X_test.shape[0],wind)
x_batch = [X_test[batch:batch+wind,:] for batch in test_batches]
y_batch = [Y_test[batch:batch+wind] for batch in test_batches]
average_score = np.zeros(len(test_batches))

for batch_id,batch in enumerate(test_batches):
    Y_vot = nodes_pred(X_test = x_batch[batch_id],nodes = nodes)
    average_score[batch_id] = np.abs(Y_vot-y_batch[batch_id][-1]) 
#     print('Batch no:%d out of %d ' %(batch_id,len(test_batches)-1), end = ' ')
    print '\r Batch no:%d out of %d ' %(batch_id,len(test_batches)-1),
print('\n Average error for %d batches = %f' %(len(test_batches),np.mean(average_score)))

 Batch no:30 out of 30                          
 Average error for 31 batches = 0.064516
