<h1>Classification using scikit-learn</h1>

<h3> SVM - Simple example</h3>

In [406]:
# http://scikit-learn.org/stable/modules/svm.html

from sklearn import svm

In [407]:
# Toy data set: four 2-D points on the diagonal, each one its own class.
X = [
    [0, 0],
    [1, 1],
    [2, 2],
    [3, 3],
]
Y = [0, 1, 2, 3]

# Build the classifier with default parameters and fit it in one step;
# the trailing expression displays the fitted estimator.
clf = svm.SVC().fit(X, Y)
clf

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [408]:
# One decision value is produced per pair of classes, so the 4 classes
# fitted above give 4 * 3 / 2 = 6 columns for the single query sample.
dec = clf.decision_function([[1, 1]])
dec.shape[-1]

6

In [409]:
# predict() expects a 2-D array of shape (n_samples, n_features), just like
# the decision_function call above, so the single sample is wrapped in an
# outer list instead of being passed as a flat 1-D vector.
print(clf.predict([[1.0, 1.0]]))

[1]


<h3>SVM test with features </h3>

In [410]:
import numpy as np
import pandas as pd
# Turn off the HTML table repr so DataFrames print as plain text.
# The full option name is used because abbreviated keys are matched as
# patterns and 'html' alone is ambiguous once several options contain it.
pd.set_option('display.notebook_repr_html', False)
from os import path

In [411]:
# Locations of the feature table and of the label file for subject P03.
P03_FEATURES = path.relpath('data/FEATURES_P03.csv')
LABEL = path.relpath('data/P03_DATA_WINDOW/P03_LABEL_L.csv')

# Plain single-character delimiters. The previous '\,' and '\ ' were invalid
# escape sequences that pandas interpreted as regular expressions, which
# forces the slower Python parsing engine for no benefit.
df_features = pd.read_csv(P03_FEATURES, sep=',')
# The label file has no header row.
df_label = pd.read_csv(LABEL, header=None, sep=' ')

# Get equal lengths of dataframes: trim BOTH frames to the shorter one so the
# rows stay aligned regardless of which file happens to be longer.
n_rows = min(len(df_features), len(df_label))
df_features = df_features[:n_rows]
df_label = df_label[:n_rows]

In [412]:
from sklearn import cross_validation
from sklearn import svm

<h3> Divide data set into training and testing </h3>

In [413]:
# Hold out 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    df_features,
    df_label,
    test_size=0.2,
    random_state=0,
)

In [414]:
# Show the shape of every partition on one line, separated by spaces.
print(' '.join(str(part.shape) for part in (X_train, X_test, Y_train, Y_test)))


(4968, 32) (1242, 32) (4968, 1) (1242, 1)


<h4> Train and test different SVM </h4>

In [415]:
# Fit a linear-kernel SVC on the training rows (labels are column 0 of the
# label frame), then report its score on the held-out rows.
clf = svm.SVC(kernel='linear', C=1.0)
clf.fit(X_train, Y_train[0].values)
print('SVM - svc %s' % clf.score(X_test, Y_test[0].values))

SVM - svc 0.789855072464


<h2> Feature selection </h2>

<h3>ExtraTreesClassifier </h3>

In [416]:
from sklearn.ensemble import ExtraTreesClassifier

# Feature selection with a randomized tree ensemble.
X = df_features
Y = df_label
print(X.shape)

# Seed the ensemble: it is randomized, so without a fixed random_state the
# importances (and therefore the columns that survive) differ on every run.
# The value matches the seed already used for the train/test splits.
clf = ExtraTreesClassifier(random_state=0)
clf.fit(X, Y[0].values)

# Reduce X to the columns the fitted ensemble keeps (the recorded, unseeded
# run kept 9 of 32).
# NOTE(review): estimator-level transform() belongs to the older scikit-learn
# API this notebook is written against; confirm before upgrading the library.
X_new = clf.transform(X)
print(clf.feature_importances_)
X_new.shape

(6210, 32)
[ 0.01707725  0.02215986  0.02252281  0.0686356   0.05355639  0.02128536
  0.0125413   0.01111858  0.01005244  0.07092417  0.10463642  0.02590498
  0.01550018  0.01280134  0.01131701  0.06153016  0.05644013  0.02749644
  0.02497013  0.02118882  0.01480827  0.05509374  0.0643969   0.03089803
  0.01117129  0.00954307  0.01607083  0.0305417   0.03086641  0.02518846
  0.00849849  0.03126346]


(6210, 9)

In [417]:
# Same 80/20 split and seed as before, this time on the tree-selected columns.
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    X_new,
    df_label,
    test_size=0.2,
    random_state=0,
)

# Retrain the linear SVC on the reduced feature set and report its test score.
clf = svm.SVC(kernel='linear', C=1.0)
clf.fit(X_train, Y_train[0].values)
print('SVM - svc %s' % clf.score(X_test, Y_test[0].values))

SVM - svc 0.753623188406


<h3>Recursive Feature Elimination </h3>

In [418]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Create a base classifier used to evaluate a subset of attributes.
model = LogisticRegression()
# Create the RFE model and select 5 attributes. The count is passed by
# keyword so the intent is explicit and the call does not depend on
# positional-argument order.
rfe = RFE(model, n_features_to_select=5)
rfe = rfe.fit(df_features, df_label[0].values)
# Summarize the selection of the attributes: support_ is a boolean mask over
# the feature columns, ranking_ assigns rank 1 to every selected column.
print(rfe.support_)
print(rfe.ranking_)

[False False False False False False False False False  True False False
 False False False  True  True False False False False False False False
  True False False False False  True False False]
[10  9 24 13 15 25 22 16 21  1  2  3 26 23 18  1  1  4 17 20 11  6 12 19  1
  7 14  5  8  1 28 27]


In [419]:
# Extract the new features. rfe.support_ is a boolean mask over the columns,
# so it is applied directly with .loc; indexing the frame with a list of
# integer positions only works through a label-lookup fallback that breaks
# when the column labels are strings.
X_new = df_features.loc[:, rfe.support_]

# Same 80/20 split and seed as the earlier experiments.
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    X_new, df_label, test_size=0.2, random_state=0)

# Train the linear SVC on the RFE-selected columns and report its test score.
clf = svm.SVC(kernel='linear', C=1.0).fit(X_train, Y_train[0].values)
print('SVM - svc %s' % clf.score(X_test, Y_test[0].values))

SVM - svc 0.713365539452


<h4>Test on one activity - Walking</h4>

In [420]:
# Labels that count as "walking".
activity = [6, 7, 8, 10, 14]

# Build a binary target: 1 for walking, 0 for everything else.
# A copy is taken so the original multi-class labels in df_label survive;
# plain assignment would alias the frame and overwrite them. A single isin()
# mask also avoids relabelling in a loop, where a freshly written 0/1 could
# collide with a label value that has not been processed yet.
# Unlike the loop it replaces, labels outside 1-14 also become 0.
df_walking_label = df_label.copy()
df_walking_label[0] = df_label[0].isin(activity).astype(int)

# Rank the features again, now against the binary target, and keep 5.
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=5)
rfe = rfe.fit(df_features, df_walking_label[0].values)

# Extract the new features using the boolean column mask from RFE.
X_new = df_features.loc[:, rfe.support_]
print(X_new.head(1))

# Same 80/20 split and seed as the earlier experiments.
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    X_new, df_walking_label, test_size=0.2, random_state=0)

# Train the linear SVC on the selected columns and report its test score.
clf = svm.SVC(kernel='linear', C=1.0).fit(X_train, Y_train[0].values)
print('SVM - svc %s' % clf.score(X_test, Y_test[0].values))

   min_chest_x  min_chest_z  max_chest_y  std_chest_y  std_chest_z
0       -0.648       -0.408        1.276       0.1287     0.148089
SVM - svc 0.834943639291
