In [1]:
from sklearn.svm import SVC
import pandas as pd              
import numpy as np  
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_validate, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.preprocessing import normalize, scale
from pprint import pprint

In [2]:
raw_data = pd.read_csv('final_data.csv')
raw_data.head()

Unnamed: 0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,...,t493,t494,t495,t496,t497,t498,t499,t500,t501,y
0,0.019336,0.0,0.0,0.0,0.003223,0.0,0.0,0.0,0.0,0.0,...,0.029004,0.009668,0.012891,0.0,0.0,0.0,0.003223,0.003223,0.0,0
1,0.0,0.0,0.012891,0.0,0.016113,0.0,0.006445,0.0,0.003223,0.022559,...,0.0,0.0,0.0,0.009668,0.0,0.0,0.0,0.009668,0.0,0
2,0.0,0.009668,0.0,0.0,0.006445,0.012891,0.0,0.0,0.029004,0.025781,...,0.006445,0.003223,0.012891,0.0,0.0,0.0,0.0,0.003223,0.0,0
3,0.0,0.0,0.0,0.016113,0.006445,0.003223,0.0,0.022559,0.012891,0.0,...,0.0,0.003223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.016113,0.0,0.0,0.0,0.012891,0.0,0.0,0.003223,0.003223,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003223,0.0,0


In [3]:
x_train_val, x_test, y_train_val, y_test = train_test_split(raw_data.iloc[:,0:501], raw_data['y'],
                                                            test_size = 0.3, stratify = raw_data['y'], random_state = 7)

x_train, x_val, y_train, y_val = train_test_split(x_train_val.iloc[:,0:501],
                                                  y_train_val, test_size = 0.3, stratify = y_train_val, random_state = 667)

In [4]:
#seed = 17
#test_size = 0.3
#x_train, x_test, y_train, y_test = train_test_split(raw_data.iloc[:,0:501], raw_data.iloc[:,501], test_size=test_size, random_state=seed)

In [5]:
#SI Functions
def get_mean(x):
    avg = np.mean(x)
    return(avg)

def get_std(x):
    std = np.std(x)
    return(std)

def get_skewness(x):
    skewness = sum((x - np.mean(x))**3) / ((len(x)-1)*(np.std(x)**3))
    return(skewness)

def get_kurtosis(x):
    kurtosis = sum((x - np.mean(x))**4) / ((len(x)-1)*(np.std(x)**4))
    return (kurtosis)

def get_p2p(x):
    p2p = (np.max(x) - np.min(x))
    return(p2p)

def get_rms(x):
    rms = np.sqrt(sum(x**2) / len(x))
    return(rms)

def get_crestFactor(x):
    crestFactor = get_p2p(x) / get_rms(x)
    return(crestFactor)

def get_shapeFactor(x):
    shapeFactor = get_rms(x) / get_mean(x)
    return(shapeFactor)
    
def get_marginFactor(x):
    marginFactor = np.max(x) / (np.mean((np.sqrt(abs(x))))**2)
    return(marginFactor)

def get_impulseFactor(x):
    impulseFactor = np.max(x) / np.mean(abs(x))
    return(impulseFactor)

In [6]:
def make_df(x):
    mean_value = np.apply_along_axis(get_mean, 1, x)
    std_value = np.apply_along_axis(get_std, 1, x)
    skewness_value = np.apply_along_axis(get_skewness, 1, x)
    kurtosis_value = np.apply_along_axis(get_kurtosis, 1, x)
    p2p_value = np.apply_along_axis(get_p2p, 1, x)
    rms_value = np.apply_along_axis(get_rms, 1, x)
    crestFactor_value = np.apply_along_axis(get_crestFactor, 1, x)
    shapeFactor_value = np.apply_along_axis(get_shapeFactor, 1, x)
    marginFactor_value = np.apply_along_axis(get_marginFactor, 1, x)
    impulseFactor_value = np.apply_along_axis(get_impulseFactor, 1, x)
    
    df = pd.DataFrame({'mean':mean_value, 'std':std_value, 'skewness':skewness_value, 'kurtosis':kurtosis_value, 'p2p':p2p_value, 'rms':rms_value,
                       'crest':crestFactor_value, 'shape':shapeFactor_value, 'margin':marginFactor_value, 'impulse':impulseFactor_value})
    
    return(df)

In [7]:
x_train_val_new = make_df(x_train_val)

x_test_new = make_df(x_test)

In [8]:
x_test_new.shape

(33, 10)

In [9]:
x_train_new_normalized = pd.DataFrame(scale(x_train_val_new, axis = 0))
x_test_new_normalized = pd.DataFrame(scale(x_test_new, axis = 0))

In [10]:
x_train_new_normalized.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.951133,0.353702,-1.130353,-1.094965,-1.153599,0.421081,-1.889638,-0.326899,-1.281074,-1.484476
1,-0.647088,-0.590984,-0.057775,-0.040225,-0.075646,-0.615326,0.39906,-0.281889,0.047245,0.155482
2,1.139159,0.911221,0.048614,-0.168485,0.037823,0.952921,-0.440658,0.265153,-0.308842,-0.292441
3,-1.096262,-0.831377,-0.138833,-0.255571,-0.529521,-0.883618,-0.181223,-0.225799,-0.323395,-0.241846
4,0.930241,1.680981,1.635216,1.498049,1.68312,1.647842,1.315799,1.509899,1.445436,1.489904


In [11]:
model = SVC(C = 1.0, kernel = 'rbf', gamma = 'auto', probability = True, verbose = 2)

In [12]:
model.fit(x_train_new_normalized, y_train_val)

[LibSVM]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=2)

In [13]:
pred = model.predict(x_test_new_normalized)

In [14]:
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))
print('acc:', np.mean(y_test == pred))
print('f1:', f1_score(y_test, pred, pos_label=1))

             precision    recall  f1-score   support

          0       0.89      0.89      0.89        18
          1       0.87      0.87      0.87        15

avg / total       0.88      0.88      0.88        33

[[16  2]
 [ 2 13]]
acc: 0.8787878787878788
f1: 0.8666666666666667


In [15]:
random_grid = [{'kernel': ['rbf'], 'gamma': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
               'C': [0.001,0.1,1,10,25,50,100,1000]}, 
              {'kernel': ['sigmoid'], 'gamma': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
               'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]},
              {'kernel': ['linear'], 'C': [0.001, 0.1, 0.1, 10, 25, 50, 100, 1000]}]

pprint(random_grid)

[{'C': [0.001, 0.1, 1, 10, 25, 50, 100, 1000],
  'gamma': [0.1, 0.01, 0.001, 0.0001, 1e-05],
  'kernel': ['rbf']},
 {'C': [0.001, 0.1, 0.1, 10, 25, 50, 100, 1000],
  'gamma': [0.1, 0.01, 0.001, 0.0001, 1e-05],
  'kernel': ['sigmoid']},
 {'C': [0.001, 0.1, 0.1, 10, 25, 50, 100, 1000], 'kernel': ['linear']}]


In [16]:
svc_random = GridSearchCV(estimator = SVC(), param_grid = random_grid, scoring = 'accuracy', cv = 5, verbose=2, n_jobs = 4,
                          return_train_score = True)

In [17]:
svc_random.fit(x_train_new_normalized, y_train_val)

Fitting 5 folds for each of 88 candidates, totalling 440 fits
[CV] C=0.001, gamma=0.1, kernel=rbf ..................................
[CV] C=0.001, gamma=0.1, kernel=rbf ..................................
[CV] C=0.001, gamma=0.1, kernel=rbf ..................................
[CV] C=0.001, gamma=0.1, kernel=rbf ..................................
[CV] ................... C=0.001, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=0.001, gamma=0.1, kernel=rbf ..................................
[CV] ................... C=0.001, gamma=0.1, kernel=rbf, total=   0.0s
[CV] ................... C=0.001, gamma=0.1, kernel=rbf, total=   0.0s
[CV] ................... C=0.001, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=0.001, gamma=0.01, kernel=rbf .................................
[CV] C=0.001, gamma=0.01, kernel=rbf .................................
[CV] ................... C=0.001, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=0.001, gamma=0.01, kernel=rbf .................................
[CV] ..........

[Parallel(n_jobs=4)]: Done 440 out of 440 | elapsed:    2.3s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid=[{'kernel': ['rbf'], 'gamma': [0.1, 0.01, 0.001, 0.0001, 1e-05], 'C': [0.001, 0.1, 1, 10, 25, 50, 100, 1000]}, {'kernel': ['sigmoid'], 'gamma': [0.1, 0.01, 0.001, 0.0001, 1e-05], 'C': [0.001, 0.1, 0.1, 10, 25, 50, 100, 1000]}, {'kernel': ['linear'], 'C': [0.001, 0.1, 0.1, 10, 25, 50, 100, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=2)

In [18]:
svc_random.best_params_

{'C': 50, 'gamma': 0.01, 'kernel': 'sigmoid'}

In [19]:
svc_random.best_score_

0.7866666666666666

In [20]:
pred = svc_random.best_estimator_.predict(x_test_new_normalized)

In [21]:
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))
print('acc:', np.mean(y_test == pred))
print('precision:', precision_score(y_test, pred, pos_label=1))
print('recall', recall_score(y_test, pred, pos_label=1))
print('f1:', f1_score(y_test, pred, pos_label=1))

             precision    recall  f1-score   support

          0       0.88      0.78      0.82        18
          1       0.76      0.87      0.81        15

avg / total       0.82      0.82      0.82        33

[[14  4]
 [ 2 13]]
acc: 0.8181818181818182
precision: 0.7647058823529411
recall 0.8666666666666667
f1: 0.8125


In [30]:
#cross validation
X = raw_data.iloc[:,:-1]
Y = raw_data.iloc[:,501]

In [31]:
X = make_df(X)

In [32]:
X = pd.DataFrame(scale(X, axis = 0))
Y = np.array(Y)

In [41]:
best_model = svc_random.best_estimator_
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=500)

cv_acc = []
cv_precision = []
cv_recall = []
cv_f1 = []

In [42]:
for train, test in kfold.split(X, Y):
    # evaluate the model
    y_pred = best_model.predict(X.iloc[test,:])
    
    accuracy = np.mean(y_pred == Y[test])
    precision = precision_score(Y[test], y_pred, pos_label=1)
    recall = recall_score(Y[test], y_pred, pos_label=1)
    f_score = f1_score(Y[test], y_pred, pos_label=1)
    
    cv_acc.append(accuracy)
    cv_precision.append(precision)
    cv_recall.append(recall)
    cv_f1.append(f_score)
    
print('accuracy:', np.mean(cv_acc))
print('precision:', np.mean(cv_precision))
print('recall:', np.mean(cv_recall))
print('f1:', np.mean(cv_f1))

accuracy: 0.7978354978354978
precision: 0.7757575757575758
recall: 0.8200000000000001
f1: 0.7924963924963926
