In [21]:
import pandas as pd              
import numpy as np  
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_validate, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score
from pprint import pprint

In [22]:
raw_data = pd.read_csv('final_data.csv')
raw_data.head()

Unnamed: 0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,...,t493,t494,t495,t496,t497,t498,t499,t500,t501,y
0,0.019336,0.0,0.0,0.0,0.003223,0.0,0.0,0.0,0.0,0.0,...,0.029004,0.009668,0.012891,0.0,0.0,0.0,0.003223,0.003223,0.0,0
1,0.0,0.0,0.012891,0.0,0.016113,0.0,0.006445,0.0,0.003223,0.022559,...,0.0,0.0,0.0,0.009668,0.0,0.0,0.0,0.009668,0.0,0
2,0.0,0.009668,0.0,0.0,0.006445,0.012891,0.0,0.0,0.029004,0.025781,...,0.006445,0.003223,0.012891,0.0,0.0,0.0,0.0,0.003223,0.0,0
3,0.0,0.0,0.0,0.016113,0.006445,0.003223,0.0,0.022559,0.012891,0.0,...,0.0,0.003223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.016113,0.0,0.0,0.0,0.012891,0.0,0.0,0.003223,0.003223,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003223,0.0,0


In [23]:
x_train_val, x_test, y_train_val, y_test = train_test_split(raw_data.iloc[:,0:501], raw_data['y'],
                                                            test_size = 0.3, stratify = raw_data['y'], random_state = 7)

x_train, x_val, y_train, y_val = train_test_split(x_train_val.iloc[:,0:501],
                                                  y_train_val, test_size = 0.3, stratify = y_train_val, random_state = 667)

In [4]:
#seed = 17
#test_size = 0.3
#x_train, x_test, y_train, y_test = train_test_split(raw_data.iloc[:,0:501], raw_data.iloc[:,501], test_size=test_size, random_state=seed)

In [5]:
'''
for i in range(0, x_train.shape[0]):
    x_train.iloc[i,:] = (x_train.iloc[i,:] - min(x_train.iloc[i,:])) / (max(x_train.iloc[i,:]) - min(x_train.iloc[i,:]))
    
for i in range(0, x_test.shape[0]):
    x_test.iloc[i,:] = (x_test.iloc[i,:] - min(x_test.iloc[i,:])) / (max(x_test.iloc[i,:]) - min(x_test.iloc[i,:]))
'''

'\nfor i in range(0, x_train.shape[0]):\n    x_train.iloc[i,:] = (x_train.iloc[i,:] - min(x_train.iloc[i,:])) / (max(x_train.iloc[i,:]) - min(x_train.iloc[i,:]))\n    \nfor i in range(0, x_test.shape[0]):\n    x_test.iloc[i,:] = (x_test.iloc[i,:] - min(x_test.iloc[i,:])) / (max(x_test.iloc[i,:]) - min(x_test.iloc[i,:]))\n'

In [24]:
#SI Functions
def get_mean(x):
    avg = np.mean(x)
    return(avg)

def get_std(x):
    std = np.std(x)
    return(std)

def get_skewness(x):
    skewness = sum((x - np.mean(x))**3) / ((len(x)-1)*(np.std(x)**3))
    return(skewness)

def get_kurtosis(x):
    kurtosis = sum((x - np.mean(x))**4) / ((len(x)-1)*(np.std(x)**4))
    return (kurtosis)

def get_p2p(x):
    p2p = (np.max(x) - np.min(x))
    return(p2p)

def get_rms(x):
    rms = np.sqrt(sum(x**2) / len(x))
    return(rms)

def get_crestFactor(x):
    crestFactor = get_p2p(x) / get_rms(x)
    return(crestFactor)

def get_shapeFactor(x):
    shapeFactor = get_rms(x) / get_mean(x)
    return(shapeFactor)
    
def get_marginFactor(x):
    marginFactor = np.max(x) / (np.mean((np.sqrt(abs(x))))**2)
    return(marginFactor)

def get_impulseFactor(x):
    impulseFactor = np.max(x) / np.mean(abs(x))
    return(impulseFactor)

In [25]:
def make_df(x):
    mean_value = np.apply_along_axis(get_mean, 1, x)
    std_value = np.apply_along_axis(get_std, 1, x)
    skewness_value = np.apply_along_axis(get_skewness, 1, x)
    kurtosis_value = np.apply_along_axis(get_kurtosis, 1, x)
    p2p_value = np.apply_along_axis(get_p2p, 1, x)
    rms_value = np.apply_along_axis(get_rms, 1, x)
    crestFactor_value = np.apply_along_axis(get_crestFactor, 1, x)
    shapeFactor_value = np.apply_along_axis(get_shapeFactor, 1, x)
    marginFactor_value = np.apply_along_axis(get_marginFactor, 1, x)
    impulseFactor_value = np.apply_along_axis(get_impulseFactor, 1, x)
    
    df = pd.DataFrame({'mean':mean_value, 'std':std_value, 'skewness':skewness_value, 'kurtosis':kurtosis_value, 'p2p':p2p_value, 'rms':rms_value,
                       'crest':crestFactor_value, 'shape':shapeFactor_value, 'margin':marginFactor_value, 'impulse':impulseFactor_value})
    
    return(df)

In [26]:
x_train_new = make_df(x_train_val)
x_test_new = make_df(x_test)

In [27]:
x_train_new.head()

Unnamed: 0,mean,std,skewness,kurtosis,p2p,rms,crest,shape,margin,impulse
0,0.007468,0.016947,3.939118,21.23369,0.122461,0.018519,6.612552,2.479819,52.497783,16.397933
1,0.006484,0.01478,5.486455,50.156291,0.183691,0.01614,11.381387,2.489183,86.497434,28.330357
2,0.007584,0.018226,5.639935,46.639213,0.190137,0.019741,9.631712,2.60299,77.383032,25.071247
3,0.006207,0.014229,5.369518,44.251181,0.15791,0.015524,10.172284,2.500852,77.010524,25.439378
4,0.007455,0.019992,7.92882,92.338183,0.283594,0.021336,13.291545,2.861946,122.285505,38.039689


In [28]:
model = rf()
model.fit(x_train_new, y_train_val)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [29]:
y_pred = model.predict(x_test_new)

In [30]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.84      0.89      0.86        18
          1       0.86      0.80      0.83        15

avg / total       0.85      0.85      0.85        33



In [31]:
n_estimators = [25,50,150,250,350]

# Number of features to consider at every split
max_features = ['auto','sqrt']

# Maximum number of levels in tree
max_depth = [3,5,7,9,11,13]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2,5,10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True,False]

criterion = ['gini','entropy']

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               "criterion": criterion}

pprint(random_grid)

{'bootstrap': [True, False],
 'criterion': ['gini', 'entropy'],
 'max_depth': [3, 5, 7, 9, 11, 13, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [25, 50, 150, 250, 350]}


In [32]:
#rf_random = GridSearchCV(estimator = model, param_grid = random_grid, scoring = 'accuracy',
#                         cv = 5, verbose=2, n_jobs = 4)

rf_random = RandomizedSearchCV(estimator = rf(), param_distributions = random_grid, n_iter=1000, scoring = 'f1', cv = 5, verbose=2, n_jobs = 4,
                               random_state = None, return_train_score = True)

In [33]:
rf_random.fit(x_train_new, y_train_val)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
[CV] n_estimators=50, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=7, criterion=gini, bootstrap=True 
[CV] n_estimators=50, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=7, criterion=gini, bootstrap=True 
[CV] n_estimators=50, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=7, criterion=gini, bootstrap=True 
[CV] n_estimators=50, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=7, criterion=gini, bootstrap=True 
[CV]  n_estimators=50, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=7, criterion=gini, bootstrap=True, total=   0.1s
[CV] n_estimators=50, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=7, criterion=gini, bootstrap=True 
[CV]  n_estimators=50, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=7, criterion=gini, bootstrap=True, total=   0.1s
[CV]  n_e

[Parallel(n_jobs=4)]: Done  47 tasks      | elapsed:    7.0s


[CV]  n_estimators=150, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=3, criterion=gini, bootstrap=False, total=   0.3s
[CV] n_estimators=150, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=3, criterion=gini, bootstrap=False 
[CV]  n_estimators=150, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=3, criterion=gini, bootstrap=False, total=   0.3s
[CV] n_estimators=150, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=3, criterion=gini, bootstrap=False 
[CV]  n_estimators=250, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=9, criterion=entropy, bootstrap=False, total=   0.5s
[CV] n_estimators=350, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=None, criterion=gini, bootstrap=True 
[CV]  n_estimators=150, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=3, criterion=gini, bootstrap=False, total=   0.3s
[CV] n_estimators=350, min_samp

[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   17.8s


[CV]  n_estimators=50, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=9, criterion=gini, bootstrap=True, total=   0.1s
[CV] n_estimators=350, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=None, criterion=gini, bootstrap=False 
[CV]  n_estimators=350, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=9, criterion=entropy, bootstrap=False, total=   0.7s
[CV] n_estimators=50, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=9, criterion=gini, bootstrap=True 
[CV]  n_estimators=350, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=9, criterion=entropy, bootstrap=False, total=   0.7s
[CV] n_estimators=350, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=None, criterion=gini, bootstrap=False 
[CV]  n_estimators=50, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=9, criterion=gini, bootstrap=True, total=   0.1s
[CV] n_estimators=250, min

[Parallel(n_jobs=4)]: Done 439 tasks      | elapsed:   40.4s


[CV]  n_estimators=150, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=None, criterion=entropy, bootstrap=False, total=   0.3s
[CV] n_estimators=150, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=None, criterion=entropy, bootstrap=False 
[CV]  n_estimators=150, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=None, criterion=entropy, bootstrap=False, total=   0.3s
[CV] n_estimators=150, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=None, criterion=entropy, bootstrap=False 
[CV]  n_estimators=150, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=None, criterion=entropy, bootstrap=False, total=   0.3s
[CV] n_estimators=150, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=9, criterion=entropy, bootstrap=True 
[CV]  n_estimators=250, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=11, criterion=gini, bootstrap=False, total=   0.5s
[CV

[Parallel(n_jobs=4)]: Done 730 tasks      | elapsed:  1.1min


[CV]  n_estimators=250, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=7, criterion=gini, bootstrap=True, total=   0.5s
[CV] n_estimators=150, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=11, criterion=entropy, bootstrap=False 
[CV]  n_estimators=150, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=11, criterion=entropy, bootstrap=False, total=   0.3s
[CV] n_estimators=150, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=11, criterion=entropy, bootstrap=False 
[CV]  n_estimators=250, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=7, criterion=gini, bootstrap=True, total=   0.5s
[CV] n_estimators=150, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=11, criterion=entropy, bootstrap=False 
[CV]  n_estimators=150, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=11, criterion=entropy, bootstrap=False, total=   0.3s
[CV] n_estimators=5

[Parallel(n_jobs=4)]: Done 1289 tasks      | elapsed:  1.8min


[CV]  n_estimators=150, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=9, criterion=entropy, bootstrap=False, total=   0.3s
[CV] n_estimators=150, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=9, criterion=entropy, bootstrap=False 
[CV]  n_estimators=150, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=9, criterion=entropy, bootstrap=False, total=   0.3s
[CV] n_estimators=50, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=11, criterion=gini, bootstrap=True 
[CV]  n_estimators=150, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=9, criterion=entropy, bootstrap=False, total=   0.3s
[CV] n_estimators=50, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=11, criterion=gini, bootstrap=True 
[CV]  n_estimators=150, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=9, criterion=entropy, bootstrap=False, total=   0.3s
[CV] n_estimators=50, mi

[Parallel(n_jobs=4)]: Done 1734 tasks      | elapsed:  2.5min


[CV]  n_estimators=150, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=9, criterion=gini, bootstrap=False, total=   0.3s
[CV] n_estimators=350, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=13, criterion=entropy, bootstrap=True 
[CV]  n_estimators=350, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=13, criterion=entropy, bootstrap=True, total=   0.7s
[CV] n_estimators=350, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=13, criterion=entropy, bootstrap=True 
[CV]  n_estimators=350, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=13, criterion=entropy, bootstrap=True, total=   0.7s
[CV] n_estimators=25, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=None, criterion=gini, bootstrap=False 
[CV]  n_estimators=350, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=13, criterion=entropy, bootstrap=True, total=   0.7s
[CV] n_estimato

[Parallel(n_jobs=4)]: Done 2304 tasks      | elapsed:  3.4min


[CV]  n_estimators=350, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=3, criterion=entropy, bootstrap=True, total=   0.7s
[CV]  n_estimators=350, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=3, criterion=entropy, bootstrap=True, total=   0.7s
[CV] n_estimators=350, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=3, criterion=entropy, bootstrap=True 
[CV] n_estimators=50, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=3, criterion=entropy, bootstrap=True 
[CV]  n_estimators=350, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=3, criterion=entropy, bootstrap=True, total=   0.7s
[CV] n_estimators=50, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=3, criterion=entropy, bootstrap=True 
[CV]  n_estimators=350, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=3, criterion=entropy, bootstrap=True, total=   0.7s
[CV] n_estimators=50

[Parallel(n_jobs=4)]: Done 2981 tasks      | elapsed:  4.5min


[CV]  n_estimators=50, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=7, criterion=gini, bootstrap=False, total=   0.1s
[CV] n_estimators=250, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=3, criterion=gini, bootstrap=False 
[CV]  n_estimators=50, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=7, criterion=gini, bootstrap=False, total=   0.1s
[CV] n_estimators=250, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=3, criterion=gini, bootstrap=False 
[CV]  n_estimators=50, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=7, criterion=gini, bootstrap=False, total=   0.1s
[CV] n_estimators=250, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=3, criterion=gini, bootstrap=False 
[CV]  n_estimators=350, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=11, criterion=gini, bootstrap=True, total=   0.7s
[CV] n_estimators=250, min_samples_spli

[Parallel(n_jobs=4)]: Done 3766 tasks      | elapsed:  5.7min


[CV]  n_estimators=150, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=9, criterion=gini, bootstrap=False, total=   0.3s
[CV] n_estimators=350, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=11, criterion=gini, bootstrap=False 
[CV]  n_estimators=350, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=11, criterion=gini, bootstrap=False, total=   0.7s
[CV] n_estimators=350, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=11, criterion=gini, bootstrap=False 
[CV]  n_estimators=350, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=11, criterion=gini, bootstrap=False, total=   0.7s
[CV] n_estimators=350, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=11, criterion=gini, bootstrap=True 
[CV]  n_estimators=350, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=11, criterion=gini, bootstrap=False, total=   0.7s
[CV] n_estimators=350, min_sampl

[Parallel(n_jobs=4)]: Done 4698 tasks      | elapsed:  7.0min


[CV]  n_estimators=250, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=9, criterion=entropy, bootstrap=False, total=   0.5s
[CV] n_estimators=250, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=13, criterion=gini, bootstrap=True 
[CV]  n_estimators=250, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=13, criterion=gini, bootstrap=True, total=   0.5s
[CV] n_estimators=250, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=13, criterion=gini, bootstrap=True 
[CV]  n_estimators=250, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=13, criterion=gini, bootstrap=True, total=   0.5s
[CV] n_estimators=150, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=9, criterion=gini, bootstrap=False 
[CV]  n_estimators=250, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=13, criterion=gini, bootstrap=True, total=   0.5s
[CV] n_estimators=150, min_sa

[Parallel(n_jobs=4)]: Done 5000 out of 5000 | elapsed:  7.4min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=1000, n_jobs=4,
          param_distributions={'n_estimators': [25, 50, 150, 250, 350], 'max_features': ['auto', 'sqrt'], 'max_depth': [3, 5, 7, 9, 11, 13, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring='f1', verbose=2)

In [34]:
print(rf_random.best_params_)
print(rf_random.best_score_)

{'n_estimators': 350, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 9, 'criterion': 'gini', 'bootstrap': True}
0.8111904761904761


In [35]:
model_tuned = rf_random.best_estimator_

In [36]:
rf_random_pred = model_tuned.predict(x_test_new)

In [39]:
print(classification_report(y_test, rf_random_pred))
print(confusion_matrix(y_test, rf_random_pred))
print('acc:', np.mean(y_test == rf_random_pred))
print(precision_score(y_test, rf_random_pred, pos_label=1))
print(recall_score(y_test, rf_random_pred, pos_label=1))
print('f1:', f1_score(y_test, rf_random_pred, pos_label=1))

             precision    recall  f1-score   support

          0       0.83      0.83      0.83        18
          1       0.80      0.80      0.80        15

avg / total       0.82      0.82      0.82        33

[[15  3]
 [ 3 12]]
acc: 0.8181818181818182
0.8
0.8
f1: 0.8000000000000002


In [20]:
rf_random_pred

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0])

In [359]:
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=100)

In [360]:
scores = cross_validate(model_tuned, raw_data.iloc[:,:-1], raw_data.iloc[:,501], cv=cv, scoring=['accuracy','precision','recall','f1'],return_train_score=False)

In [361]:
cv_acc = scores['test_accuracy']
cv_precision = scores['test_precision']
cv_recall = scores['test_recall']
cv_f1 = scores['test_f1']

In [367]:
i=20
print('acc:', np.mean(cv_acc[i:i+5]))
print('precision:', np.mean(cv_precision[i:i+5]))
print('recall:', np.mean(cv_recall[i:i+5]))
print('f1:', np.mean(cv_f1[i:i+5]))

acc: 0.9541125541125541
precision: 0.9484848484848485
recall: 0.96
f1: 0.9500721500721501


In [None]:
# cross validation

In [57]:
X = raw_data.iloc[:,:-1]
Y = raw_data.iloc[:,501]

In [58]:
X = make_df(X)

#X = np.array(X)
#Y = np.array(Y)

In [59]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state = 500)

cv_acc = []
cv_precision = []
cv_recall = []
cv_f1 = []

model_tuned = rf_random.best_estimator_

In [60]:
for train, test in kfold.split(X, Y):

    rf_random_pred = model_tuned.predict(X.iloc[test,:])

    # evaluate the model
    y_pred = model_tuned.predict(X.iloc[test,:])
    
    accuracy = np.mean(y_pred == Y[test])
    precision = precision_score(Y[test], y_pred, pos_label=1)
    recall = recall_score(Y[test], y_pred, pos_label=1)
    f_score = f1_score(Y[test], y_pred, pos_label=1)
    
    cv_acc.append(accuracy)
    cv_precision.append(precision)
    cv_recall.append(recall)
    cv_f1.append(f_score)
    
print('accuracy:', np.mean(cv_acc))
print('precision:', np.mean(cv_precision))
print('recall:', np.mean(cv_recall))
print('f1:', np.mean(cv_f1))

accuracy: 0.9077922077922078
precision: 0.9277777777777778
recall: 0.8800000000000001
f1: 0.8993088782562468


In [40]:
X

Unnamed: 0,crest,impulse,kurtosis,margin,mean,p2p,rms,shape,skewness,std
0,9.822933,25.110752,39.841453,75.281066,0.007957,0.199805,0.020341,2.556340,5.224752,0.018720
1,10.923092,30.824766,55.507405,104.700793,0.008259,0.254590,0.023307,2.821982,6.278329,0.021795
2,9.215149,23.366798,31.217990,72.609212,0.008137,0.190137,0.020633,2.535694,4.603070,0.018961
3,8.285024,19.904480,28.185597,61.963525,0.007610,0.151465,0.018282,2.402465,4.338647,0.016623
4,8.184125,19.704602,23.383295,60.075181,0.007687,0.151465,0.018507,2.407661,3.974591,0.016835
5,10.476372,25.357362,40.956008,74.000344,0.008388,0.212695,0.020302,2.420433,5.024565,0.018489
6,11.035473,26.032353,50.523167,76.035001,0.006561,0.170801,0.015477,2.358970,5.428828,0.014018
7,11.409827,27.621673,49.203569,86.710314,0.006767,0.186914,0.016382,2.420867,5.234736,0.014919
8,6.155239,14.342547,17.584541,44.698468,0.006516,0.093457,0.015183,2.330136,3.499562,0.013714
9,11.374706,27.979979,51.022860,87.131118,0.006104,0.170801,0.015016,2.459842,5.516595,0.013719
