In [8]:
import pandas as pd              
import numpy as np  
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_validate, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.preprocessing import normalize, scale
from pprint import pprint

In [22]:
raw_data = pd.read_csv('final_data.csv')
raw_data.head()

Unnamed: 0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,...,t493,t494,t495,t496,t497,t498,t499,t500,t501,y
0,0.019336,0.0,0.0,0.0,0.003223,0.0,0.0,0.0,0.0,0.0,...,0.029004,0.009668,0.012891,0.0,0.0,0.0,0.003223,0.003223,0.0,0
1,0.0,0.0,0.012891,0.0,0.016113,0.0,0.006445,0.0,0.003223,0.022559,...,0.0,0.0,0.0,0.009668,0.0,0.0,0.0,0.009668,0.0,0
2,0.0,0.009668,0.0,0.0,0.006445,0.012891,0.0,0.0,0.029004,0.025781,...,0.006445,0.003223,0.012891,0.0,0.0,0.0,0.0,0.003223,0.0,0
3,0.0,0.0,0.0,0.016113,0.006445,0.003223,0.0,0.022559,0.012891,0.0,...,0.0,0.003223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.016113,0.0,0.0,0.0,0.012891,0.0,0.0,0.003223,0.003223,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003223,0.0,0


In [23]:
#seed = 567
#test_size = 0.2
#x_train, x_test, y_train, y_test = train_test_split(raw_data.iloc[:,0:501], raw_data.iloc[:,501], test_size=test_size, random_state=seed)

In [24]:
'''
for i in range(0, x_train.shape[0]):
    x_train.iloc[i,:] = (x_train.iloc[i,:] - min(x_train.iloc[i,:])) / (max(x_train.iloc[i,:]) - min(x_train.iloc[i,:]))
    
for i in range(0, x_test.shape[0]):
    x_test.iloc[i,:] = (x_test.iloc[i,:] - min(x_test.iloc[i,:])) / (max(x_test.iloc[i,:]) - min(x_test.iloc[i,:]))
'''

'\nfor i in range(0, x_train.shape[0]):\n    x_train.iloc[i,:] = (x_train.iloc[i,:] - min(x_train.iloc[i,:])) / (max(x_train.iloc[i,:]) - min(x_train.iloc[i,:]))\n    \nfor i in range(0, x_test.shape[0]):\n    x_test.iloc[i,:] = (x_test.iloc[i,:] - min(x_test.iloc[i,:])) / (max(x_test.iloc[i,:]) - min(x_test.iloc[i,:]))\n'

In [25]:
#SI Functions
def get_mean(x):
    avg = np.mean(x)
    return(avg)

def get_std(x):
    std = np.std(x)
    return(std)

def get_skewness(x):
    skewness = sum((x - np.mean(x))**3) / ((len(x)-1)*(np.std(x)**3))
    return(skewness)

def get_kurtosis(x):
    kurtosis = sum((x - np.mean(x))**4) / ((len(x)-1)*(np.std(x)**4))
    return (kurtosis)

def get_p2p(x):
    p2p = (np.max(x) - np.min(x))
    return(p2p)

def get_rms(x):
    rms = np.sqrt(sum(x**2) / len(x))
    return(rms)

def get_crestFactor(x):
    crestFactor = get_p2p(x) / get_rms(x)
    return(crestFactor)

def get_shapeFactor(x):
    shapeFactor = get_rms(x) / get_mean(x)
    return(shapeFactor)
    
def get_marginFactor(x):
    marginFactor = np.max(x) / (np.mean((np.sqrt(abs(x))))**2)
    return(marginFactor)

def get_impulseFactor(x):
    impulseFactor = np.max(x) / np.mean(abs(x))
    return(impulseFactor)

In [26]:
def make_df(x):
    mean_value = np.apply_along_axis(get_mean, 1, x)
    std_value = np.apply_along_axis(get_std, 1, x)
    skewness_value = np.apply_along_axis(get_skewness, 1, x)
    kurtosis_value = np.apply_along_axis(get_kurtosis, 1, x)
    p2p_value = np.apply_along_axis(get_p2p, 1, x)
    rms_value = np.apply_along_axis(get_rms, 1, x)
    crestFactor_value = np.apply_along_axis(get_crestFactor, 1, x)
    shapeFactor_value = np.apply_along_axis(get_shapeFactor, 1, x)
    marginFactor_value = np.apply_along_axis(get_marginFactor, 1, x)
    impulseFactor_value = np.apply_along_axis(get_impulseFactor, 1, x)
    
    df = pd.DataFrame({'mean':mean_value, 'std':std_value, 'skewness':skewness_value, 'kurtosis':kurtosis_value, 'p2p':p2p_value, 'rms':rms_value,
                       'crest':crestFactor_value, 'shape':shapeFactor_value, 'margin':marginFactor_value, 'impulse':impulseFactor_value})
    
    return(df)

In [27]:
raw_data_new = make_df(raw_data)

raw_data_new_normalized = pd.DataFrame(scale(raw_data_new.iloc[:,:-1], axis = 0))
raw_data_new_normalized['y'] = raw_data['y']

raw_data_new_normalized.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,y
0,0.153121,-0.752439,-0.890397,-0.934294,-0.863241,-0.742778,-0.888503,-0.851117,-0.904444,0
1,0.49434,-0.555918,-0.738381,-0.844713,-0.730736,-0.548446,-0.695689,-0.684841,-0.750537,0
2,0.3564,-0.737027,-0.980061,-0.983588,-0.886624,-0.723624,-0.995024,-0.86404,-0.918422,0
3,-0.238917,-0.886445,-1.018223,-1.000933,-0.980157,-0.877637,-1.158038,-0.947433,-0.974115,0
4,-0.151797,-0.87286,-1.070723,-1.028381,-0.980157,-0.862873,-1.175722,-0.94418,-0.983993,0


In [28]:
seed = 7
test_size = 0.3
x_train, x_test, y_train, y_test = train_test_split(raw_data_new_normalized.iloc[:,0:10], raw_data_new_normalized['y'], test_size=test_size, random_state=seed)

In [29]:
model = rf()
model.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [30]:
y_pred = model.predict(x_test)

In [31]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        19
          1       1.00      1.00      1.00        14

avg / total       1.00      1.00      1.00        33



In [32]:
n_estimators = [25,50,150,250,350]

# Number of features to consider at every split
max_features = ['auto','sqrt']

# Maximum number of levels in tree
max_depth = [3,5,7,9,11,13]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2,5,10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True,False]

criterion = ['gini','entropy']

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               "criterion": criterion}

pprint(random_grid)

{'bootstrap': [True, False],
 'criterion': ['gini', 'entropy'],
 'max_depth': [3, 5, 7, 9, 11, 13, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [25, 50, 150, 250, 350]}


In [33]:
#rf_random = GridSearchCV(estimator = model, param_grid = random_grid, scoring = 'accuracy',
#                         cv = 5, verbose=2, n_jobs = 4)

rf_random = RandomizedSearchCV(estimator = rf(), param_distributions = random_grid, n_iter=500, scoring = 'f1', cv = 5, verbose=2, n_jobs = 4,
                               random_state = None, return_train_score = True)

In [None]:
rf_random.fit(x_train, y_train)

In [18]:
print(rf_random.best_params_)
print(rf_random.best_score_)

{'n_estimators': 25, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 9, 'criterion': 'entropy', 'bootstrap': True}
1.0


In [19]:
model_tuned = rf_random.best_estimator_

In [20]:
rf_random_pred = model_tuned.predict(x_test)

In [21]:
print(classification_report(y_test, rf_random_pred))
print(confusion_matrix(y_test, rf_random_pred))
print('acc:', np.mean(y_test == rf_random_pred))
print('f1:', f1_score(y_test, rf_random_pred, pos_label=1))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        22
          1       1.00      1.00      1.00        11

avg / total       1.00      1.00      1.00        33

[[22  0]
 [ 0 11]]
acc: 1.0
f1: 1.0


In [18]:
print(classification_report(y_test, rf_random_pred))
print(confusion_matrix(y_test, rf_random_pred))
print('acc:', np.mean(y_test == rf_random_pred))
print('f1:', f1_score(y_test, rf_random_pred, pos_label=1))

             precision    recall  f1-score   support

          0       0.89      0.53      0.67        15
          1       0.46      0.86      0.60         7

avg / total       0.75      0.64      0.65        22

[[8 7]
 [1 6]]
acc: 0.6363636363636364
f1: 0.6


In [19]:
rf_random_pred

array([1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1])

In [None]:
'''

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=100)

scores = cross_validate(model_tuned, raw_data.iloc[:,:-1], raw_data.iloc[:,501], cv=cv, scoring=['accuracy','precision','recall','f1'],return_train_score=False)

cv_acc = scores['test_accuracy']
cv_precision = scores['test_precision']
cv_recall = scores['test_recall']
cv_f1 = scores['test_f1']

i=20
print('acc:', np.mean(cv_acc[i:i+5]))
print('precision:', np.mean(cv_precision[i:i+5]))
print('recall:', np.mean(cv_recall[i:i+5]))
print('f1:', np.mean(cv_f1[i:i+5]))

# cross validation

X = raw_data.iloc[:,:-1]
Y = raw_data.iloc[:,501]

X = make_df(X)

#X = np.array(X)
#Y = np.array(Y)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state = 500)

cv_acc = []
cv_precision = []
cv_recall = []
cv_f1 = []

model_tuned = rf_random.best_estimator_

for train, test in kfold.split(X, Y):

    rf_random_pred = model_tuned.predict(X.iloc[test,:])

    # evaluate the model
    y_pred = model_tuned.predict(X.iloc[test,:])
    
    accuracy = np.mean(y_pred == Y[test])
    precision = precision_score(Y[test], y_pred, pos_label=1)
    recall = recall_score(Y[test], y_pred, pos_label=1)
    f_score = f1_score(Y[test], y_pred, pos_label=1)
    
    cv_acc.append(accuracy)
    cv_precision.append(precision)
    cv_recall.append(recall)
    cv_f1.append(f_score)
    
print('accuracy:', np.mean(cv_acc))
print('precision:', np.mean(cv_precision))
print('recall:', np.mean(cv_recall))
print('f1:', np.mean(cv_f1))

'''