# Evaluate *tsfresh* selected features with Random Forest classifier

### Cross validation

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier

def create_rf(n_trees):
    return RandomForestClassifier(n_estimators=n_trees)

In [2]:
ids = pd.read_csv('csv/ids.csv')
ids.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   number  55 non-null     object
 1   ill     55 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1008.0+ bytes


In [3]:
names = ['all', 'night', 'day']
selected = {}
hourly_selected = {}

for n in names:
    selected[n] = pd.read_csv(f'csv/selected_{n}.csv')
    hourly_selected[n] = pd.read_csv(f'csv/selected_hourly_{n}.csv')

In [4]:
from sklearn.model_selection import cross_val_score

FOLDS = [5, 10]
TREES = [100, 500, 1000]
y = ids['ill']

def cross_validate(name: str, data: pd.DataFrame):
    for t in TREES:
        for f in FOLDS:
            rf = create_rf(t)
            print(f'[{name}] {f}-fold cross validation with Random Forest Classifier ({t} trees):')
            scores = cross_val_score(rf, data, y, cv=f)
            print(f'Results:\n\tMin: {min(scores)}\n\tMax: {max(scores)}\n\tMean: {np.mean(scores)}\n\tStd: {np.std(scores)}')

In [5]:
for name, data in selected.items():
    cross_validate(name, data)

[all] 5-fold cross validation with Random Forest Classifier (100 trees):
Results:
	Min: 0.7272727272727273
	Max: 0.9090909090909091
	Mean: 0.8545454545454545
	Std: 0.0727272727272727
[all] 10-fold cross validation with Random Forest Classifier (100 trees):
Results:
	Min: 0.4
	Max: 1.0
	Mean: 0.8333333333333334
	Std: 0.18135294011647257
[all] 5-fold cross validation with Random Forest Classifier (500 trees):
Results:
	Min: 0.7272727272727273
	Max: 0.9090909090909091
	Mean: 0.8363636363636363
	Std: 0.06803013430498073
[all] 10-fold cross validation with Random Forest Classifier (500 trees):
Results:
	Min: 0.6
	Max: 1.0
	Mean: 0.8533333333333333
	Std: 0.13840359661351131
[all] 5-fold cross validation with Random Forest Classifier (1000 trees):
Results:
	Min: 0.7272727272727273
	Max: 0.9090909090909091
	Mean: 0.8363636363636363
	Std: 0.06803013430498073
[all] 10-fold cross validation with Random Forest Classifier (1000 trees):
Results:
	Min: 0.6
	Max: 1.0
	Mean: 0.8533333333333333
	Std: 0.

In [6]:
for name, data in hourly_selected.items():
    cross_validate(name, data)

[all] 5-fold cross validation with Random Forest Classifier (100 trees):
Results:
	Min: 0.6363636363636364
	Max: 0.9090909090909091
	Mean: 0.7454545454545455
	Std: 0.10601730717900545
[all] 10-fold cross validation with Random Forest Classifier (100 trees):
Results:
	Min: 0.4
	Max: 1.0
	Mean: 0.7433333333333333
	Std: 0.18917951498216948
[all] 5-fold cross validation with Random Forest Classifier (500 trees):
Results:
	Min: 0.6363636363636364
	Max: 0.9090909090909091
	Mean: 0.7636363636363637
	Std: 0.09270944570168699
[all] 10-fold cross validation with Random Forest Classifier (500 trees):
Results:
	Min: 0.5
	Max: 1.0
	Mean: 0.8200000000000001
	Std: 0.15790292376436016
[all] 5-fold cross validation with Random Forest Classifier (1000 trees):
Results:
	Min: 0.6363636363636364
	Max: 0.9090909090909091
	Mean: 0.7636363636363637
	Std: 0.09270944570168699
[all] 10-fold cross validation with Random Forest Classifier (1000 trees):
Results:
	Min: 0.4
	Max: 1.0
	Mean: 0.78
	Std: 0.1814754345175

### Test/train split validation

In [7]:
from sklearn.model_selection import train_test_split

TEST_RATIO = .2

def validate(name: str, data: pd.DataFrame):
    for t in TREES:
        X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=TEST_RATIO)
        rf = create_rf(t)
        rf.fit(X_train, y_train)
        score = rf.score(X_test, y_test)
        print(f'[{name}] validation with Random Forest Classifier ({t} trees):')
        print(f'\tTrain size: {X_train.shape}\n\tTest size: {X_test.shape}')
        print(f'\tAccuracy score: {score}')

In [8]:
for name, data in selected.items():
    validate(name, data)

[all] validation with Random Forest Classifier (100 trees):
	Train size: (44, 14)
	Test size: (11, 14)
	Accuracy score: 0.6363636363636364
[all] validation with Random Forest Classifier (500 trees):
	Train size: (44, 14)
	Test size: (11, 14)
	Accuracy score: 0.9090909090909091
[all] validation with Random Forest Classifier (1000 trees):
	Train size: (44, 14)
	Test size: (11, 14)
	Accuracy score: 0.8181818181818182
[night] validation with Random Forest Classifier (100 trees):
	Train size: (44, 47)
	Test size: (11, 47)
	Accuracy score: 0.8181818181818182
[night] validation with Random Forest Classifier (500 trees):
	Train size: (44, 47)
	Test size: (11, 47)
	Accuracy score: 0.8181818181818182
[night] validation with Random Forest Classifier (1000 trees):
	Train size: (44, 47)
	Test size: (11, 47)
	Accuracy score: 0.6363636363636364
[day] validation with Random Forest Classifier (100 trees):
	Train size: (44, 2)
	Test size: (11, 2)
	Accuracy score: 0.7272727272727273
[day] validation with

In [9]:
for name, data in hourly_selected.items():
    validate(name, data)

[all] validation with Random Forest Classifier (100 trees):
	Train size: (44, 24)
	Test size: (11, 24)
	Accuracy score: 0.9090909090909091
[all] validation with Random Forest Classifier (500 trees):
	Train size: (44, 24)
	Test size: (11, 24)
	Accuracy score: 0.6363636363636364
[all] validation with Random Forest Classifier (1000 trees):
	Train size: (44, 24)
	Test size: (11, 24)
	Accuracy score: 0.9090909090909091
[night] validation with Random Forest Classifier (100 trees):
	Train size: (44, 61)
	Test size: (11, 61)
	Accuracy score: 0.8181818181818182
[night] validation with Random Forest Classifier (500 trees):
	Train size: (44, 61)
	Test size: (11, 61)
	Accuracy score: 0.7272727272727273
[night] validation with Random Forest Classifier (1000 trees):
	Train size: (44, 61)
	Test size: (11, 61)
	Accuracy score: 1.0
[day] validation with Random Forest Classifier (100 trees):
	Train size: (44, 8)
	Test size: (11, 8)
	Accuracy score: 0.9090909090909091
[day] validation with Random Forest 

### ExtraTreesClassifier

In [10]:
from sklearn.ensemble import ExtraTreesClassifier

def create_et(n_trees):
    return ExtraTreesClassifier(n_estimators=n_trees)

def cross_validate(name: str, data: pd.DataFrame):
    for t in TREES:
        for f in FOLDS:
            clf = create_et(t)
            print(f'[{name}] {f}-fold cross validation with Extra Trees Classifier ({t} trees):')
            scores = cross_val_score(clf, data, y, cv=f)
            print(f'Results:\n\tMin: {min(scores)}\n\tMax: {max(scores)}\n\tMean: {np.mean(scores)}\n\tStd: {np.std(scores)}')
            
def validate(name: str, data: pd.DataFrame):
    for t in TREES:
        X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=TEST_RATIO)
        clf = create_et(t)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print(f'[{name}] validation with Extra Trees Classifier ({t} trees):')
        print(f'\tTrain size: {X_train.shape}\n\tTest size: {X_test.shape}')
        print(f'\tAccuracy score: {score}')

In [11]:
for name, data in selected.items():
    cross_validate(name, data)

[all] 5-fold cross validation with Extra Trees Classifier (100 trees):
Results:
	Min: 0.6363636363636364
	Max: 0.9090909090909091
	Mean: 0.7454545454545454
	Std: 0.10601730717900547
[all] 10-fold cross validation with Extra Trees Classifier (100 trees):
Results:
	Min: 0.4
	Max: 1.0
	Mean: 0.8
	Std: 0.17826322609494585
[all] 5-fold cross validation with Extra Trees Classifier (500 trees):
Results:
	Min: 0.6363636363636364
	Max: 0.9090909090909091
	Mean: 0.7636363636363637
	Std: 0.09270944570168699
[all] 10-fold cross validation with Extra Trees Classifier (500 trees):
Results:
	Min: 0.4
	Max: 1.0
	Mean: 0.8133333333333332
	Std: 0.17269111795984826
[all] 5-fold cross validation with Extra Trees Classifier (1000 trees):
Results:
	Min: 0.6363636363636364
	Max: 0.9090909090909091
	Mean: 0.7818181818181819
	Std: 0.092709445701687
[all] 10-fold cross validation with Extra Trees Classifier (1000 trees):
Results:
	Min: 0.4
	Max: 1.0
	Mean: 0.7966666666666666
	Std: 0.1779200819344336
[night] 5-f

In [12]:
for name, data in hourly_selected.items():
    cross_validate(name, data)

[all] 5-fold cross validation with Extra Trees Classifier (100 trees):
Results:
	Min: 0.6363636363636364
	Max: 1.0
	Mean: 0.8
	Std: 0.12060453783110545
[all] 10-fold cross validation with Extra Trees Classifier (100 trees):
Results:
	Min: 0.4
	Max: 1.0
	Mean: 0.7766666666666666
	Std: 0.17194960502038575
[all] 5-fold cross validation with Extra Trees Classifier (500 trees):
Results:
	Min: 0.6363636363636364
	Max: 1.0
	Mean: 0.8181818181818181
	Std: 0.1149919149152138
[all] 10-fold cross validation with Extra Trees Classifier (500 trees):
Results:
	Min: 0.4
	Max: 1.0
	Mean: 0.8133333333333332
	Std: 0.1944793619441976
[all] 5-fold cross validation with Extra Trees Classifier (1000 trees):
Results:
	Min: 0.6363636363636364
	Max: 1.0
	Mean: 0.8
	Std: 0.12060453783110545
[all] 10-fold cross validation with Extra Trees Classifier (1000 trees):
Results:
	Min: 0.4
	Max: 1.0
	Mean: 0.7766666666666666
	Std: 0.17194960502038575
[night] 5-fold cross validation with Extra Trees Classifier (100 trees

In [13]:
for name, data in selected.items():
    validate(name, data)

[all] validation with Extra Trees Classifier (100 trees):
	Train size: (44, 14)
	Test size: (11, 14)
	Accuracy score: 0.8181818181818182
[all] validation with Extra Trees Classifier (500 trees):
	Train size: (44, 14)
	Test size: (11, 14)
	Accuracy score: 0.9090909090909091
[all] validation with Extra Trees Classifier (1000 trees):
	Train size: (44, 14)
	Test size: (11, 14)
	Accuracy score: 0.8181818181818182
[night] validation with Extra Trees Classifier (100 trees):
	Train size: (44, 47)
	Test size: (11, 47)
	Accuracy score: 0.9090909090909091
[night] validation with Extra Trees Classifier (500 trees):
	Train size: (44, 47)
	Test size: (11, 47)
	Accuracy score: 0.9090909090909091
[night] validation with Extra Trees Classifier (1000 trees):
	Train size: (44, 47)
	Test size: (11, 47)
	Accuracy score: 0.9090909090909091
[day] validation with Extra Trees Classifier (100 trees):
	Train size: (44, 2)
	Test size: (11, 2)
	Accuracy score: 0.8181818181818182
[day] validation with Extra Trees C

In [14]:
for name, data in hourly_selected.items():
    validate(name, data)

[all] validation with Extra Trees Classifier (100 trees):
	Train size: (44, 24)
	Test size: (11, 24)
	Accuracy score: 0.45454545454545453
[all] validation with Extra Trees Classifier (500 trees):
	Train size: (44, 24)
	Test size: (11, 24)
	Accuracy score: 1.0
[all] validation with Extra Trees Classifier (1000 trees):
	Train size: (44, 24)
	Test size: (11, 24)
	Accuracy score: 0.9090909090909091
[night] validation with Extra Trees Classifier (100 trees):
	Train size: (44, 61)
	Test size: (11, 61)
	Accuracy score: 0.7272727272727273
[night] validation with Extra Trees Classifier (500 trees):
	Train size: (44, 61)
	Test size: (11, 61)
	Accuracy score: 0.9090909090909091
[night] validation with Extra Trees Classifier (1000 trees):
	Train size: (44, 61)
	Test size: (11, 61)
	Accuracy score: 0.8181818181818182
[day] validation with Extra Trees Classifier (100 trees):
	Train size: (44, 8)
	Test size: (11, 8)
	Accuracy score: 0.9090909090909091
[day] validation with Extra Trees Classifier (500

### SVM

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

scores = ['precision', 'recall']
tuned_parameters = [
    {
        'kernel': ['rbf'], 
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000]
    },
    {
        'kernel': ['linear'],
        'C': [1, 10, 100, 1000]
    },
]

def grid_search_svc(name: str, data: pd.DataFrame):
    X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=TEST_RATIO, random_state=0)
    
    for score in scores:
        print(f'[{name}] SVC tuning for {score}')

        clf = GridSearchCV(SVC(), tuned_parameters, scoring=f'{score}_macro', n_jobs=-1)
        clf.fit(X_train, y_train)

        print("Best parameters set found on development set:")
        print(clf.best_params_)
        print("Grid scores on development set:")
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))

In [None]:
for name, data in selected.items():
    grid_search_svc(name, data)

In [None]:
for name, data in hourly_selected.items():
    grid_search_svc(name, data)

[all] SVC tuning for precision
Best parameters set found on development set:
{'C': 1, 'kernel': 'linear'}
Grid scores on development set:
0.272 (+/-0.022) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.272 (+/-0.022) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.272 (+/-0.022) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.272 (+/-0.022) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.272 (+/-0.022) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.272 (+/-0.022) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.272 (+/-0.022) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.272 (+/-0.022) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.629 (+/-0.313) for {'C': 1, 'kernel': 'linear'}
0.629 (+/-0.313) for {'C': 10, 'kernel': 'linear'}
0.629 (+/-0.313) for {'C': 100, 'kernel': 'linear'}
0.629 (+/-0.313) for {'C': 1000, 'kernel': 'linear'}
              precision    recall  f1-score   support

           0       0.86      0.75      0.80         8
           1       0.50    