In [17]:
import pandas as pd
pd.set_option('display.max_column', 250)
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-notebook')
from matplotlib import rcParams
rcParams['figure.figsize'] = (6, 4)
rcParams['figure.dpi'] = 150

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [3]:
#split data
from read_data import read_sample_data

X_train, X_test, y_train, y_test = read_sample_data('../data/SINE.arff')



In [7]:
#standard scaler
sc = StandardScaler()
X = sc.fit_transform(X)

In [4]:
#label encoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

## Random Forest

In [7]:
#random grid for random forests
random_grid = {'bootstrap': [True, False],
               'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
               'max_features': ['auto', 'sqrt'],
               'min_samples_leaf': [1, 2, 4],
               'min_samples_split': [2, 5, 10],
               'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
              }

In [11]:
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator=rf, 
                               param_distributions=random_grid, 
                               n_iter=300, cv=5, verbose=5, 
                               random_state=42, n_jobs=-1
                              )
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   40.9s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 15.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 25.0min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 33.4min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed: 44.1min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed: 55.4min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 68.8min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed: 70.3min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=300, n_jobs=-1,
          param_distributions={'bootstrap': [True, False], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=5)

In [13]:
print("Score in test set:", rf_random.score(X_test, y_test))
print(rf_random.best_params_)

Score in test set: 0.8683333333333333
{'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 10, 'bootstrap': True}


## Gradient Boosting Machine

In [14]:
random_grid = {'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5],
               'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
               'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
              }

In [18]:
gbm = GradientBoostingClassifier()
gbm_random = RandomizedSearchCV(estimator=gbm, 
                               param_distributions=random_grid, 
                               n_iter=300, cv=5, verbose=5, 
                               random_state=42, n_jobs=-1
                              )
# Fit the random search model
gbm_random.fit(X_train, y_train)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 24.7min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 53.5min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 106.0min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 143.6min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed: 187.7min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed: 243.3min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 339.5min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed: 346.8min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=300, n_jobs=-1,
          param_distributions={'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=5)

In [21]:
print("Score in test set:", gbm_random.score(X_test, y_test))
print("Score in test set:", gbm_random.score(X_train, y_train))
print(gbm_random.best_params_)


Score in test set: 0.8626444444444444
Score in test set: 0.9966
{'n_estimators': 400, 'max_depth': 10, 'learning_rate': 0.05}


In [22]:
params = {
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 10, 50, 100],
    'gamma': [0.001, 0.01, 0.1, 1]}

svm = SVC()

In [27]:
grid_search = RandomizedSearchCV(estimator=svm, param_distributions=params, 
                                 n_iter=300, cv=3, verbose=5, random_state=42, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   43.3s
[Parallel(n_jobs=-1)]: Done 432 out of 432 | elapsed:   59.4s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
          fit_params=None, iid='warn', n_iter=300, n_jobs=-1,
          param_distributions={'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 10, 50, 100], 'gamma': [0.001, 0.01, 0.1, 1]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=5)

In [28]:
print("Score in test set:", grid_search.score(X_test, y_test))
print("Score in test set:", grid_search.score(X_train, y_train))
print(grid_search.best_params_)


Score in test set: 0.8630888888888889
Score in test set: 0.872
{'kernel': 'rbf', 'gamma': 1, 'C': 50}
