In [1]:
import pandas as pd
pd.set_option('display.max_column', 250)
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-notebook')
from matplotlib import rcParams
rcParams['figure.figsize'] = (6, 4)
rcParams['figure.dpi'] = 150

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from read_data import read_sample_data, random_gridsearch

In [2]:
#split data
X_train, X_test, y_train, y_test = read_sample_data('../data/SEA.arff')



standard scaler
sc = StandardScaler()
X = sc.fit_transform(X)

In [3]:
#label encoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

## Random Forest

In [4]:
#random grid for random forests
rf_grid = {'bootstrap': [True, False],
               'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
               'max_features': ['auto', 'sqrt'],
               'min_samples_leaf': [1, 2, 4],
               'min_samples_split': [2, 5, 10],
               'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
              }

In [5]:
from read_data import random_gridsearch

rf_search = random_gridsearch(RandomForestClassifier(), rf_grid)
rf_search.fit(X_train, y_train)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 15.0min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 21.3min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed: 28.5min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 29.1min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=300, n_jobs=-1,
          param_distributions={'bootstrap': [True, False], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=5)

In [8]:
print("Score in test set:", rf_search.score(X_test, y_test))
print("Score in test set:", rf_search.score(X_test, y_test))
print(rf_search.best_params_)

Score in test set: 0.8684
Score in test set: 0.8684
{'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': True}


## Gradient Boosting Machine

In [11]:
gbm_grid = {'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5],
            'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
            'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
           }

In [12]:
gbm_search = random_gridsearch(GradientBoostingClassifier(), gbm_grid)
gbm_search.fit(X_train, y_train)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 26.5min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 49.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 76.0min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 109.7min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed: 155.6min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 158.1min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=300, n_jobs=-1,
          param_distributions={'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=5)

In [13]:
print("Score in train set:", gbm_search.score(X_train, y_train))
print("Score in test set:", gbm_search.score(X_test, y_test))
print(gbm_search.best_params_)

Score in train set: 0.99
Score in test set: 0.8627333333333334
{'n_estimators': 800, 'max_depth': 10, 'learning_rate': 0.05}


## Support Vector Machine

In [None]:
svm_grid = {
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 10, 50, 100],
    'gamma': [0.001, 0.01, 0.1, 1]}

In [None]:
svm_search = random_gridsearch(SVC(), svm_grid)
svm_search.fit(X_train, y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   41.2s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  5.4min


In [None]:
print("Score in train set:", svm_search.score(X_train, y_train))
print("Score in test set:", svm_search.score(X_test, y_test))
print(svm_search.best_params_)

## XG Boost

In [None]:
xgb_grid = {"learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
              "max_depth": [ 3, 4, 5, 6, 8, 10, 12, 15],
              "min_child_weight": [ 1, 3, 5, 7],
              "gamma": [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
              "colsample_bytree": [ 0.3, 0.4, 0.5 , 0.7]
             }

In [None]:
xgb_search = random_gridsearch(XGBClassifier(), xgb_grid)
xgb_search.fit(X_train, y_train)

In [None]:
print("Score in train set:", xgb_search.score(X_train, y_train))
print("Score in test set:", xgb_search.score(X_test, y_test))
print(xgb_search.best_params_)