In [1]:
import pandas as pd
pd.set_option('display.max_column', 250)
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-notebook')
from matplotlib import rcParams
rcParams['figure.figsize'] = (6, 4)
rcParams['figure.dpi'] = 150

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from read_data import read_sample_data, random_gridsearch

In [2]:
#split data
X_train, X_test, y_train, y_test = read_sample_data('../data/covtypeNorm.arff')



standard scaler
sc = StandardScaler()
X = sc.fit_transform(X)

In [3]:
#label encoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

## Random Forest

In [4]:
#random grid for random forests
rf_grid = {'bootstrap': [True, False],
               'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
               'max_features': ['auto', 'sqrt'],
               'min_samples_leaf': [1, 2, 4],
               'min_samples_split': [2, 5, 10],
               'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
              }

In [5]:
from read_data import random_gridsearch

rf_search = random_gridsearch(RandomForestClassifier(), rf_grid)
rf_search.fit(X_train, y_train)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   30.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 21.0min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 30.0min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed: 40.3min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 41.1min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=300, n_jobs=-1,
          param_distributions={'bootstrap': [True, False], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=5)

In [6]:
print("Score in test set:", rf_search.score(X_test, y_test))
print("Score in test set:", rf_search.score(X_test, y_test))
print(rf_search.best_params_)

Score in test set: 0.7960632764595182
Score in test set: 0.7960632764595182
{'n_estimators': 1200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': False}


## Gradient Boosting Machine

In [7]:
gbm_grid = {'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5],
            'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
            'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
           }

In [8]:
gbm_search = random_gridsearch(GradientBoostingClassifier(), gbm_grid)
gbm_search.fit(X_train, y_train)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 40.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 115.2min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 228.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 362.6min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 532.9min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed: 728.5min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 737.5min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=300, n_jobs=-1,
          param_distributions={'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=5)

In [9]:
print("Score in train set:", gbm_search.score(X_train, y_train))
print("Score in test set:", gbm_search.score(X_test, y_test))
print(gbm_search.best_params_)

Score in train set: 1.0
Score in test set: 0.7911171989472442
{'n_estimators': 1000, 'max_depth': 10, 'learning_rate': 0.05}


## Support Vector Machine

In [10]:
svm_grid = {
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 10, 50, 100],
    'gamma': [0.001, 0.01, 0.1, 1]}

In [11]:
svm_search = random_gridsearch(SVC(), svm_grid)
svm_search.fit(X_train, y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   54.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 432 out of 432 | elapsed:  5.8min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
          fit_params=None, iid='warn', n_iter=300, n_jobs=-1,
          param_distributions={'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 10, 50, 100], 'gamma': [0.001, 0.01, 0.1, 1]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=5)

In [12]:
print("Score in train set:", svm_search.score(X_train, y_train))
print("Score in test set:", svm_search.score(X_test, y_test))
print(svm_search.best_params_)

Score in train set: 0.796
Score in test set: 0.754367964556294
{'kernel': 'poly', 'gamma': 1, 'C': 0.5}


## XG Boost

In [13]:
xgb_grid = {"learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
              "max_depth": [ 3, 4, 5, 6, 8, 10, 12, 15],
              "min_child_weight": [ 1, 3, 5, 7],
              "gamma": [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
              "colsample_bytree": [ 0.3, 0.4, 0.5 , 0.7]
             }

In [33]:
xgb_search = random_gridsearch(XGBClassifier(), xgb_grid)
xgb_search.fit(X_train, y_train)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 17.7min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
          fit_params=None, iid='warn', n_iter=300, n_jobs=-1,
          param_distributions={'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3], 'max_depth': [3, 4, 5, 6, 8, 10, 12, 15], 'min_child_weight': [1, 3, 5, 7], 'gamma': [0.0, 0.1, 0.2, 0.3, 0.4], 'colsample_bytree': [0.3, 0.4, 0.5, 0.7]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=5)

In [34]:
print("Score in train set:", xgb_search.score(X_train, y_train))
print("Score in test set:", xgb_search.score(X_test, y_test))
print(xgb_search.best_params_)

Score in train set: 1.0
Score in test set: 0.785964528516767
{'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.25, 'gamma': 0.0, 'colsample_bytree': 0.7}


In [32]:
for i in X_train.columns:
    X_train[i] = X_train[i].astype(float)
    X_test[i] = X_test[i].astype(float)