In [1]:
import pandas as pd
pd.set_option('display.max_column', 250)
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-notebook')
from matplotlib import rcParams
rcParams['figure.figsize'] = (6, 4)
rcParams['figure.dpi'] = 150

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from read_data import read_sample_data, random_gridsearch

  from numpy.core.umath_tests import inner1d


In [2]:
#split data
X_train, X_test, y_train, y_test = read_sample_data('datasets/GasSensor.arff')



label encoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

## Random Forest

In [3]:
#random grid for random forests
rf_grid = {'bootstrap': [True, False],
               'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
               'max_features': ['auto', 'sqrt'],
               'min_samples_leaf': [1, 2, 4],
               'min_samples_split': [2, 5, 10],
               'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
              }

In [None]:
from read_data import random_gridsearch

rf_search = random_gridsearch(RandomForestClassifier(), rf_grid)
rf_search.fit(X_train, y_train)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 25.6min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 53.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 80.0min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 114.5min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed: 155.2min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 158.3min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=300, n_jobs=-1,
          param_distributions={'bootstrap': [True, False], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=5)

In [None]:
print("Score in test set:", rf_search.score(X_test, y_test))
print("Score in test set:", rf_search.score(X_test, y_test))
print(rf_search.best_params_)

Score in test set: 0.9924803591470258
Score in test set: 0.9924803591470258
{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 70, 'bootstrap': False}


## Gradient Boosting Machine

In [3]:
gbm_grid = {'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5],
            'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
            'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
           }

In [None]:
gbm_search = random_gridsearch(GradientBoostingClassifier(), gbm_grid)
gbm_search.fit(X_train, y_train)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  5.2min


In [None]:
print("Score in train set:", gbm_search.score(X_train, y_train))
print("Score in test set:", gbm_search.score(X_test, y_test))
print(gbm_search.best_params_)

## Support Vector Machine

In [None]:
svm_grid = {
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 10, 50, 100],
    'gamma': [0.001, 0.01, 0.1, 1]}

In [None]:
svm_search = random_gridsearch(SVC(), svm_grid)
svm_search.fit(X_train, y_train)

In [None]:
print("Score in train set:", svm_search.score(X_train, y_train))
print("Score in test set:", svm_search.score(X_test, y_test))
print(svm_search.best_params_)

## XG Boost

In [None]:
xgb_grid = {"learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
              "max_depth": [ 3, 4, 5, 6, 8, 10, 12, 15],
              "min_child_weight": [ 1, 3, 5, 7],
              "gamma": [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
              "colsample_bytree": [ 0.3, 0.4, 0.5 , 0.7]
             }

In [None]:
xgb_search = random_gridsearch(XGBClassifier(), xgb_grid)
xgb_search.fit(X_train, y_train)

In [None]:
print("Score in train set:", xgb_search.score(X_train, y_train))
print("Score in test set:", xgb_search.score(X_test, y_test))
print(xgb_search.best_params_)