In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import sklearn.metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier

In [8]:
# Load the training split; its 'label' column is separated out as the target further down.
train = pd.read_csv('./data/train.csv')

In [10]:
# Load the test split.
# NOTE(review): `test` is not referenced by any other cell visible in this section.
test = pd.read_csv('./data/test.csv')

In [20]:
# Separate the training frame into the feature matrix and the target vector.
X = train.drop(columns='label')   # every column except the target
y = train['label']                # the target the classifier learns to predict

## Grid Search

In [21]:
# Base estimator handed to both the grid search and the randomized search below.
# Only the seed is fixed here; the searches supply the remaining hyperparameters.
forest = RandomForestClassifier(random_state = 42)

In [22]:
# Grid for the exhaustive search: 1 x 2 x 2 = 4 candidate settings.
param_grid = [
    dict(
        # Number of trees in the forest.
        n_estimators=[5],
        # Fraction of the features examined when looking for the best split at
        # each node; the smaller the value, the more randomness per tree.
        max_features=[0.3, 0.4],
        # True:  every tree is trained on a bootstrap sample (drawn with replacement).
        # False: no bootstrap resampling (per the scikit-learn docs, each tree
        #        is then built from the whole training set).
        bootstrap=[True, False],
    )
]

In [23]:
# Exhaustive search: every combination in param_grid is cross-validated and
# ranked by accuracy.
grid_search = GridSearchCV(
    estimator=forest,
    param_grid=param_grid,
    scoring='accuracy',
    cv=2,                        # number of cross-validation folds
    n_jobs=2,                    # run the fits on two parallel workers
    return_train_score=True,     # also keep training-fold scores in cv_results_
    verbose=1,                   # print a progress summary while fitting
)

In [24]:
# Run the grid search: 4 candidates x 2 folds = 8 fits (about 54 s in the recorded run).
grid_search.fit(X, y)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   8 out of   8 | elapsed:   53.5s finished


GridSearchCV(cv=2, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=42,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=2,
             param_grid=[{'bo

In [26]:
# Hyperparameter combination that ranked best in the grid search (scored by accuracy).
grid_search.best_params_

{'bootstrap': False, 'max_features': 0.3, 'n_estimators': 5}

In [28]:
# List the mean cross-validated test score next to the settings of each candidate.
cvres = grid_search.cv_results_
mean_scores = cvres["mean_test_score"]
candidates = cvres["params"]

for score, candidate in zip(mean_scores, candidates):
    print(score, candidate)

0.9085 {'bootstrap': True, 'max_features': 0.3, 'n_estimators': 5}
0.9074047619047619 {'bootstrap': True, 'max_features': 0.4, 'n_estimators': 5}
0.9190238095238096 {'bootstrap': False, 'max_features': 0.3, 'n_estimators': 5}
0.9165714285714286 {'bootstrap': False, 'max_features': 0.4, 'n_estimators': 5}


## Random Search

In [30]:
from scipy.stats import uniform as sp_uniform # float uniform dist
from scipy.stats import randint as sp_randint # int uniform dist

In [31]:
# Sampling space for the randomized search. Lists are sampled uniformly from
# their entries; scipy distributions are sampled via their rvs() method.
param_dist = dict(
    # Continuous uniform on [loc, loc + scale] = [0.3, 0.8].
    # NOTE(review): the second argument is a width, not an upper bound -- confirm
    # that 0.8 (rather than 0.5) is the intended maximum.
    max_features=sp_uniform(loc=0.3, scale=0.5),
    bootstrap=[True, False],
    criterion=['gini', 'entropy'],
    # Fixed at two trees.
    n_estimators=[2],
    # Integer depth drawn from 5..24: `high` is exclusive, so 25 itself is never drawn.
    max_depth=sp_randint(low=5, high=25),
)

In [32]:
# Randomized search: draws n_iter candidate settings from param_dist and
# evaluates each with 5-fold cross-validation.
#
# random_state pins the sampling of candidates. Without it a different set of
# 20 settings is drawn on every run, so best_params_ cannot be reproduced after
# a kernel restart. The seed matches the one used for the forest itself.
#
# scoring is spelled out to mirror the grid search; accuracy is also what a
# classifier's default score() reports, so the ranking metric is unchanged.
random_search = RandomizedSearchCV(forest,
                                  param_dist,
                                  n_iter = 20,
                                  cv = 5,
                                  scoring = 'accuracy',
                                  random_state = 42,
                                  verbose = 1,
                                  n_jobs = 2)

In [33]:
# Run the randomized search: 20 candidates x 5 folds = 100 fits (about 10 min in the recorded run).
random_search.fit(X,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  3.9min
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  9.9min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [34]:
# Best-ranked hyperparameter combination among the randomly sampled candidates.
random_search.best_params_

{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 0.778808895579415,
 'n_estimators': 2}