# Bagging

In [63]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import graphviz # pip install

In [64]:
DATASET_DIR = os.path.join(os.path.dirname(os.getcwd()), 'datasets')

In [65]:
red_file = os.path.join(DATASET_DIR, 'winequality-red.csv')
white_file = os.path.join(DATASET_DIR, 'winequality-white.csv')

In [66]:
wine_red = pd.read_csv(red_file, sep=';')
wine_white = pd.read_csv(white_file, sep=';')

In [67]:
len(wine_red), len(wine_white)

(1599, 4898)

In [68]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

## 1. Feature_n: 2(volatile acidity, sulphates), Binary Problem

In [69]:
new_red = wine_red.copy()
new_red['is_red'] = 1
new_white = wine_white.copy()
new_white['is_red'] = 0

wine = new_red.append(new_white, ignore_index=True)

In [70]:
X = wine[['volatile acidity', 'sulphates']]
y = wine['is_red']

### split train&test

In [71]:
from sklearn.model_selection import train_test_split

In [72]:
train_X, test_X, train_y, test_y = train_test_split(X, y, stratify=wine['is_red'], random_state=42)

In [73]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
"""
RandomForestClassifier(n_estimators=’warn’, criterion=’gini’, max_depth=None, 
                       min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                        max_features=’auto’, max_leaf_nodes=None, min_impurity_decrease=0.0, 
                        min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, 
                        random_state=None, verbose=0, warm_start=False, class_weight=None)
                        
# n_estimators : integer, optional (default=10)
- The number of trees in the forest.

# criterion : string, optional (default=”gini”)
- The function to measure the quality of a split. 
- Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. 
Note: this parameter is tree-specific

# max_depth: 최대 깊이
# max_features: 최대 feature의 개수 default값이 none인데, 이는 기본적으로 feature전부가 들어간다.
이렇게 되면 esemble을 돌려도 모든 tree의 모양이 큰 차이가 없어진다. 따라서 설정을 해주는 것이 좋다.
# max_leaf_nodes: 최대 단계 노드의 수 제한
# ... (decision Tree 상속)

# bootstrap : boolean, optional (default=True)
        Whether bootstrap samples are used when building trees. If False, the
        whole datset is used to build each tree.
        
# oob_score : bool (default=False)
        Whether to use out-of-bag samples to estimate
        the generalization accuracy.
        하나의 데이터가 어디에도 들어가지 않을 확률은 36.5%가 존재한다. 
        그러한 데이터들을 따로 모아 out-of-bag라고 부르며, 이 데이터를 사용함으로써
        신뢰도를 높일 수 있다. 
        
"""
clf =  RandomForestClassifier(10, bootstrap=True, criterion='entropy', random_state=0, max_features='auto')
clf1 = tree.DecisionTreeClassifier('entropy', random_state=0)
# 맨 처음의 숫자 = 예측기(Classifier)의 숫자를 정해준다. 나무들(trees)을 몇개 만들 것인가?

clf.fit(train_X, train_y)
clf1.fit(train_X, train_y)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [74]:
print(clf.score(train_X, train_y))
print(clf1.score(train_X, train_y))

0.958743842364532
0.9614121510673235


In [75]:
print(clf.score(test_X, test_y))
print(clf1.score(test_X, test_y))

0.8929230769230769
0.8966153846153846


### attribute

In [76]:
# list of DecisionTreeClassifier
clf.estimators_

[DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False,
             random_state=209652396, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False,
             random_state=398764591, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
            

In [77]:
clf.classes_

array([0, 1], dtype=int64)

In [78]:
# The number of features when fit is performed.
clf.n_features_

2

In [79]:
clf.feature_importances_

array([0.59069184, 0.40930816])

In [80]:
# Score of the training dataset obtained using an out-of-bag estimate.
clf.oob_score

False

In [81]:
clf.predict(test_X)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [82]:
clf.predict_proba(test_X)

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

## Quality

In [83]:
new_red = wine_red.copy()
new_white = wine_white.copy()

In [84]:
X = new_red.drop('quality', axis=1)
y = new_red['quality']

In [85]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, stratify=y, random_state=42, test_size=.2)

In [86]:
train_y.head()

1542    6
1558    5
344     6
924     5
971     6
Name: quality, dtype: int64

In [87]:
clf =  RandomForestClassifier(5000, criterion='entropy', random_state=0, max_features='sqrt')
clf.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [88]:
clf.score(train_X, train_y)

1.0

In [89]:
clf.score(test_X, test_y)

0.6875

# Grid Search

In [90]:
"""
GridSearchCV(estimator, param_grid, scoring=None, n_jobs=None, iid=’warn’, refit=True, cv=’warn’, 
             verbose=0, pre_dispatch=‘2*n_jobs’, error_score=’raise-deprecating’, return_train_score=False)

Exhaustive search over specified parameter values for an estimator.

Important members are fit, predict.

GridSearchCV implements a “fit” and a “score” method. It also implements “predict”, “predict_proba”, “decision_function”, “transform” and “inverse_transform” if they are implemented in the estimator used.

The parameters of the estimator used to apply these methods are optimized by cross-validated grid-search over a parameter grid.


"""
""

''

In [96]:
# feature_len/3
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier(random_state=42)


param_grid = { 
    'n_estimators': [200, 400, 600, 800],
    'max_features': ['sqrt','log2'],
    'max_depth' : [20, 25, 35, 40],
    'criterion' :['gini', 'entropy']
}
CV_clf = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 5)


CV_clf.fit(train_X, train_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [200, 400, 600, 800], 'max_features': ['sqrt', 'log2'], 'max_depth': [20, 25, 35, 40], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [98]:
CV_clf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [97]:
CV_clf.cv_results_



{'mean_fit_time': array([0.45402608, 0.84464831, 1.17726731, 1.52108703, 0.46322646,
        0.83064747, 1.16406655, 1.51308641, 0.45562606, 0.83664784,
        1.16266651, 1.51328645, 0.45302591, 0.82864738, 1.16986699,
        1.51368656, 0.45082574, 0.83024755, 1.17506723, 1.52348714,
        0.45282593, 0.83084745, 1.18346777, 1.52168694, 0.45482612,
        0.82944741, 1.17586708, 1.55428886, 0.46822677, 0.88325052,
        1.19426832, 1.51308661, 0.6194356 , 1.18386769, 1.68229613,
        2.18352485, 0.61743546, 1.15586619, 1.69189682, 2.18512487,
        0.61683521, 1.15426598, 1.67649584, 2.18092484, 0.6258359 ,
        1.16846681, 1.67709584, 2.18752508, 0.61863551, 1.19866858,
        1.67969599, 2.19232545, 0.62623582, 1.15466609, 1.67469587,
        2.2197269 , 0.63203611, 1.19126816, 1.67449594, 2.18432488,
        0.61723528, 1.18346763, 1.69029651, 2.20372605]),
 'std_fit_time': array([0.00244955, 0.02266938, 0.03323081, 0.02094884, 0.01278203,
        0.01533184, 0.005

In [100]:
CV_clf.score(train_X, train_y)

1.0

In [101]:
CV_clf.score(test_X, test_y)

0.6875

In [102]:
# 최종적으로 선택을 할때, train 점수와 test 점수 중 무엇이 높은 것을 골라야 하는가? 
# 둘다 높은 것이 좋으며, 그래도 둘 중 골라야 한다면 test가 높은 것을 고르는게 적절할 것이다. 