In [23]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#### bike rentals 데이터셋 활용

In [22]:
# Download bike_rentals_cleaned dataset
df_bikes = pd.read_csv('bike_rentals_cleaned.csv')

X_bikes = df_bikes.iloc[:,:-1]
y_bikes = df_bikes.iloc[:,-1]

### DecisionTree Regressor

In [2]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.model_selection import KFold, StratifiedKFold

In [3]:
tree_reg = DecisionTreeRegressor(random_state=2)

scores = cross_val_score(tree_reg, X_bikes, y_bikes, scoring='neg_mean_squared_error', cv=5)
rmse = np.sqrt(-scores)
print('RMSE mean: %0.2f' % (rmse.mean()))

RMSE mean: 1229.31


In [7]:
# GridSearchCV (cv=5)
def grid_search(model, params):
    grid_reg = GridSearchCV(model, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
    grid_reg.fit(X_bikes, y_bikes)

    best_params = grid_reg.best_params_
    best_score=grid_reg.best_score_
    rmse=np.sqrt(-best_score)
    print("Best params:", best_params) 
    print('bestscore',rmse.round(2))

In [8]:
# Hyperparameter의 범위를 좁힘
model=DecisionTreeRegressor(random_state=1)
parms={'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01]}  # 노드 생성시의 최소 가중치
grid_search(model=model,params=parms)

Best params: {'min_weight_fraction_leaf': 0.0025}
bestscore 1253.19


In [12]:
model=DecisionTreeRegressor(random_state=1)
parms={'min_samples_split':[2, 3, 4, 5, 6, 8, 10]}  # 노드 생성시 필요한 최소한 샘플 데이터 수
grid_search(model=model,params=parms)

Best params: {'min_samples_split': 5}
bestscore 1234.18


In [13]:
model=DecisionTreeRegressor(random_state=1)
parms={'max_depth':[None, 2,4,6,8,10,20,40]}  # 트리의 최대 높이
grid_search(model=model,params=parms)

Best params: {'max_depth': None}
bestscore 1258.77


In [14]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X_bikes,y_bikes)

# Randomized CV
parm_all={'max_depth':[15,20,25],
          'min_samples_split':[4,5,6],
          'min_weight_fraction_leaf':[0.0015, 0.0025,0.003]}
rand_dtc = RandomizedSearchCV(DecisionTreeRegressor(random_state=1), 
                              param_distributions=parm_all, 
                              n_iter=10,  # 10개의 조합을 생성한 뒤 최고점 가져오기
                              scoring='neg_mean_squared_error', 
                              cv=5, n_jobs=-1, random_state=2)
rand_dtc.fit(X_train,y_train)
print(rand_dtc.best_estimator_)
print('Training score',np.sqrt(-rand_dtc.best_score_).round(2))


best_model = rand_dtc.best_estimator_
y_pred = best_model.predict(X_test)

from sklearn.metrics import mean_squared_error
rmse_test = mean_squared_error(y_test, y_pred)**0.5
print('Test score: {:.3f}'.format(rmse_test))

DecisionTreeRegressor(max_depth=25, min_samples_split=6,
                      min_weight_fraction_leaf=0.0025, random_state=1)
Training score 921.57
Test score: 789.089


In [15]:
from sklearn.model_selection import KFold, StratifiedKFold

# inner CV와 outer CV 정의
inner_cv=KFold(n_splits=3,shuffle=True,random_state=0)
out_cv=KFold(n_splits=5,shuffle=True,random_state=0)

parm_all={'max_depth':[15,20,25],
          'min_samples_split':[4,5,6],
          'min_weight_fraction_leaf':[0.0015, 0.0025,0.003]}
rand_dtc = RandomizedSearchCV(DecisionTreeRegressor(random_state=1), 
                              param_distributions=parm_all, 
                              n_iter=10, 
                              scoring='neg_mean_squared_error', 
                              cv=inner_cv, n_jobs=-1, random_state=2)
scores=cross_val_score(rand_dtc,X_bikes,y_bikes,scoring='neg_mean_squared_error',cv=out_cv)    

rmse=np.sqrt(-scores)
print("RMSE: %.3f, +/-%.3f" %(np.mean(rmse),np.std(rmse)))

RMSE: 870.462, +/-68.600


### RandomForest Regressor

In [16]:
from sklearn.ensemble import RandomForestRegressor  # bagging의 한 종류
rf = RandomForestRegressor(random_state=2, n_jobs=-1)

scores = cross_val_score(rf, X_bikes, y_bikes, scoring='neg_mean_squared_error', cv=5)

# Take square root of the scores
rmse = np.sqrt(-scores)

# Display mean score
print('RMSE mean: %0.2f' % (rmse.mean()))

RMSE mean: 1006.17


In [17]:
parms={'n_estimators':[50, 100, 200, 300, 500]}  # bootstrap 샘플 개수 (default=100)
grid_search(rf,parms)

Best params: {'n_estimators': 50}
bestscore 1002.37


In [18]:
parms={'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05]}
grid_search(rf,parms)

Best params: {'min_weight_fraction_leaf': 0.0}
bestscore 1027.79


In [19]:
parms={'min_samples_split':[2, 3, 4, 5, 6, 8, 10]}  # 분할시 최소 노드의 수
grid_search(rf,parms)

Best params: {'min_samples_split': 2}
bestscore 1027.79


In [20]:
parms={'max_features':['auto', 0.8, 0.7, 0.6, 0.5, 0.4]}  # 각 노드 분할시 최대 몇개의 변수 사용 (auto=모두 사용, 0.8=80%만 사용)
grid_search(rf,parms)

Best params: {'max_features': 0.4}
bestscore 944.03


In [21]:
parms={'max_depth':[None,2, 4, 6, 8, 10, 20]}
grid_search(rf,parms)

Best params: {'max_depth': None}
bestscore 1027.79


In [24]:
def randomized_search_reg(model,params, cv=10,runs=10):
    rand_reg = RandomizedSearchCV(model, params, n_iter=runs, scoring='neg_mean_squared_error', 
                                  cv=10, n_jobs=-1, random_state=2)
    
    rand_reg.fit(X_train, y_train)

    best_model = rand_reg.best_estimator_
    best_params = rand_reg.best_params_

    print("Best params:", best_params)
    
    best_score = np.sqrt(-rand_reg.best_score_)
    print("Training score: {:.3f}".format(best_score))
    return best_model

In [27]:
rf=RandomForestRegressor(random_state=2, n_jobs=-1)
cv=KFold(n_splits=3,shuffle=True,random_state=0)

params={'n_estimators':[50,100,200,300],
        'max_features':['auto',0.4,0.5,0.6],
        'min_samples_split':[4,5,6,7],
        'max_depth':[None,2,4,6]}
best_model=randomized_search_reg(rf,params,cv=cv,runs=20)

y_pred = best_model.predict(X_test)

from sklearn.metrics import mean_squared_error
rmse_test = mean_squared_error(y_test, y_pred)**0.5
print('Test score: {:.3f}'.format(rmse_test))

Best params: {'n_estimators': 300, 'min_samples_split': 4, 'max_features': 0.5, 'max_depth': None}
Training score: 655.719
Test score: 723.939


In [28]:
from sklearn.model_selection import KFold, StratifiedKFold

inner_cv=KFold(n_splits=3,shuffle=True,random_state=0)
out_cv=KFold(n_splits=5,shuffle=True,random_state=0)

parm_all={'n_estimators':[50,100,200],
          'max_features':['auto',0.4,0.5,0.6],
          'min_samples_split':[4,5,6,7],
          'max_depth':[None,2,4,6]}
rand_rf = RandomizedSearchCV(RandomForestRegressor(random_state=2, n_jobs=-1), 
                             param_distributions=parm_all, 
                             n_iter=20, 
                             scoring='neg_mean_squared_error', 
                             cv=inner_cv, n_jobs=-1, random_state=2)
scores=cross_val_score(rand_rf,X_bikes,y_bikes,scoring='neg_mean_squared_error',cv=out_cv)    

rmse=np.sqrt(-scores)
print("RMSE: %.3f, +/-%.3f" %(np.mean(rmse),np.std(rmse)))

RMSE: 642.191, +/-80.626


### GradientBoosting Regressor

In [23]:
from sklearn.ensemble import GradientBoostingRegressor
gb_r =  GradientBoostingRegressor(random_state=2)

scores = cross_val_score(gb_r, X_bikes, y_bikes, scoring='neg_mean_squared_error', cv=5)
# Take square root of the scores
rmse = np.sqrt(-scores)

# Display mean score
print('RMSE mean: %0.2f' % (rmse.mean()))

RMSE mean: 940.25


In [24]:
parms={'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.15, 0.2, 0.3, 0.5, 1.0]}  # 학습률
grid_search(gb_r,parms)

Best params: {'learning_rate': 0.2}
bestscore 949.82


In [25]:
parms={'n_estimators':[100,300, 500, 1000]}
grid_search(gb_r,parms)

Best params: {'n_estimators': 100}
bestscore 973.96


In [26]:
parms={'subsample':[1, 0.9, 0.8, 0.7, 0.6, 0.5]}  # 과대적합을 막기 위해서 사용함
grid_search(gb_r,parms)

Best params: {'subsample': 0.7}
bestscore 915.24


In [27]:
parms={'max_depth':[None, 1, 2, 3, 4]}
grid_search(gb_r,parms)

Best params: {'max_depth': 2}
bestscore 919.25


In [28]:
gb_r=GradientBoostingRegressor(random_state=2)
cv=KFold(n_splits=3,shuffle=True,random_state=0)

params={'n_estimators':[50,100,200],
        'subsample':[0.65,0.7,0.75],
        'learning_rate':[0.1,0,2,0.25],
        'max_depth':[1,2,3]}
best_model=randomized_search_reg(gb_r,params,cv=cv,runs=20)
y_pred = best_model.predict(X_test)

from sklearn.metrics import mean_squared_error
rmse_test = mean_squared_error(y_test, y_pred)**0.5
print('Test score: {:.3f}'.format(rmse_test))

Best params: {'subsample': 0.7, 'n_estimators': 200, 'max_depth': 2, 'learning_rate': 0.1}
Training score: 644.829
Test score: 597.990


In [29]:
from sklearn.model_selection import KFold, StratifiedKFold

inner_cv=KFold(n_splits=3,shuffle=True,random_state=0)
out_cv=KFold(n_splits=5,shuffle=True,random_state=0)

parm_all={'n_estimators':[50,100,200],
          'subsample':[0.65,0.7,0.75],
          'learning_rate':[0.1,0,2,0.25],
          'max_depth':[1,2,3]}
rand_rf = RandomizedSearchCV(GradientBoostingRegressor(random_state=2), 
                             param_distributions=parm_all, 
                             n_iter=20, 
                             scoring='neg_mean_squared_error', 
                             cv=inner_cv, n_jobs=-1, random_state=2)
scores=cross_val_score(rand_rf,X_bikes,y_bikes,scoring='neg_mean_squared_error',cv=out_cv)    

rmse=np.sqrt(-scores)
print("RMSE: %.3f, +/-%.3f" %(np.mean(rmse),np.std(rmse)))

RMSE: 621.252, +/-43.696


#### heart disease 데이터셋 활용

In [12]:
# Upload heart.csv to dataFrame
df_heart = pd.read_csv('heart_disease.csv')

# Show first five rows
df_heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [13]:
df_heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [32]:
# split data into X and y
X = df_heart.iloc[:,:-1]
y = df_heart.iloc[:,-1]

### DecisionTree/RandomForest/GradientBoosting Classifier

In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.model_selection import KFold, StratifiedKFold

In [34]:
# Initialize Decision Tree Classifier
model = DecisionTreeClassifier(random_state=2)
cv=StratifiedKFold(n_splits=3,shuffle=True,random_state=0)

scores = cross_val_score(model, X, y, cv=cv)

print('Accuracy:', np.round(scores, 2))
print('Accuracy mean: %0.2f' % (scores.mean()))

Accuracy: [0.7  0.76 0.72]
Accuracy mean: 0.73


In [35]:
# Initialize Random Forest Classifier
model = RandomForestClassifier(random_state=2)
cv=StratifiedKFold(n_splits=3,shuffle=True,random_state=0)

scores = cross_val_score(model, X, y, cv=cv)

print('Accuracy:', np.round(scores, 2))
print('Accuracy mean: %0.2f' % (scores.mean()))

Accuracy: [0.77 0.88 0.81]
Accuracy mean: 0.82


In [36]:
# Initialize Gradient Boosting Classifier
model = GradientBoostingClassifier(random_state=2)
cv=StratifiedKFold(n_splits=3,shuffle=True,random_state=0)

scores = cross_val_score(model, X, y, cv=cv)

print('Accuracy:', np.round(scores, 2))
print('Accuracy mean: %0.2f' % (scores.mean()))

Accuracy: [0.78 0.82 0.77]
Accuracy mean: 0.79
