<a href="https://colab.research.google.com/github/kjmobile/lb/blob/main/7_Cross_Validation_in_Decision_Tree__Q.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cross Validation and Grid Search

## Validation Set

In [None]:
import pandas as pd
# Load the wine dataset from a CSV hosted on GitHub into a DataFrame
# (needs network access when the cell runs).
wine = pd.read_csv('https://raw.githubusercontent.com/kjmobile/data/main/ml/wine_csv.csv')

In [None]:
# Feature matrix: the three input columns as a NumPy array.
data = wine[['alcohol', 'sugar', 'pH']].to_numpy()
# Label vector: the 'class' column that the classifiers below are fitted to predict.
target = wine['class'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test set; the fixed random_state makes
# the split reproducible.
train_input, test_input, train_target, test_target = train_test_split(
    data, target, test_size=0.2, random_state=17)

In [None]:
# Split the training portion once more: 80% becomes the sub-training set and
# 20% the validation set (roughly 64% / 16% of the full data).
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input, train_target, test_size=0.2, random_state=17)

In [None]:
# Show the (rows, columns) shape of the sub-training and validation sets.
print(f"{sub_input.shape} {val_input.shape}")

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree with default (unconstrained) settings on the sub-training set only.
dt = DecisionTreeClassifier(random_state=17)
dt.fit(sub_input, sub_target)

# Accuracy on the data the tree was fitted on, on the validation set, and on the test set.
# NOTE(review): the test set is scored here before any tuning; consider keeping
# it for the final model only so it stays an unbiased estimate.
print(dt.score(sub_input, sub_target))
print(dt.score(val_input, val_target))
print(dt.score(test_input,test_target))

# For decision trees, standard scaling is usually unnecessary because:
# 1. Each split compares one feature against a threshold, so rescaling a feature
#    does not change which samples fall on each side of the split.
# 2. Trees are robust to outliers.

In [None]:
# IPython help query: display the documentation of the `dt` object.
dt?

## Cross Validation

In [None]:
from sklearn.model_selection import cross_validate

# Cross-validate `dt` on the whole training set; the number of folds is left
# at its default. The result is a dict of per-fold values.
scores = cross_validate(dt, train_input, train_target)
print(scores)
# Exercise: if you change the number of folds from the default 5 to 10,
# how does the average accuracy change?

In [None]:
# IPython help query: display the documentation of cross_validate.
cross_validate?

In [None]:
import numpy as np
# Average of the per-fold validation scores from the cross-validation above.
print(scores['test_score'].mean())


In [None]:
from sklearn.model_selection import StratifiedKFold

# Same cross-validation, but with the splitter passed explicitly as `cv`
# (a StratifiedKFold with its default settings).
scores = cross_validate(dt, train_input, train_target, cv=StratifiedKFold())
print(np.mean(scores['test_score']))

In [None]:
# When the `cv` parameter needs a more detailed specification, configure the
# splitter yourself: here 10 stratified folds, shuffled with a fixed seed.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
print(np.mean(scores['test_score']))

## Tuning the Hyperparameter

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate values for the single hyperparameter explored by the grid search.
params = {'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]}

In [None]:
# Grid search over `params` for a decision tree, using cross-validation.
gs = GridSearchCV(DecisionTreeClassifier(random_state=17), params, n_jobs=-1)

# It will train 25 models = 5 parameter values x 5 folds (default).
# n_jobs=-1 means use all the cores available on your computer.

In [None]:
# IPython help query: display the documentation of GridSearchCV.
GridSearchCV?

In [None]:
# Run the grid search on the full training set (no manual validation split needed).
gs.fit(train_input, train_target)

In [None]:
# Once the grid search has finished, scikit-learn automatically retrains a model
# on the entire training set using the parameter combination that achieved the
# highest validation score among the 25 cross-validated fits.
# That best-performing model is stored in the `best_estimator_` attribute of the
# grid search object and can be used like a regular decision tree.
dt_gs = gs.best_estimator_
print(dt_gs.score(train_input, train_target))

In [None]:
# Parameter setting selected by the grid search.
print(gs.best_params_)

In [None]:
# Plot the mean validation score of every candidate to see how the best
# parameter compares with the others.
import matplotlib.pyplot as plt
plt.figure(figsize=(10,2))
# x: candidate values from `params`; y: mean cross-validation score per candidate.
plt.plot(params['min_impurity_decrease'], gs.cv_results_['mean_test_score'])

In [None]:
# Mean cross-validation score of each candidate parameter value.
print(gs.cv_results_['mean_test_score'])

In [None]:
# Find the best candidate by hand: index of the highest mean validation score,
# then the parameter setting recorded at that index.
best_index = np.argmax(gs.cv_results_['mean_test_score'])
print(gs.cv_results_['params'][best_index])

### Adding more hyperparameter combinations

In [None]:
params_1 = {'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001), # 9 variations: 0.0001 ... 0.0009 (stop value excluded)
          'max_depth': range(5, 20, 1), # 15 variations: 5 ... 19
          'min_samples_split': range(2, 100, 10) # 10 variations: 2, 12, ..., 92
          }
# In total 1350 parameter combinations are created (= 9 x 15 x 10).
# Each combination is cross-validated with the default 5 folds,
# hence 6750 (= 5 x 1350) models are fitted during the search.

In [None]:
# Grid search over the larger three-parameter grid, run in parallel on all cores.
gs_1 = GridSearchCV(DecisionTreeClassifier(random_state=17), params_1, n_jobs=-1)
gs_1.fit(train_input, train_target)

In [None]:
# Best parameter combination found in the larger grid.
print(gs_1.best_params_)

In [None]:
# Highest mean cross-validation score among all combinations tried by gs_1.
print(gs_1.cv_results_['mean_test_score'].max())

In [None]:
# Shape of the mean-score array: one entry per parameter combination evaluated.
gs_1.cv_results_['mean_test_score'].shape

### Random Search

  + Random Search tests a random subset of the parameter space, which makes it faster and better suited to high-dimensional parameter spaces.
  + Random Search is computationally less expensive, yet it can also uncover effective parameters in regions of the space that Grid Search would not examine.

In [None]:
# To run a random search we need random number generators that draw values
# from a probability distribution (here: uniform distributions).
# Uniform distribution: a probability distribution in which all outcomes are equally likely.
from scipy.stats import uniform, randint
rgen = randint(0, 10) # random integers (discrete): 0..9, upper bound excluded
rgen.rvs(10) # draw 10 samples

In [None]:
ugen = uniform(0, 1) # random real numbers (continuous) between 0 and 1
ugen.rvs(10) # draw 10 samples

In [None]:
# Distributions to sample from, one per hyperparameter.
# NOTE: scipy's uniform(loc, scale) covers [loc, loc + scale], so
# min_impurity_decrease is drawn from [0.0001, 0.0011], not [0.0001, 0.001].
# randint(low, high) excludes `high`, e.g. max_depth is drawn from 20..49.
params_rs = {'min_impurity_decrease': uniform(0.0001, 0.001),
          'max_depth': randint(20, 50),
          'min_samples_split': randint(2, 25),
          'min_samples_leaf': randint(1, 25),
          }

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Sample n_iter=100 parameter settings from the distributions in params_rs and
# cross-validate each one; the fixed random_state values keep the run reproducible.
cv_rs = RandomizedSearchCV(DecisionTreeClassifier(random_state=17), params_rs,
                        n_iter=100, n_jobs=-1, random_state=17)
cv_rs.fit(train_input, train_target)

In [None]:
# Best parameter setting among the sampled candidates.
print(cv_rs.best_params_)

In [None]:
# Highest mean cross-validation score among the settings sampled by cv_rs.
print(cv_rs.cv_results_['mean_test_score'].max())

In [None]:
# Take the model refitted with the best sampled parameters ...
dt_rs = cv_rs.best_estimator_

# ... and measure its accuracy on the held-out test set.
print(dt_rs.score(test_input, test_target))

In [None]:
tree.plot_tree?

In [None]:
# Draw the decision tree selected by the random search (dt_rs).

from sklearn import tree
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(25, 20))
# 'alcohol', 'sugar' and 'pH' are the input columns, so they are passed as
# `feature_names` (used to label the split conditions). Passing them as
# `class_names` would mislabel the predicted classes with feature names.
tree.plot_tree(dt_rs, fontsize=10, feature_names=['alcohol', 'sugar', 'pH'], filled=True)
plt.show()
