In [1]:
import warnings

import numpy as np
import pandas as pd

from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 100)
pd.set_option("float_format", "{:.4f}".format)

In [2]:
RANDOM_STATE = 42

# Data Definition

In [3]:
_data = load_iris()
data = _data['data']
feature_names = _data['feature_names']

df = pd.DataFrame(data, columns=feature_names)
df['target'] = _data['target']
df['target'] = df['target'].map(
    {
        0: 'Iris-Setosa',
        1: 'Iris-Versicolour',
        2: 'Iris-Virginica',
    }
)

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,Iris-Setosa
1,4.9,3.0,1.4,0.2,Iris-Setosa
2,4.7,3.2,1.3,0.2,Iris-Setosa
3,4.6,3.1,1.5,0.2,Iris-Setosa
4,5.0,3.6,1.4,0.2,Iris-Setosa


In [4]:
X = df.drop('target', axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

In [5]:
print(
    X_train.shape,
    X_test.shape,
)

(120, 4) (30, 4)


# Cross Validation

## K-Fold Cross Validation

### K-Fold

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, cross_val_score

In [7]:
SPLIT_NUM = 5

In [8]:
kfold = KFold(n_splits=SPLIT_NUM)

In [9]:
ex = kfold.split(X_train)

print(type(ex))
next(ex)

<class 'generator'>


(array([ 24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,
         37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,
         50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,
         63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,
         76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,
         89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101,
        102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114,
        115, 116, 117, 118, 119]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23]))

In [10]:
acc_train_list = []
acc_val_list = []

In [11]:
model = DecisionTreeClassifier(random_state=RANDOM_STATE)

for train_index, val_index in kfold.split(X_train, y_train):
    # Dataset Definition
    X_val, y_val = X_train.iloc[val_index], y_train.iloc[val_index]

    # Train
    model.fit(X_train, y_train)

    # Prediction
    pred_train = model.predict(X_train)
    pred_val = model.predict(X_val)

    # Train/Val Results
    acc_train = accuracy_score(y_train, pred_train)
    acc_val = accuracy_score(y_val, pred_val)

    acc_train_list.append(acc_train)
    acc_val_list.append(acc_val)

In [12]:
print(
    "\n === train === \n",
    acc_train_list,
    np.mean(acc_train_list),
    "\n === val === \n",
    acc_val_list,
    np.mean(acc_val_list),
)


 === train === 
 [1.0, 1.0, 1.0, 1.0, 1.0] 1.0 
 === val === 
 [1.0, 1.0, 1.0, 1.0, 1.0] 1.0


### Train2Val

In [13]:
kfold = KFold(n_splits=SPLIT_NUM)

acc_train_list = []
acc_val_list = []

n_iter = 0
model = DecisionTreeClassifier(random_state=RANDOM_STATE)

In [14]:
for train_index, val_index in kfold.split(X_train, y_train):
    # Dataset Definition
    X_val, y_val = X_train.iloc[val_index], y_train.iloc[val_index]

    # Train
    model.fit(X_train, y_train)

    # Prediction
    pred_train = model.predict(X_train)
    pred_val = model.predict(X_val)

    # Train/Val Results
    acc_train = accuracy_score(y_train, pred_train)
    acc_val = accuracy_score(y_val, pred_val)

    acc_train_list.append(acc_train)
    acc_val_list.append(acc_val)

    # Iteration Info
    n_iter += 1

    train_size = X_train.shape[0]
    val_size = X_val.shape[0]

    train_label = y_train.value_counts()
    val_label = y_val.value_counts()

    # Progress
    print(f"\n Fold: {n_iter}, Train_size: {train_size}, Val_size: {val_size}")
    print(f"Train accuracy: {acc_train}, Val accuracy: {acc_val}")
    print(f"Val Index: {val_index}")
    print(f"Train Label: {train_label}")
    print(f"Val Label: {val_label}")

# Results
print(f"\n ======== \n")
print(f"TRAIN Accuracy: {np.mean(acc_train_list)}")
print(f"VAL Accuracy: {np.mean(acc_val_list)}")


 Fold: 1, Train_size: 120, Val_size: 24
Train accuracy: 1.0, Val accuracy: 1.0
Val Index: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
Train Label: target
Iris-Versicolour    41
Iris-Setosa         40
Iris-Virginica      39
Name: count, dtype: int64
Val Label: target
Iris-Setosa         10
Iris-Versicolour     8
Iris-Virginica       6
Name: count, dtype: int64

 Fold: 2, Train_size: 120, Val_size: 24
Train accuracy: 1.0, Val accuracy: 1.0
Val Index: [24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47]
Train Label: target
Iris-Versicolour    41
Iris-Setosa         40
Iris-Virginica      39
Name: count, dtype: int64
Val Label: target
Iris-Setosa         9
Iris-Versicolour    8
Iris-Virginica      7
Name: count, dtype: int64

 Fold: 3, Train_size: 120, Val_size: 24
Train accuracy: 1.0, Val accuracy: 1.0
Val Index: [48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71]
Train Label: target
Iris-Versicolour    41
Iris-Seto

## Cross Validation Score

[Reference] https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html

In [15]:
from sklearn.model_selection import cross_validate, cross_val_score

In [16]:
model = DecisionTreeClassifier(random_state=RANDOM_STATE)
cv_scores = cross_validate(
    model,
    X_train,
    y_train,
    cv=5,
    scoring="accuracy",
)

In [17]:
cv_scores

{'fit_time': array([0.00175309, 0.00153232, 0.00105095, 0.0011301 , 0.0009222 ]),
 'score_time': array([0.00117898, 0.00088596, 0.00072813, 0.0008049 , 0.00062275]),
 'test_score': array([0.95833333, 1.        , 0.83333333, 0.95833333, 0.95833333])}

## Grid Search Cross Validation

[Reference 1, Grid Serch]
 https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

[Reference 2, DecisionTreeClassifier]
 https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [18]:
from sklearn.model_selection import GridSearchCV

### Set-up

In [19]:
# model
model = DecisionTreeClassifier(random_state=RANDOM_STATE)

In [20]:
# Hyper-parameter 정의
cv_num = 5
max_depth = [1, 2, 3, 10]
min_samples_split = [5, 10, 15]

In [21]:
# Grid 정의
param_grid = {
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
}

### GridSearchCV

In [22]:
grid_search = GridSearchCV(
    model,
    param_grid=param_grid,
    cv=cv_num,
    scoring="accuracy",
    return_train_score=True,
)

grid_search.fit(X_train, y_train)

In [22]:
# grid_search.cv_results_

In [23]:
df_scores = pd.DataFrame(grid_search.cv_results_)

df_scores[
    [
        "params",
        "rank_test_score",
        "mean_test_score",
        "split0_test_score",
        "split1_test_score",
    ]
].sort_values("rank_test_score", ascending=True)

Unnamed: 0,params,rank_test_score,mean_test_score,split0_test_score,split1_test_score
6,"{'max_depth': 3, 'min_samples_split': 5}",1,0.9333,0.9583,1.0
7,"{'max_depth': 3, 'min_samples_split': 10}",1,0.9333,0.9583,1.0
8,"{'max_depth': 3, 'min_samples_split': 15}",1,0.9333,0.9583,1.0
10,"{'max_depth': 10, 'min_samples_split': 10}",1,0.9333,0.9583,1.0
11,"{'max_depth': 10, 'min_samples_split': 15}",1,0.9333,0.9583,1.0
9,"{'max_depth': 10, 'min_samples_split': 5}",6,0.925,0.9583,1.0
3,"{'max_depth': 2, 'min_samples_split': 5}",7,0.9167,0.9583,0.9167
4,"{'max_depth': 2, 'min_samples_split': 10}",7,0.9167,0.9583,0.9167
5,"{'max_depth': 2, 'min_samples_split': 15}",7,0.9167,0.9583,0.9167
0,"{'max_depth': 1, 'min_samples_split': 5}",10,0.675,0.7083,0.6667


In [24]:
print(grid_search.best_params_, grid_search.best_score_, grid_search.best_estimator_)

{'max_depth': 3, 'min_samples_split': 5} 0.9333333333333333 DecisionTreeClassifier(max_depth=3, min_samples_split=5, random_state=42)


### Predict

In [25]:
estimator = grid_search.best_estimator_
pred = estimator.predict(X_test)

In [26]:
accuracy_score(y_test, pred)

1.0

# End of Documents