# 간단한 모델 만들기

In [1]:
import sklearn
# Show which scikit-learn version is installed.
print(sklearn.__version__)

0.24.1


In [2]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Load the iris dataset via scikit-learn's load_iris().
iris = load_iris()

# Keep direct references to the feature values and the encoded targets.
# NOTE: the misspelled name `iris_lable` is kept as-is because it is a
# notebook-level variable that other cells may refer to.
iris_data, iris_lable = iris.data, iris.target

# Inspect what the loaded object provides, then the integer-encoded targets
# and the class names that belong to them.
print('iris keys:', iris.keys())
print('iris target label:', iris.target)
print('iris target name:', iris.target_names)

iris keys: dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])
iris target label: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
iris target name: ['setosa' 'versicolor' 'virginica']


In [4]:
# End-to-end baseline: load iris, hold out a test set, fit a decision tree
# and report its accuracy on the held-out samples.
iris = load_iris()

# Hold out 20% of the samples; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=11,
)

# Create the classifier and train it in one step (fit() returns the
# estimator itself, so dt_clf is the fitted tree).
dt_clf = DecisionTreeClassifier(random_state=11).fit(X_train, y_train)

# Predict the labels of the held-out samples.
pred = dt_clf.predict(X_test)

# Compare the predictions with the true labels.
print(f'예측정확도: {accuracy_score(y_test, pred):.4f}')

예측정확도: 0.9333


In [5]:
# Depth of the tree currently held by dt_clf.
dt_clf.get_depth()

5

# model selection

## train_test_split()

In [6]:
# train_test_split also accepts pandas objects, so build a DataFrame with
# one column per feature plus a trailing 'label' column holding the targets.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(label=iris.target)
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [7]:
# Features are every column except the last; the label is the last column.
ftr_df, lab_se = iris_df.iloc[:, :-1], iris_df.iloc[:, -1]

# Split the pandas objects directly: 30% held out, fixed seed for a
# repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    ftr_df,
    lab_se,
    test_size=0.3,
    random_state=121,
)

In [8]:
# Show the container types returned by the split, one per line.
print(type(X_train), type(y_train), sep='\n')

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [9]:
# Fit a decision tree on the DataFrame/Series split and report its accuracy
# on the held-out part.
# random_state is pinned (same seed as the first model in this notebook):
# without it the tree breaks ties between equally good splits randomly, so
# the accuracy and the tree depth could change from run to run.
dt_clf = DecisionTreeClassifier(random_state=11)
dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_test)
print(f'accuracy: {accuracy_score(y_test, pred):.4f}')

accuracy: 0.9556


In [10]:
# Depth of the tree currently held by dt_clf.
dt_clf.get_depth()

5

## cross validation

### k-fold

In [11]:
from sklearn.model_selection import KFold
import numpy as np

# Reload iris and keep the raw arrays that the cross-validation loops index
# into by fold.
iris = load_iris()
features = iris.data
label = iris.target

# Pin random_state (the same seed the stratified cross-validation cells use)
# so the per-fold accuracy and depth are reproducible between runs.
dt_clf = DecisionTreeClassifier(random_state=156)

# Number of samples (first dimension of the feature array).
print(f'data set shape: {features.shape[0]}')

data set shape: 150


In [12]:
# Split the samples into 5 folds (no shuffling requested).
kfold = KFold(n_splits=5)

# split() yields index arrays, not the data itself: one array with the
# training indices and one with the validation indices per fold.
for train_idx, val_idx in kfold.split(features):
    # Only the first and last index of each array are shown. The training
    # indices are not contiguous for the inner folds, so a printed range such
    # as "0 ~ 149" still excludes the validation range printed right below it.
    print(f'train_index: {train_idx[0]} ~ {train_idx[-1]}')
    print(f'val_index: {val_idx[0]} ~ {val_idx[-1]}')

train_index: 30 ~ 149
val_index: 0 ~ 29
train_index: 0 ~ 149
val_index: 30 ~ 59
train_index: 0 ~ 149
val_index: 60 ~ 89
train_index: 0 ~ 149
val_index: 90 ~ 119
train_index: 0 ~ 119
val_index: 120 ~ 149


In [13]:
# Manual 5-fold cross-validation: refit the tree on each training fold and
# score it on the matching validation fold.
cv_accuracy = []
for n_iter, (train_idx, val_idx) in enumerate(kfold.split(features), start=1):
    # Select the rows of this fold by index.
    X_train, y_train = features[train_idx], label[train_idx]
    X_val, y_val = features[val_idx], label[val_idx]

    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_val)

    # Accuracy on the validation fold, rounded to 4 decimals.
    accuracy = np.round(accuracy_score(y_val, pred), 4)
    cv_accuracy.append(accuracy)
    print(f'{n_iter}, val_acc: {accuracy}')
    print(f'   depth: {dt_clf.get_depth()}')

# Mean of the per-fold accuracies.
print(round(np.mean(cv_accuracy), 4))

1, val_acc: 1.0
   depth: 5
2, val_acc: 1.0
   depth: 5
3, val_acc: 0.9
   depth: 4
4, val_acc: 0.9333
   depth: 4
5, val_acc: 0.7333
   depth: 4
0.9133


### stratified K-fold

In [14]:
# Rebuild the labelled DataFrame and count the samples of each class.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(label=iris.target)
iris_df['label'].value_counts()

0    50
1    50
2    50
Name: label, dtype: int64

In [15]:
from sklearn.model_selection import StratifiedKFold

# Stratified 3-fold split: show the class counts of the training and
# validation labels of every fold.
skf = StratifiedKFold(n_splits=3)

# The labels are passed to split() as well, so the folds can be built per class.
for n_iter, (train_idx, val_idx) in enumerate(skf.split(iris_df, iris_df['label']), start=1):
    label_train, label_val = iris_df['label'].iloc[train_idx], iris_df['label'].iloc[val_idx]
    print(f'##{n_iter}')
    print(f'train label count :\n{label_train.value_counts()}')
    print(f'val label count :\n{label_val.value_counts()}')

##1
train label count :
2    34
0    33
1    33
Name: label, dtype: int64
val label count :
0    17
1    17
2    16
Name: label, dtype: int64
##2
train label count :
1    34
0    33
2    33
Name: label, dtype: int64
val label count :
0    17
2    17
1    16
Name: label, dtype: int64
##3
train label count :
0    34
1    33
2    33
Name: label, dtype: int64
val label count :
1    17
2    17
0    16
Name: label, dtype: int64


In [16]:
# Stratified 3-fold cross-validation of a decision tree on the raw arrays.
dt_clf = DecisionTreeClassifier(random_state=156)
skfold = StratifiedKFold(n_splits=3)
cv_accuracy = []

for n_iter, (train_index, test_index) in enumerate(skfold.split(features, label), start=1):
    # Select the rows of this fold by index.
    X_train, y_train = features[train_index], label[train_index]
    X_test, y_test = features[test_index], label[test_index]

    # Refit on this fold's training rows, then predict its held-out rows.
    pred = dt_clf.fit(X_train, y_train).predict(X_test)

    accuracy = np.round(accuracy_score(y_test, pred), 4)
    train_size, test_size = X_train.shape[0], X_test.shape[0]

    # Per-fold report: fold number, accuracy and the two set sizes.
    print(n_iter)
    print('accuracy :', accuracy)
    print('train size: ', train_size)
    print('test size: ', test_size)
    cv_accuracy.append(accuracy)

# All fold accuracies, followed by their mean.
print('fold accu: ', cv_accuracy)
print('mean: ', np.mean(cv_accuracy))

1
accuracy : 0.98
train size:  100
test size:  50
2
accuracy : 0.94
train size:  100
test size:  50
3
accuracy : 0.98
train size:  100
test size:  50
fold accu:  [0.98, 0.94, 0.98]
mean:  0.9666666666666667


### cross_val_score()

In [18]:
from sklearn.model_selection import cross_val_score, cross_validate

# 3-fold cross-validation in a single call: cross_val_score does the
# splitting, fitting and scoring that the loops above spell out by hand.
iris_data = load_iris()
data, label = iris_data.data, iris_data.target

dt_clf = DecisionTreeClassifier(random_state=156)

# One accuracy value per fold.
scores = cross_val_score(
    dt_clf,
    data,
    label,
    scoring='accuracy',
    cv=3,
)
print(scores)
print(scores.mean())

[0.98 0.94 0.98]
0.9666666666666667


### GridSearchCV

In [24]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# Grid search over tree depth and minimum split size with 3-fold CV.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=121,
)

dtree = DecisionTreeClassifier(random_state=121)

# 5 depths x 2 split sizes = 10 candidate settings.
params = {
    'max_depth': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 3],
}

# refit=True retrains the best setting on the whole training set, which is
# what allows calling predict() on the search object afterwards.
grid_dtree = GridSearchCV(
    dtree,
    param_grid=params,
    cv=3,
    refit=True,
    return_train_score=True,
)
grid_dtree.fit(X_train, y_train)

# Tabulate the search results and show only the test-score columns.
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df.loc[:, [
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
]]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.7,9,0.7,0.7,0.7
1,"{'max_depth': 1, 'min_samples_split': 3}",0.7,9,0.7,0.7,0.7
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,4,0.925,1.0,0.95
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,4,0.925,1.0,0.95
4,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1,0.975,1.0,0.95
5,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1,0.975,1.0,0.95
6,"{'max_depth': 4, 'min_samples_split': 2}",0.95,6,0.925,1.0,0.925
7,"{'max_depth': 4, 'min_samples_split': 3}",0.95,6,0.925,1.0,0.925
8,"{'max_depth': 5, 'min_samples_split': 2}",0.966667,3,0.975,1.0,0.925
9,"{'max_depth': 5, 'min_samples_split': 3}",0.95,6,0.925,1.0,0.925


In [25]:
# Parameter setting that ranked first in the grid search.
grid_dtree.best_params_

{'max_depth': 3, 'min_samples_split': 2}

In [26]:
# Mean cross-validated score of the best parameter setting.
grid_dtree.best_score_

0.975

In [27]:
# Predict with the fitted search object; this is possible because the search
# was created with refit=True.
pred = grid_dtree.predict(X_test)
# Accuracy on the held-out test split.
accuracy_score(y_test, pred)

0.9666666666666667

In [34]:
# Retrain a standalone tree with the hyper-parameters chosen by the grid
# search. They are read from grid_dtree.best_params_ rather than typed in by
# hand, so this cell cannot drift from the search result if the grid or the
# data split changes.
dtree = DecisionTreeClassifier(**grid_dtree.best_params_, random_state=121)
dtree.fit(X_train, y_train)
pred = dtree.predict(X_test)

# Accuracy on the same held-out test split that the search object was
# scored on.
accuracy_score(y_test, pred)

0.9666666666666667