# 과제 - load_digits

In [2]:
import numpy as np
import pandas as pd

### 1. 결정트리

In [3]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [4]:
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.2, random_state=2021
)

In [5]:
dt_clf = DecisionTreeClassifier(random_state=2021)
dt_clf.fit(X_train, y_train)

DecisionTreeClassifier(random_state=2021)

In [6]:
from sklearn.metrics import accuracy_score
pred = dt_clf.predict(X_test)
accuracy_score(y_test, pred)

0.8722222222222222

In [7]:
# depth 조절

dt_clf2 = DecisionTreeClassifier(max_depth=10, random_state=2021)
dt_clf2.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=10, random_state=2021)

In [9]:
pred2 = dt_clf2.predict(X_test)
accuracy_score(y_test, pred2)

0.8638888888888889

In [10]:
# min_samples_split 조절

dt_clf3 = DecisionTreeClassifier(min_samples_split=4, random_state=2021)
dt_clf3.fit(X_train, y_train)

DecisionTreeClassifier(min_samples_split=4, random_state=2021)

In [11]:
pred3 = dt_clf3.predict(X_test)
accuracy_score(y_test, pred2)

0.8638888888888889

In [84]:
# GridSearchCV

from sklearn.model_selection import GridSearchCV
parameters = {
    'max_depth':[13,14,15],
    'min_samples_leaf':[2,3,4],
    'min_samples_split':[2,3,4,5] 
}
grid_dclf = GridSearchCV(dt_clf, param_grid=parameters, 
                         scoring='accuracy', cv=5)
grid_dclf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [13, 14, 15],
                         'min_samples_leaf': [2, 3, 4],
                         'min_samples_split': [2, 3, 4, 5]},
             scoring='accuracy')

In [85]:
grid_dclf.best_params_

{'max_depth': 13, 'min_samples_leaf': 3, 'min_samples_split': 2}

In [81]:
print(f'GridSearchCV 최고 정확도: {grid_dclf.best_score_:.4f}')

GridSearchCV 최고 정확도: 0.8344


In [16]:
# 파라메터 조절

from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

SVC()

In [17]:
from sklearn.metrics import accuracy_score
pred = svc.predict(X_test)
accuracy_score(y_test, pred)

0.9777777777777777

In [18]:
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [19]:
params = {'C':[0.01, 0.1, 0.5, 1, 5, 10,100 ]}

In [20]:
from sklearn.model_selection import GridSearchCV

svc = SVC(random_state=2021)
grid_clf = GridSearchCV(svc, param_grid=params, cv=5,scoring = 'accuracy')

In [21]:
grid_clf.fit(X_train, y_train)
grid_clf.best_score_

0.9909504452187379

In [22]:
grid_clf.best_params_

{'C': 5}

In [23]:
best_clf = grid_clf.best_estimator_
pred = best_clf.predict(X_test)
accuracy_score(y_test, pred)

0.9833333333333333

In [None]:
# decisiontree로 내가 얻을 수 있는 최대 정확도 : 0.9833333333333333

### 2. 앙상블 학습

In [31]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
digits_scaled = scaler.fit_transform(digits.data)

In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    digits_scaled, digits.target, stratify=digits.target,
    test_size=0.2, random_state=2021
)

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [34]:
lr = LogisticRegression(max_iter=1000)
svc = SVC()
knn = KNeighborsClassifier()

In [35]:
from sklearn.ensemble import VotingClassifier

vo_clf = VotingClassifier(
    estimators=[('LR', lr), ('SVC', svc), ('KNN', knn)],
    voting='hard'
)

In [36]:
vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)

In [37]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.9861111111111112

In [38]:
for classifier in [lr, svc, knn]:
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_test)
    acc = accuracy_score(y_test, pred)
    classifer_name = classifier.__class__.__name__
    print(f'{classifer_name} 정확도 : {acc:4f}')

LogisticRegression 정확도 : 0.958333
SVC 정확도 : 0.986111
KNeighborsClassifier 정확도 : 0.983333


In [39]:
vo_clf = VotingClassifier(
    estimators=[('LR', lr), ('KNN', knn)],
    voting='soft'
)
vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
accuracy_score(y_test, pred)

0.9805555555555555

In [47]:
# GridSearchCV

from sklearn.model_selection import GridSearchCV
parameters = {
    'max_depth':[6,7,8,],
    'min_samples_leaf':[3,4],
    'min_samples_split':[2,3] 
}
grid_dclf = GridSearchCV(dt_clf, param_grid=parameters, 
                         scoring='accuracy', cv=5)
grid_dclf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [6, 7, 8], 'min_samples_leaf': [3, 4],
                         'min_samples_split': [2, 3]},
             scoring='accuracy')

In [48]:
grid_dclf.best_params_

{'max_depth': 8, 'min_samples_leaf': 3, 'min_samples_split': 2}

In [49]:
print(f'GridSearchCV 최고 정확도: {grid_dclf.best_score_:.4f}')

GridSearchCV 최고 정확도: 0.8260


In [50]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

SVC()

In [51]:
from sklearn.metrics import accuracy_score
pred = svc.predict(X_test)
accuracy_score(y_test, pred)

0.9861111111111112

In [52]:
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [61]:
params = {'C':[0.1, 0.5, 1, 5, 10,100 ]}

In [62]:
from sklearn.model_selection import GridSearchCV

svc = SVC(random_state=2021)
grid_clf = GridSearchCV(svc, param_grid=params, cv=5,scoring = 'accuracy')

In [63]:
grid_clf.fit(X_train, y_train)
grid_clf.best_score_

0.9916448896631824

In [64]:
grid_clf.best_params_

{'C': 10}

In [66]:
best_clf = grid_clf.best_estimator_
pred = best_clf.predict(X_test)
accuracy_score(y_test, pred)

0.9777777777777777

In [97]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1
)
bag_clf.fit(X_train, y_train)
pred = bag_clf.predict(X_test)
accuracy_score(y_test,pred)

0.925

In [None]:
# 앙상블 학습으로 내가 얻을 수 있는 최대 정확도 0.9861111111111112

### 3.앙상블 - 랜덤 포레스트

In [55]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    digits_scaled, digits.target, stratify=digits.target,
    test_size=0.2, random_state=2021
)

In [56]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [57]:
rf_clf = RandomForestClassifier(random_state=2021)
rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)
accuracy_score(y_test, pred)

0.9611111111111111

In [58]:
# GridSearchCV

from sklearn.model_selection import GridSearchCV
parameters = {
    'max_depth':[12,13,14],
    'min_samples_leaf':[3,4],
    'min_samples_split':[2,3] 
}
grid_dclf = GridSearchCV(dt_clf, param_grid=parameters, 
                         scoring='accuracy', cv=5)
grid_dclf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [12, 13, 14], 'min_samples_leaf': [3, 4],
                         'min_samples_split': [2, 3]},
             scoring='accuracy')

In [59]:
grid_dclf.best_params_

{'max_depth': 13, 'min_samples_leaf': 3, 'min_samples_split': 2}

In [60]:
print(f'GridSearchCV 최고 정확도: {grid_dclf.best_score_:.4f}')

GridSearchCV 최고 정확도: 0.8344


In [160]:
# 하이퍼 파라매터 조정

from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

SVC()

In [161]:
from sklearn.metrics import accuracy_score
pred = svc.predict(X_test)
accuracy_score(y_test, pred)

0.9861111111111112

In [162]:
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [178]:
params = {'C':[1,10,100]}

In [179]:
from sklearn.model_selection import GridSearchCV

svc = SVC(random_state=2021)
grid_clf = GridSearchCV(svc, param_grid=params, cv=5,scoring = 'accuracy')

In [180]:
grid_clf.fit(X_train, y_train)
grid_clf.best_score_

0.9916448896631824

In [181]:
grid_clf.best_params_

{'C': 10}

In [182]:
best_clf = grid_clf.best_estimator_
pred = best_clf.predict(X_test)
accuracy_score(y_test, pred)

0.9777777777777777

In [None]:
# 랜덤 포레스트로 내가 얻을 수 있는 최대 정확도 
0.9861111111111112