In [101]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd

### 손글씨 분류하기

In [102]:
digits = load_digits()
dir(digits)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [103]:
digits_data = digits.data
digits_label = digits.target

In [104]:
print(digits_data.shape)
print(digits_label.shape)

(1797, 64)
(1797,)


In [105]:
print(digits.target_names)

[0 1 2 3 4 5 6 7 8 9]


In [106]:
digits_df = pd.DataFrame(data=digits_data, columns=digits.feature_names)
digits_df.describe()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
count,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,...,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0
mean,0.0,0.30384,5.204786,11.835838,11.84808,5.781859,1.36227,0.129661,0.005565,1.993879,...,3.725097,0.206455,0.000556,0.279354,5.557596,12.089037,11.809126,6.764051,2.067891,0.364496
std,0.0,0.907192,4.754826,4.248842,4.287388,5.666418,3.325775,1.037383,0.094222,3.19616,...,4.919406,0.984401,0.02359,0.934302,5.103019,4.374694,4.933947,5.900623,4.090548,1.860122
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,11.0,10.0,0.0,0.0,0.0
50%,0.0,0.0,4.0,13.0,13.0,4.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,4.0,13.0,14.0,6.0,0.0,0.0
75%,0.0,0.0,9.0,15.0,15.0,11.0,0.0,0.0,0.0,3.0,...,7.0,0.0,0.0,0.0,10.0,16.0,16.0,12.0,2.0,0.0
max,0.0,8.0,16.0,16.0,16.0,16.0,16.0,15.0,2.0,16.0,...,16.0,13.0,1.0,9.0,16.0,16.0,16.0,16.0,16.0,16.0


In [107]:
train_input, test_input, train_target, test_target = train_test_split(digits.data, digits.target, test_size=0.2)
print("train_input 갯수 :", len(train_input), "test_input 갯수 :", len(test_input))

train_input 갯수 : 1437 test_input 갯수 : 360


##### 손 글씨 모델에서는 0의 존재 때문에 정확도는 믿을수 없다. 해서, 정밀도(Precision)와 재현율(Recall)의 종합점수인 f1-score를 사용해서 평가함.

In [108]:
# 의사결정나무
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier()
decision_tree.fit(train_input, train_target)
test_pred = decision_tree.predict(test_input)

f1 = f1_score(test_target, test_pred, average='macro')
print("F1 점수:", f1)

F1 점수: 0.8311546796496403


In [109]:
# RandomForest
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier()
random_forest.fit(train_input, train_target)
test_pred2 = random_forest.predict(test_input)

f1 = f1_score(test_target, test_pred2, average='macro')
print("F1 점수:", f1)

F1 점수: 0.9697864585304792


In [110]:
# SVM
from sklearn import svm

svm_model = svm.SVC()
svm_model.fit(train_input, train_target)
test_pred3 = svm_model.predict(test_input)

f1 = f1_score(test_target, test_pred3, average='macro')
print("F1 점수:", f1)

F1 점수: 0.9823855592719808


In [111]:
# SGDClassfier
from sklearn.linear_model import SGDClassifier

sgd_model = SGDClassifier()
sgd_model.fit(train_input, train_target)
test_pred4 = sgd_model.predict(test_input)

f1 = f1_score(test_target, test_pred4, average='macro')
print("F1 점수:", f1)

F1 점수: 0.9105994355702343


In [112]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(max_iter=2000) # 반복 횟수 증가 및 제한
logistic_model.fit(train_input, train_target)
test_pred5 = logistic_model.predict(test_input)

f1 = f1_score(test_target, test_pred5, average='macro')
print("F1 점수:", f1)

F1 점수: 0.9445529589573814
