In [1]:
from sklearn.datasets import fetch_openml

# Fetch the MNIST dataset from OpenML (dataset name 'mnist_784', version 1).
mnist = fetch_openml(name='mnist_784', version=1)
# Display the fields available on the returned object.
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [2]:
import numpy as np

# Separate the feature table from the labels; the labels are cast to 8-bit
# integers because later cells compare them against integer digits.
X = mnist['data']
y = mnist['target'].astype(np.int8)

# The first 60,000 rows form the training set, the remaining rows the test set.
n_train = 60000
X_train, y_train = X[:n_train], y[:n_train]
X_test, y_test = X[n_train:], y[n_train:]

In [9]:
"""이진 분류기"""

# Binary task: decide whether a digit is a '5' or not.
from sklearn.linear_model import SGDClassifier
y_train_5 = (y_train == 5)  # True for 5s, False for every other digit
y_test_5 = (y_test == 5)

model = SGDClassifier(random_state=42)  # fixed seed so repeated runs match
model.fit(X_train, y_train_5)

# First sample of the dataset, kept as a 1-D row because later cells reuse it.
some_digit = X.iloc[0]
# Predict on a one-row DataFrame rather than a list holding the Series: the
# model was fitted on a DataFrame, and a plain list drops the column names
# (recent scikit-learn versions warn about missing feature names).
model.predict(some_digit.to_frame().T)

array([ True])

In [5]:
# k-fold cross-validation
from sklearn.model_selection import cross_val_score

# Accuracy of the '5' detector on each of 3 folds of the training set.
fold_accuracies = cross_val_score(
    estimator=model, X=X_train, y=y_train_5, scoring='accuracy', cv=3
)
fold_accuracies

array([0.95035, 0.96035, 0.9604 ])

In [6]:
# Performance evaluation with a confusion matrix
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# Collect 3-fold cross-validated predictions for the whole training set, then
# tabulate them against the true labels.
y_train_pred = cross_val_predict(estimator=model, X=X_train, y=y_train_5, cv=3)
confusion_matrix(y_true=y_train_5, y_pred=y_train_pred)

array([[53892,   687],
       [ 1891,  3530]], dtype=int64)

In [7]:
from sklearn.metrics import precision_score, recall_score

# Print precision first, then recall, for the cross-validated predictions.
for metric in (precision_score, recall_score):
    print(metric(y_train_5, y_train_pred))
# Raising recall lowers precision and raising precision lowers recall:
# this is the precision/recall trade-off.

0.8370879772350012
0.6511713705958311


In [8]:
from sklearn.metrics import precision_recall_curve

# Get a decision score for every training sample via 3-fold cross-validation,
# then compute precision and recall at every candidate threshold.
y_scores = cross_val_predict(model, X_train, y_train_5, cv=3, method='decision_function')
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

# Goal: reach 90% precision.
# `precisions` has one more entry than `thresholds` (its final value has no
# matching threshold), so only search the entries that do have one; otherwise
# argmax could point past the end of `thresholds`.
reaches_target = precisions[:-1] >= 0.90
if not reaches_target.any():
    raise ValueError("no decision threshold reaches 90% precision")
# argmax on a boolean array returns the index of the first True, i.e. the
# lowest threshold that meets the target.
threshold_90_precision = thresholds[np.argmax(reaches_target)]
y_train_pred_90 = (y_scores >= threshold_90_precision)

# Precision and recall of the classifier at the new threshold
print(precision_score(y_train_5, y_train_pred_90))
print(recall_score(y_train_5, y_train_pred_90))

0.9000345901072293
0.4799852425751706


In [10]:
"""다중 분류"""

""" OvO 전략: 10개의 이진 분류기를 0에서 9까지 10개의 이진 분류기를 
훈련시키고 각각의 결정 점수를 얻어 가장 점수가 높은 클래스를 선택 """
from sklearn.svm import SVC

model = SVC()
model.fit(X_train, y_train)
print(model.predict([some_digit]))

some_digit_scores = model.decision_function([some_digit])
some_digit_scores

[5]


array([[ 1.72501977,  2.72809088,  7.2510018 ,  8.3076379 , -0.31087254,
         9.3132482 ,  1.70975103,  2.76765202,  6.23049537,  4.84771048]])

In [13]:
"""다중 레이블 분류"""

# Classifier with two binary labels per digit: is it large (>= 7), is it odd?
from sklearn.neighbors import KNeighborsClassifier

y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
# Stack the two label vectors side by side: column 0 = large, column 1 = odd.
y_multilabel = np.c_[y_train_large, y_train_odd]

model = KNeighborsClassifier()
model.fit(X_train, y_multilabel)
# Predict on a one-row DataFrame so the column names seen during fit are
# preserved (a list holding the Series would drop them).
model.predict(some_digit.to_frame().T)

array([[False,  True]])