# init

In [234]:
import sklearn
from sklearn.datasets import load_digits
from sklearn.datasets import load_wine
from sklearn.datasets import load_breast_cancer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

print(pd.__version__)

1.4.4


# 1-11 load_digits

In [235]:
digits = load_digits()
digits.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])

In [236]:
print(digits.data)
print(digits.data.shape)
digits_data = digits.data  # data
digitsdf = pd.DataFrame(data=digits.target)
digitsdf.value_counts().sort_index()

[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]
(1797, 64)


0    178
1    182
2    177
3    183
4    181
5    182
6    181
7    179
8    174
9    180
dtype: int64

In [237]:
print(digits.target)
print(digits.target.shape)
print(digits.target_names)
print(digits.images.shape)

digits_label = digits.target # label

[0 1 2 ... 8 9 8]
(1797,)
[0 1 2 3 4 5 6 7 8 9]
(1797, 8, 8)


In [238]:
X_train, X_test, y_train, y_test = train_test_split(digits_data,
                                                    digits_label,
                                                    test_size=0.4,
                                                    random_state=67,
                                                   stratify = None)

## 1-11 DecisionTreeClassifier

In [239]:
import numpy as np
decision_digit = DecisionTreeClassifier(random_state=42)
decision_digit.fit(X_train, y_train)
y_pred = decision_digit.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[70,  0,  0,  0,  1,  0,  1,  0,  0,  0],
       [ 1, 65,  5,  1,  2,  0,  1,  1,  5,  1],
       [ 0,  1, 61,  0,  0,  1,  3,  1,  4,  0],
       [ 0,  0,  1, 51,  1,  1,  1,  0,  4,  6],
       [ 0,  8,  0,  1, 62,  0,  1,  3,  0,  2],
       [ 0,  0,  1,  3,  1, 64,  2,  2,  2,  0],
       [ 0,  0,  1,  0,  0,  1, 76,  0,  1,  0],
       [ 0,  0,  0,  1,  2,  1,  1, 50,  0,  1],
       [ 0,  3,  2,  5,  1,  3,  1,  1, 41,  2],
       [ 4,  2,  2,  4,  1,  1,  0,  2,  4, 63]], dtype=int64)

In [240]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95        72
           1       0.82      0.79      0.81        82
           2       0.84      0.86      0.85        71
           3       0.77      0.78      0.78        65
           4       0.87      0.81      0.84        77
           5       0.89      0.85      0.87        75
           6       0.87      0.96      0.92        79
           7       0.83      0.89      0.86        56
           8       0.67      0.69      0.68        59
           9       0.84      0.76      0.80        83

    accuracy                           0.84       719
   macro avg       0.83      0.84      0.84       719
weighted avg       0.84      0.84      0.84       719



## 1-11 RandomForestClassifier

In [241]:
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

In [242]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99        72
           1       0.98      1.00      0.99        82
           2       0.99      1.00      0.99        71
           3       1.00      0.94      0.97        65
           4       0.97      0.96      0.97        77
           5       0.95      0.93      0.94        75
           6       0.97      0.99      0.98        79
           7       0.90      0.98      0.94        56
           8       0.90      0.92      0.91        59
           9       0.97      0.92      0.94        83

    accuracy                           0.96       719
   macro avg       0.96      0.96      0.96       719
weighted avg       0.96      0.96      0.96       719



## 1-11 SVM

In [243]:
svm_model = svm.SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

In [244]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        72
           1       1.00      1.00      1.00        82
           2       1.00      1.00      1.00        71
           3       1.00      0.98      0.99        65
           4       1.00      0.96      0.98        77
           5       0.99      0.97      0.98        75
           6       0.99      1.00      0.99        79
           7       0.98      1.00      0.99        56
           8       0.92      0.98      0.95        59
           9       0.98      0.96      0.97        83

    accuracy                           0.99       719
   macro avg       0.99      0.99      0.99       719
weighted avg       0.99      0.99      0.99       719



## 1-11 SGD Classifier

In [245]:
sgd_model = SGDClassifier()
sgd_model.fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[72,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 81,  0,  0,  0,  0,  0,  0,  1,  0],
       [ 0,  0, 71,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0, 58,  0,  2,  0,  1,  2,  2],
       [ 0,  1,  0,  0, 74,  0,  0,  0,  1,  1],
       [ 0,  1,  0,  0,  0, 70,  1,  0,  1,  2],
       [ 0,  0,  0,  0,  0,  0, 79,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 54,  0,  2],
       [ 0,  2,  1,  1,  0,  2,  0,  0, 52,  1],
       [ 0,  2,  0,  0,  0,  0,  0,  0,  0, 81]], dtype=int64)

In [246]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        72
           1       0.93      0.99      0.96        82
           2       0.99      1.00      0.99        71
           3       0.98      0.89      0.94        65
           4       1.00      0.96      0.98        77
           5       0.95      0.93      0.94        75
           6       0.99      1.00      0.99        79
           7       0.98      0.96      0.97        56
           8       0.91      0.88      0.90        59
           9       0.91      0.98      0.94        83

    accuracy                           0.96       719
   macro avg       0.96      0.96      0.96       719
weighted avg       0.96      0.96      0.96       719



## 1-11 Logistic Regression

In [247]:
logistic_model = LogisticRegression(max_iter=1500)

In [248]:
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [249]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        72
           1       0.97      0.95      0.96        82
           2       0.97      0.99      0.98        71
           3       0.97      0.94      0.95        65
           4       1.00      0.96      0.98        77
           5       0.95      0.95      0.95        75
           6       0.98      1.00      0.99        79
           7       1.00      1.00      1.00        56
           8       0.82      0.95      0.88        59
           9       0.97      0.92      0.94        83

    accuracy                           0.96       719
   macro avg       0.96      0.96      0.96       719
weighted avg       0.97      0.96      0.96       719



## 평가
* 5가지의 모델로 load_digit를 학습시켰다. 학습의 평가를 위해 classification_report를 사용했다.


* max_iter는 warning 문구가 나오지 않을 때 까지 100씩 올렸다.


* 평가지표로 precision과 recall, f1-score가 있지만 precision과 recall의 조화평균을 계산한 f1-score를 선택했다.


* 조화평균이 산술평균과 다른점은 산술평균의 경우 극단적인 데이터에 영향을 받지만, 조화평균의 경우 두 데이터의 차이가
극단적일 경우 0에 수렴한 값을 내기 때문이다.


* SVM, RandomForestClassifier, LogisticRegression, SGDClassifier, DecisionTreeClassifier순으로 정확하다고 볼 수 있다.


* 상위 세가지 모델의 f1-score는 비슷하지만 SVM의 편차가 가장 적다. 숫자 분류에는 SVM이 적합하다.


* SGDClassifier의 경우 f1-score가 모델 특성상 값이 바뀌긴 하지만 평균적으로 0.9이상이 나왔다.


* 각 숫자마다 학습한 데이터의 개수가 비슷함에도 불구하고 5개 모델들 전부 5 ,8, 9에 대한 f1-score가 낮게 나왔다.


* SGDClassifier에서 8에 대한 precision과 recall의 값의 편차가 컸다.


# 1-12 load_wine

In [250]:
wine = load_wine()

In [251]:
wine_df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
wine_df['label']=wine.target

print(wine_df['label'].value_counts().sort_index())
wine_df

0    59
1    71
2    48
Name: label, dtype: int64


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,label
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [252]:
wine_data = wine.data
wine_label = wine.target

In [253]:
X_train, X_test, y_train, y_test = train_test_split(wine_data,
                                                    wine_label,
                                                    test_size=0.3,
                                                    random_state=67,
                                                   stratify = None)

## 1-12 DecisionClassifier

In [254]:
decision_wine = DecisionTreeClassifier(random_state=42)
decision_wine.fit(X_train, y_train)
y_pred = decision_wine.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[17,  0,  0],
       [ 1, 21,  2],
       [ 0,  1, 12]], dtype=int64)

In [255]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        17
           1       0.95      0.88      0.91        24
           2       0.86      0.92      0.89        13

    accuracy                           0.93        54
   macro avg       0.92      0.93      0.92        54
weighted avg       0.93      0.93      0.93        54



## 1-12 RandomForestClassifier

In [256]:
random_forest_wine = RandomForestClassifier(random_state=42)
random_forest_wine.fit(X_train, y_train)
y_pred = random_forest_wine.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[17,  0,  0],
       [ 0, 24,  0],
       [ 0,  0, 13]], dtype=int64)

In [257]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       1.00      1.00      1.00        24
           2       1.00      1.00      1.00        13

    accuracy                           1.00        54
   macro avg       1.00      1.00      1.00        54
weighted avg       1.00      1.00      1.00        54



## 1-12 SVM

In [258]:
svm_model_wine = svm.SVC()
svm_model_wine.fit(X_train, y_train)
y_pred = svm_model_wine.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[17,  0,  0],
       [ 0, 21,  3],
       [ 0, 10,  3]], dtype=int64)

In [259]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       0.68      0.88      0.76        24
           2       0.50      0.23      0.32        13

    accuracy                           0.76        54
   macro avg       0.73      0.70      0.69        54
weighted avg       0.74      0.76      0.73        54



## 1-12 SGD Classifier

In [260]:
sgd_model_wine = SGDClassifier()
sgd_model_wine.fit(X_train, y_train)
y_pred= sgd_model_wine.predict(X_test)
print(y_test)
print(y_pred)
print(np.unique(y_pred))
confusion_matrix(y_test, y_pred)

[1 2 1 2 1 1 2 2 1 0 2 0 1 0 0 0 0 2 1 0 1 2 1 1 0 0 0 1 0 2 2 1 1 1 0 1 2
 1 1 1 1 2 0 1 1 2 0 1 0 1 1 0 0 2]
[1 1 1 1 1 1 1 1 1 0 1 0 1 0 0 0 0 1 1 0 1 1 1 1 0 0 0 1 0 1 1 1 1 1 0 1 1
 1 1 1 1 0 0 1 1 1 0 1 0 1 1 0 0 1]
[0 1]


array([[17,  0,  0],
       [ 0, 24,  0],
       [ 1, 12,  0]], dtype=int64)

In [261]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        17
           1       0.67      1.00      0.80        24
           2       0.00      0.00      0.00        13

    accuracy                           0.76        54
   macro avg       0.54      0.67      0.59        54
weighted avg       0.59      0.76      0.66        54



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 1-12 Logistic Regression

In [262]:
logistic_model_wine = LogisticRegression(max_iter=100)
logistic_model_wine.fit(X_train, y_train)
y_pred = logistic_model_wine.predict(X_test)
confusion_matrix(y_test, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[16,  1,  0],
       [ 0, 24,  0],
       [ 0,  0, 13]], dtype=int64)

In [263]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.94      0.97        17
           1       0.96      1.00      0.98        24
           2       1.00      1.00      1.00        13

    accuracy                           0.98        54
   macro avg       0.99      0.98      0.98        54
weighted avg       0.98      0.98      0.98        54



## 평가
* load_digit과 같은 모델과 classification_report를 사용했다.


* max_iter 역시 같은 방법으로 올렸다.


* f1-score를 평가지표로 사용했으며, LogisticRegression, RandomForestClassifier, DecisionTreeClassifier가 유의미한 분류를 했다.


* SVM의 경우 digits와 다르게 좋은 결과가 나오지 않았다. 아마도 feature의 값들의 차이가 크기 때문인 것 같다.   
Cost 값을 올릴수록 모델의 성능은 더 좋아졌다.

* SGDClassifier에서 두 번 돌리면 한 번 꼴로 warning 문구가 나왔다. precision 값이 0이여서 f1-score를 구하지 못했다고 한다.
y_pred 값을 보면 전체 라벨이 총 3개인데 2개밖에 없음을 알 수 있다.

* Logistic Regression의 max_iter값을 100부터 올려서 3200까지 올려봤지만 오분류 개수는 1개에서 변함 없었다.


# 1-13 load breast_cancer

In [264]:
breast_cancer = load_breast_cancer()

In [265]:
breast_cancerdf=pd.DataFrame(data = breast_cancer.data,columns = breast_cancer.feature_names)
breast_cancerdf['label']=breast_cancer.target
print(breast_cancer.target_names)
print(breast_cancerdf['label'].value_counts().sort_index())
breast_cancerdf

['malignant' 'benign']
0    212
1    357
Name: label, dtype: int64


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [266]:
breast_cancer_data = breast_cancer.data
breast_cancer_label = breast_cancer.target

In [267]:
X_train, X_test, y_train, y_test = train_test_split(breast_cancer_data,
                                                    breast_cancer_label,
                                                    test_size=0.2,
                                                    random_state=67)

## 1-13 DecisionTreeClassifier

In [268]:
decision_breast_cancer = DecisionTreeClassifier(random_state=42)
decision_breast_cancer.fit(X_train, y_train)
y_pred = decision_breast_cancer.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[33,  5],
       [ 4, 72]], dtype=int64)

In [269]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88        38
           1       0.94      0.95      0.94        76

    accuracy                           0.92       114
   macro avg       0.91      0.91      0.91       114
weighted avg       0.92      0.92      0.92       114



## 1-13 RandomForestClassifier

In [270]:
rf_breast_cancer= RandomForestClassifier(random_state=42)
rf_breast_cancer.fit(X_train, y_train)
y_pred = rf_breast_cancer.predict(X_test)

In [271]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.92      0.96        38
           1       0.96      1.00      0.98        76

    accuracy                           0.97       114
   macro avg       0.98      0.96      0.97       114
weighted avg       0.97      0.97      0.97       114



## 1-13 SVM

In [272]:
svm_breast_cancer = svm.SVC()
svm_breast_cancer.fit(X_train, y_train)
y_pred = svm_breast_cancer.predict(X_test)

In [273]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.74      0.84        38
           1       0.88      0.99      0.93        76

    accuracy                           0.90       114
   macro avg       0.92      0.86      0.88       114
weighted avg       0.91      0.90      0.90       114



## 1-13 SGD Classifier

In [274]:
sgdc_breast_cancer = SGDClassifier()
sgdc_breast_cancer.fit(X_train, y_train)
y_pred = sgdc_breast_cancer.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[34,  4],
       [11, 65]], dtype=int64)

In [275]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.89      0.82        38
           1       0.94      0.86      0.90        76

    accuracy                           0.87       114
   macro avg       0.85      0.88      0.86       114
weighted avg       0.88      0.87      0.87       114



## 1-13 Logistic Regression

In [276]:
lr_breast_cancer= LogisticRegression(max_iter=430)
lr_breast_cancer.fit(X_train, y_train)
y_pred = lr_breast_cancer.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[33,  5],
       [ 1, 75]], dtype=int64)

In [277]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.87      0.92        38
           1       0.94      0.99      0.96        76

    accuracy                           0.95       114
   macro avg       0.95      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114



## 평가
* load_digit과 같은 모델과 classification_report를 사용했다.


* max_iter 역시 같은 방법으로 올렸다.


* load_breast_cancer의 타겟은 0,1로 악성과 양성을 나타낸다. 악성을 양성으로 진단하는 것은 매우 치명적이기 때문에
악성(y_test = 0)에 대한 recall 값을 평가지표로 사용했다.

* RandomForestClassifier, LogisticRegression, DecisionTreeClassifier 순으로 높았으며, SVM은 recall값이 낮고, SGDClassifier는 값의
편차가 커서 적합하지 않다고 판단했다.


# 마치며

* DecisionTreeClassifier는 전반적으로 나쁘지 않은 결과를 보여주었다. 그래서 애매한 모델이라고 생각이 들었다.
하지만 디폴트로 진행했다는 점, feature 값들이 많아서(?) 과적합에 빠졌다고 생각이 들었다. 적당한 튜닝이 들어간다면 
우수한 결과를 기대할 수 있다고 생각한다.   


* RandomForestClassifier는 3개의 데이터에서 우수한 성능을 보였다. ensemble method를 통해 과적합을 방지하고 많은 feature들을
다룰 수 있다. DecisionTreeClassifier의 단점을 극복한 개념이기에 당연히 모든 데이터에서 DecisionTreeClassifier 보다 우수한 
학습률을 보였다.   


* SVM은 digits의 성능에서 매우 우수하게 분류 했지만, 다른 데이터에선 그러지 못했다. 그래서 다른 데이터들의 Cost 값(default=1)을 
10배씩 10000까지 올려주었는데 accuracy가 0.95까지 올라갔었다. 이유를 생각해본다면 Cost를 올려주면 결정경계가 선형에서 비선형으로
바뀌는데 이를 통해 여러 차원에 존재하는 값들을 잘 분리할 수 있었던 거 같다.   


* SGDClassifier는 fit 할 때마다 편차가 컸는데 SGDClassifier는 전체 데이터들 중 무작위 학습으로 경사를 줄여나가는데,
digits에 비해 두 데이터는 데이터의 양이 적기 때문에 편차가 크게 나타난다고 생각한다. wine에서 y_pred 값을 보면 3개의 라벨 중 
2개 밖에 학습을 하지 못한 것을 알 수 있다. 데이터가 적을 땐 다른 모델을 쓰거나 튜닝이 필요할 것 같다고 생각했다.   


* LogisticRegression은 전반적으로 우수한 성능을 보였다. LogisticRegression은 이진분류에선 sigmoid 다중 분류에서 softmax함수를 사용한다. softmax를 통해 높은 확률을 가진 데이터를 클래스에 넣는데, softmax에서의 단점인 단조증가에 의해서 feature값들을 정규화 해주지 않는다면 성능이 떨어질 것이다. 이진분류에서 강한 LogisticRegression이 load_breast에서 RandomForestClassifier보다 정확도가 떨어진 이유일 듯 싶다.   