In [18]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

# 데이터 로드 및 이해
wine = load_wine()
wine_data = wine.data
wine_label = wine.target

print(wine.target_names)
print(wine.DESCR)

['class_0' 'class_1' 'class_2']
.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
   

In [19]:
# 분류하기

X_train, X_test, y_train, y_test = train_test_split(wine_data, 
                                                    wine_label, 
                                                    test_size=0.2, 
                                                    random_state=5)

In [20]:
#1 decision tree 사용
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

decision_tree = DecisionTreeClassifier(random_state=32)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      0.76      0.87        17
           1       0.73      1.00      0.85        11
           2       0.88      0.88      0.88         8

    accuracy                           0.86        36
   macro avg       0.87      0.88      0.86        36
weighted avg       0.89      0.86      0.86        36



In [5]:
# 2 random forest 사용

from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(random_state=32)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00        17
           2       1.00      1.00      1.00        12

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



In [6]:
# 3 SVM 사용

from sklearn import svm
svm_model = svm.SVC()

print(svm_model._estimator_type)
svm_model = svm.SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

print(classification_report(y_test, y_pred))

classifier
              precision    recall  f1-score   support

           0       0.86      0.86      0.86         7
           1       0.58      0.88      0.70        17
           2       0.33      0.08      0.13        12

    accuracy                           0.61        36
   macro avg       0.59      0.61      0.56        36
weighted avg       0.55      0.61      0.54        36



In [21]:
#4 SGD Classifier 모델

from sklearn.linear_model import SGDClassifier
sgd_model = SGDClassifier()

sgd_model.fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.57      1.00      0.72        17
           1       0.67      0.36      0.47        11
           2       0.00      0.00      0.00         8

    accuracy                           0.58        36
   macro avg       0.41      0.45      0.40        36
weighted avg       0.47      0.58      0.49        36



  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# 로지스틱회귀모델

from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression()

logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.86      0.92         7
           1       0.94      1.00      0.97        17
           2       1.00      1.00      1.00        12

    accuracy                           0.97        36
   macro avg       0.98      0.95      0.96        36
weighted avg       0.97      0.97      0.97        36



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### 평가



중요한 평가 요인으로 f1-score을 선택했다. 소믈리에 입장을 가정했을 때 와인에 대해 맞는 것을 틀렸다고 하는 경우와 틀린 것을 맞았다고 하는 경우 모두 동등한 수준의 문제를 가지고 있기 때문에 해당 수치의 조화 평균인 f1-score의 macro avg를 기준으로 평가했다. 
좋은 성능은 랜덤 포레스트, 로지스틱 회귀모델, 의사결정나무, SVM, SGD 순서이다. 그러나 SGD와 SVM은 다른 모델에 비해 성능이 크게 떨어진다.