# 와인 분류하기

## 1. 필요한 모듈 import

In [55]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import time

def get_item_count(li):
    """Count how many times each item occurs in *li*.

    Args:
        li: Iterable of hashable items (e.g. a list of class labels).

    Returns:
        A plain dict mapping each distinct item to its number of
        occurrences, with keys in order of first appearance.

    Raises:
        TypeError: If an item is unhashable and so cannot be a dict key.
    """
    counts = {}
    for item in li:
        # dict.get supplies the 0 for first-time keys, so no exception is
        # needed for control flow and unrelated errors are never swallowed.
        counts[item] = counts.get(item, 0) + 1
    return counts

## 2. 데이터 준비

In [56]:
# Load the wine dataset that ships with scikit-learn (sklearn.datasets).
wine = load_wine()

## 3. 데이터 확인

In [57]:
# List the fields available on the loaded dataset object.
print(wine.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])


In [59]:
# Print the dataset's full text description.
print(wine.DESCR)
# Short aliases for the parts of the dataset used by the later cells:
# the feature matrix, the class labels, and the feature names.
wine_data = wine.data
wine_label = wine.target
wine_feature = wine.feature_names
# Show the names of the target classes.
print(wine.target_names)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

## 4. 훈련/테스트 데이터 분리

In [60]:
# Hold out 20% of the samples as a test set; the fixed random_state makes the
# split repeatable across runs.
# NOTE(review): no `stratify=` is passed, so the class proportions of the two
# splits are left to chance — consider stratify=wine_label; confirm intent.
X_train, X_test, y_train, y_test = train_test_split(wine_data,
                                                    wine_label,
                                                    test_size=0.2,
                                                    random_state=12)
# Print the shapes of the four resulting arrays as a sanity check.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(142, 13) (36, 13) (142,) (36,)


## 5. 학습

### 5.1. Decision Tree

In [61]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree with scikit-learn's default hyperparameters, time the
# fit, and report per-class precision/recall/F1 on the held-out test split.
# NOTE(review): no random_state is set, so the fitted tree (and its scores)
# may vary between runs — consider pinning a seed.
tree = DecisionTreeClassifier()
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() follows the wall clock and can be too coarse
# or jump when the system clock is adjusted.
start = time.perf_counter()
tree.fit(X_train, y_train)
print("학습소요시간: %.3f초" % (time.perf_counter() - start))
y_pred = tree.predict(X_test)

print(classification_report(y_test, y_pred))

학습소요시간: 0.011초
              precision    recall  f1-score   support

           0       0.82      1.00      0.90        14
           1       1.00      0.91      0.95        11
           2       1.00      0.82      0.90        11

    accuracy                           0.92        36
   macro avg       0.94      0.91      0.92        36
weighted avg       0.93      0.92      0.92        36



### 5.2. Random Forest

In [62]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with scikit-learn's default hyperparameters, time the
# fit, and report per-class precision/recall/F1 on the held-out test split.
# NOTE(review): no random_state is set, so the forest (and its scores) may
# vary between runs — consider pinning a seed.
forest = RandomForestClassifier()
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals, unlike the adjustable wall clock behind time.time().
start = time.perf_counter()
forest.fit(X_train, y_train)
print("학습소요시간: %.3f초" % (time.perf_counter() - start))
y_pred = forest.predict(X_test)

print(classification_report(y_test, y_pred))

학습소요시간: 0.150초
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      0.91      0.95        11
           2       0.92      1.00      0.96        11

    accuracy                           0.97        36
   macro avg       0.97      0.97      0.97        36
weighted avg       0.97      0.97      0.97        36



### 5.3. SVM

In [68]:
from sklearn.svm import SVC

# Fit a support vector classifier with default hyperparameters, time the fit,
# and report per-class precision/recall/F1 on the held-out test split.
# NOTE(review): the features are passed in unscaled. SVC is sensitive to
# feature scale, which likely explains a class receiving no predictions —
# consider wrapping it in a pipeline with StandardScaler.
svc = SVC()
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals, unlike the adjustable wall clock behind time.time().
start = time.perf_counter()
svc.fit(X_train, y_train)
print("학습소요시간: %.3f초" % (time.perf_counter() - start))
y_pred = svc.predict(X_test)
# Show the raw predictions to see which classes are actually being predicted.
print(y_pred)
print(classification_report(y_test, y_pred))

학습소요시간: 0.006초
[1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 2 1 0 0 1 1 0 0 1 1 1 2 0 1 0 1 0 1]
              precision    recall  f1-score   support

           0       1.00      0.86      0.92        14
           1       0.50      1.00      0.67        11
           2       0.00      0.00      0.00        11

    accuracy                           0.64        36
   macro avg       0.50      0.62      0.53        36
weighted avg       0.54      0.64      0.56        36



### 5.4. SGD Classifier

In [69]:
from sklearn.linear_model import SGDClassifier

# Fit a linear classifier trained with stochastic gradient descent (default
# hyperparameters), time the fit, and report per-class precision/recall/F1 on
# the held-out test split.
# NOTE(review): the features are passed in unscaled and no random_state is
# set. SGD is sensitive to feature scale and its result may vary between runs,
# which likely explains the unstable scores — consider StandardScaler in a
# pipeline and a fixed seed.
sgd = SGDClassifier()
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals, unlike the adjustable wall clock behind time.time().
start = time.perf_counter()
sgd.fit(X_train, y_train)
print("학습소요시간: %.3f초" % (time.perf_counter() - start))
y_pred = sgd.predict(X_test)

print(classification_report(y_test, y_pred))

학습소요시간: 0.014초
              precision    recall  f1-score   support

           0       1.00      0.86      0.92        14
           1       0.00      0.00      0.00        11
           2       0.46      1.00      0.63        11

    accuracy                           0.64        36
   macro avg       0.49      0.62      0.52        36
weighted avg       0.53      0.64      0.55        36



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 5.5. Logistic Regression

In [66]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression classifier, time the fit, and report per-class
# precision/recall/F1 on the held-out test split.
# max_iter: upper limit on the number of iterations the solver may take while
# searching for a solution.
# NOTE(review): presumably raised to 5000 because the default limit did not
# converge on the unscaled features — confirm; scaling would likely also make
# the fit faster.
logistic = LogisticRegression(max_iter=5000)
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals, unlike the adjustable wall clock behind time.time().
start = time.perf_counter()
logistic.fit(X_train, y_train)
print("학습소요시간: %.3f초" % (time.perf_counter() - start))
y_pred = logistic.predict(X_test)

print(classification_report(y_test, y_pred))

학습소요시간: 0.941초
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      0.91      0.95        11
           2       0.92      1.00      0.96        11

    accuracy                           0.97        36
   macro avg       0.97      0.97      0.97        36
weighted avg       0.97      0.97      0.97        36



## 6. 결과

### 6.1. 모델 평가
> 1. 클래스별로 훈련에 사용된 데이터의 개수가 많이 차이 나고,
2. 실제로 class2 와인(N)이 class1 와인(P)에 섞이지 않는 것이 중요하기 때문에
3. $\dfrac{\text{실제 True를 True로 예측한 수 (TP)}}{\text{전체 True로 예측한 수 (TP+FP)}}$ 로 정의되는
4. *Precision*을 평가지표로 사용했다

* Random Forest와 LogisticRegression은 0.97의 높은 정밀도를 가졌고, 둘 중 Random Forest의 속도가 눈에 띄게 빨랐다.
* Decision Tree는 속도가 매우 빠르고 0.9 정도의 괜찮은 정밀도를 보여줬다.
* SVM과 SGD는 평균 정밀도가 0.5 정도(accuracy는 0.64)로 낮았다.
* SVM과 SGD는 가끔 특정 클래스를 전혀 분류해내지 못했다.