# 와인 분류해보기

## 1. 필요한 모듈 import 하기

In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

---

## 2. 데이터 준비

In [2]:
# Load the wine dataset object via sklearn.datasets.load_wine.
wine = load_wine()

In [3]:
# List the attribute names available on the loaded dataset object.
print(dir(wine))

['DESCR', 'data', 'feature_names', 'frame', 'target', 'target_names']


In [4]:
# Show the dictionary-style keys of the dataset object.
wine.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])

---

## 3. 데이터 이해하기

#### Feature Data 지정하기

In [5]:
# Feature matrix used as the model input (one row per sample).
wine_data = wine.data

In [6]:
# Check the feature matrix dimensions: (number of samples, number of features).
print(wine_data.shape)

(178, 13)


In [7]:
# Peek at the feature values of the first sample.
wine_data[0]

array([1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
       3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
       1.065e+03])

In [16]:
# Show the names of the features.
wine.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

#### Label Data 지정하기

In [8]:
# Label vector used as the prediction target (one class index per sample).
wine_label = wine.target

In [9]:
# Check that the label vector has one entry per sample.
print(wine_label.shape)

(178,)


In [14]:
# Inspect the raw label values.
wine_label

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

#### Target Names 출력해 보기

In [15]:
# Show the class names that the label indices refer to.
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

#### 데이터 Describe 해 보기

In [17]:
# Print the description text that ships with the dataset object.
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

---

## 4. train, test 데이터 분리

In [43]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps
# the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    wine_data,
    wine_label,
    test_size=0.2,
    random_state=30,
)

# Report how many samples ended up in each split.
print('X_train 개수: ', len(X_train), ', X_test 개수: ', len(X_test))

X_train 개수:  142 , X_test 개수:  36


---

## 5. 다양한 모델로 학습시켜보기

### 1) Decision Tree

In [44]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the training split (seeded via random_state),
# predict the held-out test split, and print the per-class metrics.
decision_tree = DecisionTreeClassifier(random_state=32).fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.75      0.82      0.78        11
           1       0.81      0.81      0.81        16
           2       0.88      0.78      0.82         9

    accuracy                           0.81        36
   macro avg       0.81      0.80      0.81        36
weighted avg       0.81      0.81      0.81        36



### 2) Random Forest

In [45]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the training split (seeded via random_state),
# predict the held-out test split, and print the per-class metrics.
random_forest = RandomForestClassifier(random_state=32).fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       1.00      0.94      0.97        16
           2       1.00      1.00      1.00         9

    accuracy                           0.97        36
   macro avg       0.97      0.98      0.97        36
weighted avg       0.97      0.97      0.97        36



### 3) Support Vector Machine (SVM)

In [46]:
from sklearn import svm

# Train a support vector classifier with its default settings, predict the
# held-out test split, and print the per-class metrics.
# NOTE(review): the features are passed in unscaled even though their ranges
# differ by orders of magnitude; SVC is presumably sensitive to that --
# consider standardizing the inputs and re-checking the scores.
svm_model = svm.SVC().fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.89      0.73      0.80        11
           1       0.79      0.69      0.73        16
           2       0.46      0.67      0.55         9

    accuracy                           0.69        36
   macro avg       0.71      0.69      0.69        36
weighted avg       0.74      0.69      0.71        36



### 4) Stochastic Gradient Descent Classifier (SGDClassifier)

In [47]:
from sklearn.linear_model import SGDClassifier

# Train a linear classifier with stochastic gradient descent, allowing up to
# 5000 iterations, then predict the held-out test split and print the
# per-class metrics.
# NOTE(review): unlike the tree-based cells, no random_state is set here, so
# the scores may differ between runs -- confirm and consider fixing a seed.
# The unscaled features are presumably also hurting this model.
sgd_model = SGDClassifier(max_iter=5000).fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      0.73      0.84        11
           1       1.00      0.06      0.12        16
           2       0.33      1.00      0.50         9

    accuracy                           0.50        36
   macro avg       0.78      0.60      0.49        36
weighted avg       0.83      0.50      0.43        36



### 5) Logistic Regression

In [48]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression classifier, allowing up to 5000 solver
# iterations, then predict the held-out test split and print the per-class
# metrics.
logistic_model = LogisticRegression(max_iter=5000).fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       1.00      0.94      0.97        16
           2       1.00      1.00      1.00         9

    accuracy                           0.97        36
   macro avg       0.97      0.98      0.97        36
weighted avg       0.97      0.97      0.97        36



---

## 6. 모델을 평가해보기

와인을 분류하는 문제도 마찬가지로 target의 개수가 3개이고, 그에 따라 와인이 어느 클래스에 속할지를 판단하는 문제이기 때문에

accuracy로 모델의 성능을 평가해야 하는 것 같다. 이 중에 제일 성능이 좋은 모델은 97%의 accuracy를 가진 Random Forest와 Logistic Regression이다.

아무래도 고려해야 할 feature의 수가 꽤 많은 편에 속하고, 각 클래스로 분류될 때 그 경계를 판단하기가 어려운 feature들이 많아서

SVM과 SGD의 성능은 떨어지는 것 같다. 두 모델의 precision과 recall을 살펴봤을 때 판단 정확도가 낮은 것 같다. 또한 feature마다 값의 범위가 크게 달라서(예: proline은 1,000 안팎, hue는 1 안팎) 스케일링 없이 학습한 것도 두 모델의 성능이 낮은 원인일 수 있다. 다만 전체적으로 dataset의 샘플 수가 적어서 판단하기가 좀 어렵긴 하다.

이번엔 SGD에서도 warning이 뜨고 값이 너무 이상하게 나와서 max_iter를 지정했더니 warning은 사라졌다.