# Wine Classifier
---
#### 주요 모듈 import

In [27]:
from sklearn.datasets import load_wine # load wine data
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix # Use confusion_matrix
from sklearn.tree import DecisionTreeClassifier # Use DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier # Use RandomForestClassifier
from sklearn import svm # Use Support Vector Machine(SVM)
from sklearn.linear_model import SGDClassifier # Use Stochastic Gradient Descent Classifier (SGDClassifier)
from sklearn.linear_model import LogisticRegression # Use LogisticRegression
from sklearn.metrics import accuracy_score # 정확도 확인
import pandas as pd

## 데이터 이해하기
---
- Feature Data 지정하기
- Label Data 지정하기
- Target Names 출력해 보기
- 데이터 Describe 해 보기

In [21]:
# Load the wine dataset bundled with scikit-learn and list the fields it exposes.
wine = load_wine()
print(wine.keys())

# Feature matrix (wine.data) and class labels (wine.target).
wine_data = wine.data
wine_label = wine.target
print(f"wine_data.shape: {wine_data.shape}")
print(f"wine_label.shape: {wine_label.shape}")

# Class names and feature (column) names.
print(f"wine.target_names: {wine.target_names}")
# Uncomment to print the full dataset description:
# print(wine.DESCR)
print(wine.feature_names)

# Combine features and labels into one DataFrame, with the label as the last column.
wine_df = pd.DataFrame(data=wine_data, columns=wine.feature_names).assign(label=wine_label)
print(wine_df)

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])
wine_data.shape: (178, 13)
wine_label.shape: (178,)
wine.target_names: ['class_0' 'class_1' 'class_2']
['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
     alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0      14.23        1.71  2.43               15.6      127.0           2.80   
1      13.20        1.78  2.14               11.2      100.0           2.65   
2      13.16        2.36  2.67               18.6      101.0           2.80   
3      14.37        1.95  2.50               16.8      113.0           3.85   
4      13.24        2.59  2.87               21.0      118.0           2.80   
..       ...         ...   ...                ...        ...            ...   
173    13.71        5.65  2.45               20.5    

## train,test 데이터 분리
---
데이터 분리를 위해 sklearn.model_selection 사용, 추가로 shuffle 속성을 True로 함

In [22]:
# Hold out 20% of the samples as a test set. The rows are shuffled before
# splitting, and the fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    wine_data,
    wine_label,
    test_size=0.2,
    shuffle=True,
    random_state=5,
)

## 다양한 모델로 학습시켜보기
---
- Decision Tree 사용해 보기
- Random Forest 사용해 보기
- SVM 사용해 보기
- SGD Classifier 사용해 보기
- Logistic Regression 사용해 보기

In [25]:
# Train five classifiers on the same train/test split and print the test-set
# accuracy of each one. Estimators that accept a seed are given random_state=32
# so that Restart & Run All reproduces the same scores.
#
# NOTE(review): the features are passed in unscaled. The scikit-learn docs
# recommend scaling inputs for SVC and SGDClassifier - consider adding a
# StandardScaler step before comparing these models.

# Decision tree
decision_tree = DecisionTreeClassifier(random_state=32)
decision_tree.fit(X_train, y_train)
y_pred_DC = decision_tree.predict(X_test)
DC_accuary = accuracy_score(y_test, y_pred_DC)
print(f"DecisionTree accuarcy: {DC_accuary}")

# Random forest
random_forest = RandomForestClassifier(random_state=32)
random_forest.fit(X_train, y_train)
y_pred_RF = random_forest.predict(X_test)
RF_accuary = accuracy_score(y_test, y_pred_RF)
print(f"RandomForestClassifier accuarcy: {RF_accuary}")

# Support Vector Machine (SVM)
clf = svm.SVC(random_state=32)
clf.fit(X_train, y_train)
y_pred_SVM = clf.predict(X_test)
SVM_accuary = accuracy_score(y_test, y_pred_SVM)
print(f"Support Vector Machine accuarcy: {SVM_accuary}")

# Stochastic Gradient Descent classifier with the perceptron loss.
# SGDClassifier shuffles the training data between epochs, so it is seeded
# like the other models; without random_state its score changes on every run.
clf = SGDClassifier(loss="perceptron", random_state=32).fit(X_train, y_train)
y_pred_SGDC = clf.predict(X_test)
SGDC_accuary = accuracy_score(y_test, y_pred_SGDC)
print(f"SGDClassifier accuarcy: {SGDC_accuary}")

# Logistic regression; max_iter is raised to 3000 from the library default.
logistic_model = LogisticRegression(max_iter=3000)
clf = logistic_model.fit(X_train, y_train)
y_pred_LOGR = clf.predict(X_test)
LOGR_accuary = accuracy_score(y_test, y_pred_LOGR)
print(f"LogisticRegression accuarcy: {LOGR_accuary}")

DecisionTree accuarcy: 0.8611111111111112
RandomForestClassifier accuarcy: 1.0
Support Vector Machine accuarcy: 0.6944444444444444
SGDClassifier accuarcy: 0.5833333333333334
LogisticRegression accuarcy: 0.9444444444444444


## Using classification_report

In [26]:
# Print the classification report for each model's test-set predictions,
# in the same order the models were trained.
model_reports = [
    ("Use DecisionTreeClassifier", y_pred_DC),
    ("Use RandomForestClassifier", y_pred_RF),
    ("Use Support Vector Machine(SVM)", y_pred_SVM),
    ("Use Stochastic Gradient Descent Classifier (SGDClassifier)", y_pred_SGDC),
    ("Use LogisticRegression", y_pred_LOGR),
]
for report_title, report_pred in model_reports:
    print(report_title)
    print(classification_report(y_test, report_pred))

Use DecisionTreeClassifier
              precision    recall  f1-score   support

           0       1.00      0.76      0.87        17
           1       0.73      1.00      0.85        11
           2       0.88      0.88      0.88         8

    accuracy                           0.86        36
   macro avg       0.87      0.88      0.86        36
weighted avg       0.89      0.86      0.86        36

Use RandomForestClassifier
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       1.00      1.00      1.00        11
           2       1.00      1.00      1.00         8

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

Use Support Vector Machine(SVM)
              precision    recall  f1-score   support

           0       0.86      0.71      0.77        17
           1       0.69      0.82      0.75

# Model evaluation
---
Wine 분류기의 경우 precision, recall의 중요성이 명확하게 드러나지 않는다고 생각합니다.

Wine 분류기에선 각 모델마다 편차가 꽤 심한 편인데 그중에 RandomForest와 LogisticRegression의 Accuracy가 

높게 나왔음을 알 수 있습니다. 특히 RandomForest는 100%의 놀라운 Accuracy를 보이고 있는데,

이는 RandomForest 특성상 각각의 특성들을 Random하게 모두 비교하기 때문에 13가지의 특성을 가지고 있는 wine

data를 분류하는데에 적합했다고 생각합니다.

cf. 각각의 모델의 원리를 아직은 정확하게 알지 못하지만, 이해하는 대로 평가 Comment를 추가로 업로드하겠습니다.