In [2]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
wine = load_wine()
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [None]:
dtclf = DecisionTreeClassifier()

data = wine.data
label = wine.target

In [None]:
print('feature: ', data.shape) 
print('label : ', label.shape)

print('target: ', wine.target_names)

feature:  (178, 13)
label :  (178,)
target:  ['class_0' 'class_1' 'class_2']


In [None]:
# 특성을 7:3으로 나눔
xtrain, xtest, ytrain, ytest = train_test_split(data, label, test_size=0.3, random_state=2201061230)

print('xtrain의 수: ', len(xtrain), 'xtest의 수: ', len(xtest))
print(xtrain.shape, xtest.shape)
print(ytrain.shape, ytest.shape)

xtrain의 수:  124 xtest의 수:  54
(124, 13) (54, 13)
(124,) (54,)


In [None]:
# 의사결정나무
decision_tree = DecisionTreeClassifier(random_state=2201061230)
decision_tree.fit(xtrain, ytrain)
ypred = decision_tree.predict(xtest)

print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.77      0.89      0.83        19
           1       0.79      0.68      0.73        22
           2       0.85      0.85      0.85        13

    accuracy                           0.80        54
   macro avg       0.80      0.81      0.80        54
weighted avg       0.80      0.80      0.79        54



In [None]:
# 랜덤 포레스트
from sklearn.ensemble import RandomForestClassifier

xtrain, xtest, ytrain, ytest = train_test_split(data, label, test_size=0.2, random_state=2201061230)

random_forest = RandomForestClassifier(random_state=2201061230)
random_forest.fit(xtrain, ytrain)
ypred = random_forest.predict(xtest)

print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       1.00      0.83      0.91        12
           1       0.87      0.93      0.90        14
           2       0.91      1.00      0.95        10

    accuracy                           0.92        36
   macro avg       0.93      0.92      0.92        36
weighted avg       0.92      0.92      0.92        36



In [None]:
# SVM 모델
from sklearn import svm
svm_model = svm.SVC()

svm_model.fit(xtrain, ytrain)
ypred = svm_model.predict(xtest)

print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.90      0.75      0.82        12
           1       0.67      0.86      0.75        14
           2       0.50      0.40      0.44        10

    accuracy                           0.69        36
   macro avg       0.69      0.67      0.67        36
weighted avg       0.70      0.69      0.69        36



In [None]:
# SGD 모델
from sklearn.linear_model import SGDClassifier
sgd_model = SGDClassifier()

sgd_model.fit(xtrain, ytrain)
ypred = sgd_model.predict(xtest)

print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       1.00      0.75      0.86        12
           1       0.52      1.00      0.68        14
           2       0.00      0.00      0.00        10

    accuracy                           0.64        36
   macro avg       0.51      0.58      0.51        36
weighted avg       0.53      0.64      0.55        36



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# 로지스틱 회귀
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression()

logistic_model.fit(xtrain, ytrain)
ypred = logistic_model.predict(xtest)

print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       1.00      0.75      0.86        12
           1       0.81      0.93      0.87        14
           2       0.91      1.00      0.95        10

    accuracy                           0.89        36
   macro avg       0.91      0.89      0.89        36
weighted avg       0.90      0.89      0.89        36



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### 따라서, SGD와 SVM 모델은 정확도가 다른 모델에 비해 낮음을 알 수 있음