## wine 예제

### import

In [72]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

### CSV 파일 로딩하기

In [13]:
file_path = './wine.csv'
df = pd.read_csv(file_path)

df

Unnamed: 0,Wine,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [20]:
### 결측치 확인

print("shape: ",df.shape)
print("columns: ", df.columns)
print("=== 결측치 현황 ===")
print(df.isnull().sum())

shape:  (178, 14)
columns:  Index(['Wine', 'Alcohol', 'Malic.acid', 'Ash', 'Acl', 'Mg', 'Phenols',
       'Flavanoids', 'Nonflavanoid.phenols', 'Proanth', 'Color.int', 'Hue',
       'OD', 'Proline'],
      dtype='object')
=== 결측치 현황 ===
Wine                    0
Alcohol                 0
Malic.acid              0
Ash                     0
Acl                     0
Mg                      0
Phenols                 0
Flavanoids              0
Nonflavanoid.phenols    0
Proanth                 0
Color.int               0
Hue                     0
OD                      0
Proline                 0
dtype: int64


### 레이블 분포 확인

In [24]:
print("\n=== 레이블 분포 ===")
print(df['Wine'].value_counts())


=== 레이블 분포 ===
Wine
2    71
1    59
3    48
Name: count, dtype: int64


### 레이블 인코딩 
wine 데이터에는 범주형 데이터가 없기 때문에 인코딩이 필요치 않지만 한번 해봄

In [51]:
label_encoder = LabelEncoder()

print("\n=== Alcohol 컬럼 인코딩 전 ===")
print(df['Alcohol'].value_counts())

columns_to_encode = ['Alcohol']

for column in columns_to_encode:
    df[column] = label_encoder.fit_transform(df[column])


print("\n=== Alcohol 컬럼 인코딩 후 ===")
print(df['Alcohol'].value_counts())



=== Alcohol 컬럼 인코딩 전 ===
Alcohol
59     6
31     6
20     5
27     4
32     3
      ..
93     1
70     1
95     1
98     1
112    1
Name: count, Length: 126, dtype: int64

=== Alcohol 컬럼 인코딩 후 ===
Alcohol
59     6
31     6
20     5
27     4
32     3
      ..
93     1
70     1
95     1
98     1
112    1
Name: count, Length: 126, dtype: int64


### 특성과 레이블 분리

In [53]:
X = df.drop('Wine', axis=1)
y = df['Wine']
# 훈련 및 테스트 세트 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### X_train, X_test, y_train, y_test의 shape 확인

In [56]:
print("\n=== 데이터셋 Shape ===")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


=== 데이터셋 Shape ===
X_train shape: (142, 13)
X_test shape: (36, 13)
y_train shape: (142,)
y_test shape: (36,)


### 5가지 분류 (RF, DT, LR, KNN, SVM) 하고, accuracy 및 confusion matrix 확인

In [None]:
# 분류기 초기화
classifiers = {
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC()
}

# 모델 학습 및 평가
results = []
for name, clf in classifiers.items():
    clf.fit(X_train, y_train) # 모델 학습
    y_pred = clf.predict(X_test) # 테스트 데이터 예측
    acc = accuracy_score(y_test, y_pred) # 정확도 계산
    cm = confusion_matrix(y_test, y_pred) # 혼동 행렬 계산 (TP|FN|FP|TN)
    cr = classification_report(y_test, y_pred, zero_division=0) # 분류 보고서

    # 성능평가에서 average='micro'를 설정한 이유: wine의 타겟변수 y는 클래스가 3개이기 때문에
    # 기본 설정된 이진탐색(binary)으로 하면 오류발생, 전체 샘플을 기준으로 탐색하는'micro'로 변경
    f1 = f1_score(y_test, y_pred, average='micro') # f1값 계산 
    ps = precision_score(y_test, y_pred, average='micro') # 정확도 계산 
    rs = recall_score(y_test, y_pred, average='micro') # 재현율 계산
    results.append((name, acc, cm, cr, f1, ps, rs)) # 결과를 리스트에 저장

# 결과 출력
print("\n=== 모델 성능 비교 ===")
for name, acc, cm, cr, f1, ps, rs in results:
    print(f"\n======={name}=======")
    print(f"Accuracy: {acc:.4f}")
    print(f"f1_score: {f1:.4f}")
    print(f"precision_score: {ps:.4f}")
    print(f"recall_score: {rs:.4f}")
    print(cr)
    


=== 모델 성능 비교 ===

Accuracy: 1.0000
f1_score: 1.0000
precision_score: 1.0000
recall_score: 1.0000
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        14
           3       1.00      1.00      1.00         8

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36


Accuracy: 0.9444
f1_score: 0.9444
precision_score: 0.9444
recall_score: 0.9444
              precision    recall  f1-score   support

           1       0.93      0.93      0.93        14
           2       0.93      1.00      0.97        14
           3       1.00      0.88      0.93         8

    accuracy                           0.94        36
   macro avg       0.95      0.93      0.94        36
weighted avg       0.95      0.94      0.94        36


Accuracy: 1.0000
f1_score: 1.0000
precision_score: 1.0000
r

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [90]:
for name, acc, cm, cr, f1, ps, rs in results:
    print(f"\n======={name}=======")
    print("Confusion Matrix:")
    print(cm)


Confusion Matrix:
[[14  0  0]
 [ 0 14  0]
 [ 0  0  8]]

Confusion Matrix:
[[13  1  0]
 [ 0 14  0]
 [ 1  0  7]]

Confusion Matrix:
[[14  0  0]
 [ 0 14  0]
 [ 0  0  8]]

Confusion Matrix:
[[14  0  0]
 [ 0 13  1]
 [ 1  1  6]]

Confusion Matrix:
[[14  0  0]
 [ 0 11  3]
 [ 0  3  5]]
