## car_evaluation 예제

### import

In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

### CSV 파일 로딩

In [12]:
file_path = './car_evaluation.csv'
df = pd.read_csv(file_path)

df

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc
...,...,...,...,...,...,...,...
1722,low,low,5more,more,med,med,good
1723,low,low,5more,more,med,high,vgood
1724,low,low,5more,more,big,low,unacc
1725,low,low,5more,more,big,med,good


In [16]:
print("=== 결측치 현황 ===")
print(df.isnull().sum())

=== 결측치 현황 ===
vhigh      0
vhigh.1    0
2          0
2.1        0
small      0
low        0
unacc      0
dtype: int64


In [24]:
print("shape: ", df.shape)
print("columns: ", df.columns)

shape:  (1727, 7)
columns:  Index(['vhigh', 'vhigh.1', '2', '2.1', 'small', 'low', 'unacc'], dtype='object')


In [None]:
print("\n=== 레이블 분포 ===")
print(df['unacc'].value_counts())


=== 레이블 분포 ===
unacc
unacc    1209
acc       384
good       69
vgood      65
Name: count, dtype: int64


### 레이블 인코딩

범주형 데이터를 '정수'로 변환

In [27]:
label_encoder = LabelEncoder()

# 인코딩할 컬럼 리스트
columns_to_encode = df.copy()

print("\n=== unacc 컬럼 인코딩 전 ===")
print(df['unacc'].value_counts())

for column in columns_to_encode:
    df[column] = label_encoder.fit_transform(df[column]) 

print("\n=== unacc 컬럼 인코딩 후 ===")
print(df['unacc'].value_counts())


=== unacc 컬럼 인코딩 전 ===
unacc
unacc    1209
acc       384
good       69
vgood      65
Name: count, dtype: int64

=== unacc 컬럼 인코딩 후 ===
unacc
2    1209
0     384
1      69
3      65
Name: count, dtype: int64


### 특성과 레이블 분리

In [33]:
X = df.drop('unacc', axis=1)
y = df['unacc'] # 레이블
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### X_train, X_test, y_train, y_test의 shape 확인

In [61]:
print("\n=== 데이터셋 Shape ===")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

print("\n=== 실제 레이블 분포 (y_test) ===")
print(pd.Series(y_test).value_counts())

print("\n=== 예측 레이블 분포 (y_pred) ===")
print(pd.Series(y_pred).value_counts())


=== 데이터셋 Shape ===
X_train shape: (1381, 6)
X_test shape: (346, 6)
y_train shape: (1381,)
y_test shape: (346,)

=== 실제 레이블 분포 (y_test) ===
unacc
2    237
0     77
3     17
1     15
Name: count, dtype: int64

=== 예측 레이블 분포 (y_pred) ===
2    252
0     78
3     14
1      2
Name: count, dtype: int64


### 5가지 분류 (RF, DT, LR, KNN, SVM) 하고, accuracy 및 confusion matrix 확인

In [63]:
# 분류기 초기화
classifiers = {
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC()
}

# 모델 학습 및 평가
results = []
for name, clf in classifiers.items():
    clf.fit(X_train, y_train) # 모델 학습
    y_pred = clf.predict(X_test) # 테스트 데이터 예측
    acc = accuracy_score(y_test, y_pred) # 정확도 계산
    cm = confusion_matrix(y_test, y_pred) # 혼동 행렬 계산 (TP|FN|FP|TN)
    cr = classification_report(y_test, y_pred) # 분류 보고서
    results.append((name, acc, cm, cr)) # 결과를 리스트에 저장

# 결과 출력
print("=== 모델 성능 비교 ===")
for name, acc, cm, cr in results:
    print(f"\n======={name}=======")
    print(f"Accuracy: {acc:.4f}")
    print(cr)
    print("Confusion Matrix:")
    print(cm)

=== 모델 성능 비교 ===

Accuracy: 0.9653
              precision    recall  f1-score   support

           0       0.95      0.95      0.95        77
           1       0.91      0.67      0.77        15
           2       0.99      1.00      0.99       237
           3       0.75      0.88      0.81        17

    accuracy                           0.97       346
   macro avg       0.90      0.87      0.88       346
weighted avg       0.97      0.97      0.96       346

Confusion Matrix:
[[ 73   1   2   1]
 [  1  10   0   4]
 [  1   0 236   0]
 [  2   0   0  15]]

Accuracy: 0.9711
              precision    recall  f1-score   support

           0       0.96      0.95      0.95        77
           1       0.75      0.80      0.77        15
           2       1.00      1.00      1.00       237
           3       0.83      0.88      0.86        17

    accuracy                           0.97       346
   macro avg       0.89      0.91      0.90       346
weighted avg       0.97      0.97    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
