#### 머신러닝 classification 순서 ####
데이터 불러오기 -> 데이터 전처리(결측치, 불필요 열 제거 / 레이블 분포 확인(인코딩 전) / 인코딩 / 레이블 분포 확인(인코딩 후) / 데이터 X, y로 나누기)
-> 데이터 분할 -> 데이터 표준화(훈련 데이터로 fit, 테스트 데이터에는 transform만 적용) -> 모델 선택 및 학습 -> 모델 예측 -> 모델 평가

In [1]:
# 모듈 import

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the data

# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
file_path = r"C:\Users\tjdgu\CodingFiles\AI_Introduction\week3\car_evaluation.csv"
df = pd.read_csv(file_path, header=None)    # the CSV has no column-name row -> header=None

df

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [3]:
# The file was read without column names -> assign them explicitly
# (six feature columns followed by the 'acceptability' label column).
df.columns = ['price', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'acceptability']

df.columns

Index(['price', 'maint', 'doors', 'persons', 'lug_boot', 'safety',
       'acceptability'],
      dtype='object')

In [4]:
# Display the DataFrame again, now with the assigned column names.
df

Unnamed: 0,price,maint,doors,persons,lug_boot,safety,acceptability
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [5]:
# Check for missing values: number of nulls in each column
df.isnull().sum()

price            0
maint            0
doors            0
persons          0
lug_boot         0
safety           0
acceptability    0
dtype: int64

In [6]:
# Check the label distribution (before encoding): row count per 'acceptability' class
df['acceptability'].value_counts()

acceptability
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64

In [7]:
# Encode the categorical columns as integer codes.
#
# One LabelEncoder is kept per column. Re-fitting a single shared encoder on
# every column would overwrite its classes_ each iteration, so the
# code -> category mapping of every column but the last would be lost and the
# encoded values could no longer be decoded with inverse_transform().
#
# NOTE(review): LabelEncoder numbers categories in sorted label order
# (e.g. 'acc' -> 0, 'good' -> 1, 'unacc' -> 2, 'vgood' -> 3), which is not the
# natural ranking of levels such as low < med < high. Consider an explicit
# ordinal mapping for the feature columns if order-sensitive models are used.

# Columns to encode (all six features plus the target)
columns_to_encode = ['price', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'acceptability']

# column name -> fitted encoder, so every mapping stays available afterwards
encoders = {column: LabelEncoder() for column in columns_to_encode}

for column, encoder in encoders.items():
    df[column] = encoder.fit_transform(df[column])

# Backward compatibility: the original single `label_encoder` ended up fitted
# on the last encoded column, so keep that name bound to that column's encoder.
label_encoder = encoders[columns_to_encode[-1]]

df

Unnamed: 0,price,maint,doors,persons,lug_boot,safety,acceptability
0,3,3,0,0,2,1,2
1,3,3,0,0,2,2,2
2,3,3,0,0,2,0,2
3,3,3,0,0,1,1,2
4,3,3,0,0,1,2,2
...,...,...,...,...,...,...,...
1723,1,1,3,2,1,2,1
1724,1,1,3,2,1,0,3
1725,1,1,3,2,0,1,2
1726,1,1,3,2,0,2,1


In [8]:
# Check the label distribution (after encoding): row count per encoded 'acceptability' class
df['acceptability'].value_counts()

acceptability
2    1210
0     384
1      69
3      65
Name: count, dtype: int64

In [9]:
# Target vector: the 'acceptability' column on its own
y = df['acceptability']

# Feature matrix: every column except 'acceptability'
X = df.drop(columns=['acceptability'])

In [10]:
# Preview the first rows of the feature matrix X.
X.head()

Unnamed: 0,price,maint,doors,persons,lug_boot,safety
0,3,3,0,0,2,1
1,3,3,0,0,2,2
2,3,3,0,0,2,0
3,3,3,0,0,1,1
4,3,3,0,0,1,2


In [11]:
# Preview the first entries of the target vector y.
y.head()

0    2
1    2
2    2
3    2
4    2
Name: acceptability, dtype: int32

In [12]:
# Split into training and test sets at an 8:2 ratio; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified although the classes are imbalanced —
# consider stratify=y if per-class proportions should match across the two sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Standardize the features: the scaler is fitted on the training split only,
# then that same fitted scaler transforms both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [13]:
# Check the shapes of the training and test data
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1382, 6), (346, 6), (1382,), (346,))

In [14]:
# Decision Tree classification: train on the training split, predict the test split.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)  # fit() returns the estimator
dt_y_pred = dt_model.predict(X_test)

# Evaluate the test-set predictions against the true labels.
dt_conf_matrix = confusion_matrix(y_test, dt_y_pred)
dt_accuracy = accuracy_score(y_test, dt_y_pred)

print(f"Decision Tree Accuracy: {dt_accuracy}")
print("Decision Tree Confusion Matrix:", dt_conf_matrix, sep="\n")

Decision Tree Accuracy: 0.9739884393063584
Decision Tree Confusion Matrix:
[[ 76   6   1   0]
 [  1  10   0   0]
 [  0   0 235   0]
 [  1   0   0  16]]


In [15]:
# Random Forest classification: train on the training split, predict the test split.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)  # fit() returns the estimator
rf_y_pred = rf_model.predict(X_test)

# Evaluate the test-set predictions against the true labels.
rf_conf_matrix = confusion_matrix(y_test, rf_y_pred)
rf_accuracy = accuracy_score(y_test, rf_y_pred)

print(f"Random Forest Accuracy: {rf_accuracy}")
print("Random Forest Confusion Matrix:", rf_conf_matrix, sep="\n")

Random Forest Accuracy: 0.9797687861271677
Random Forest Confusion Matrix:
[[ 77   6   0   0]
 [  0  11   0   0]
 [  0   0 235   0]
 [  1   0   0  16]]


In [16]:
# SVM classification: train on the training split, predict the test split.
svm_model = SVC(random_state=42).fit(X_train, y_train)  # fit() returns the estimator
svm_y_pred = svm_model.predict(X_test)

# Evaluate the test-set predictions against the true labels.
svm_conf_matrix = confusion_matrix(y_test, svm_y_pred)
svm_accuracy = accuracy_score(y_test, svm_y_pred)

print(f"SVM Accuracy: {svm_accuracy}")
print("SVM Confusion Matrix:", svm_conf_matrix, sep="\n")

SVM Accuracy: 0.9017341040462428
SVM Confusion Matrix:
[[ 68   5  10   0]
 [  6   4   0   1]
 [ 10   0 225   0]
 [  2   0   0  15]]


In [17]:
# Logistic Regression classification: train on the training split, predict the test split.
# max_iter is set to 1000 iterations for the solver.
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)  # fit() returns the estimator
lr_y_pred = lr_model.predict(X_test)

# Evaluate the test-set predictions against the true labels.
lr_conf_matrix = confusion_matrix(y_test, lr_y_pred)
lr_accuracy = accuracy_score(y_test, lr_y_pred)

print(f"Logistic Regression Accuracy: {lr_accuracy}")
print("Logistic Regression Confusion Matrix:", lr_conf_matrix, sep="\n")

Logistic Regression Accuracy: 0.6589595375722543
Logistic Regression Confusion Matrix:
[[ 11   0  69   3]
 [  2   0   9   0]
 [ 17   0 217   1]
 [ 12   0   5   0]]


In [18]:
# KNN classification (default settings): train on the training split, predict the test split.
knn_model = KNeighborsClassifier().fit(X_train, y_train)  # fit() returns the estimator
knn_y_pred = knn_model.predict(X_test)

# Evaluate the test-set predictions against the true labels.
knn_conf_matrix = confusion_matrix(y_test, knn_y_pred)
knn_accuracy = accuracy_score(y_test, knn_y_pred)

print(f"KNN Accuracy: {knn_accuracy}")
print("KNN Confusion Matrix:", knn_conf_matrix, sep="\n")

KNN Accuracy: 0.9421965317919075
KNN Confusion Matrix:
[[ 74   0   8   1]
 [  4   7   0   0]
 [  1   0 234   0]
 [  6   0   0  11]]
