In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load the car-evaluation data. The CSV has no header row, so header=None
# keeps the first line as data and labels the columns with integers.
# NOTE(review): this absolute, machine-specific path will not resolve on other
# machines; consider a path relative to the notebook.
csv_path = r"C:\Users\tjdgu\Programing_Study\AI_Introduction\week3\car_evaluation.csv"
df = pd.read_csv(filepath_or_buffer=csv_path, header=None)

df

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [3]:
# The file was read without column names, so assign descriptive ones here.
column_names = ['price', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'acceptability']
df.columns = column_names

df.columns

Index(['price', 'maint', 'doors', 'persons', 'lug_boot', 'safety',
       'acceptability'],
      dtype='object')

In [4]:
# Show the frame again to confirm the new column names are in place.
df

Unnamed: 0,price,maint,doors,persons,lug_boot,safety,acceptability
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [5]:
# Count the missing values in each column.
df.isna().sum()

price            0
maint            0
doors            0
persons          0
lug_boot         0
safety           0
acceptability    0
dtype: int64

In [6]:
# Columns to label-encode (all six attributes plus the target).
columns_to_encode = ['price', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'acceptability']

# Fit a separate LabelEncoder per column and keep each one. Re-fitting a single
# shared encoder would overwrite the learned classes on every iteration, so the
# class <-> code mapping of all but the last column could not be recovered.
# Usage afterwards: label_encoders['safety'].classes_ or
# label_encoders['safety'].inverse_transform(codes).
#
# NOTE(review): LabelEncoder numbers classes in sorted order of their values
# (e.g. 'high' < 'low' < 'med' < 'vhigh'), so the integer codes do not follow
# the categories' natural low-to-high ranking. Consider an explicit ordinal
# mapping for the feature columns if a model should exploit that order.
label_encoders = {}
for column in columns_to_encode:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

# Backward-compatible alias: as before, `label_encoder` ends up being the
# encoder fitted on the last column in the list ('acceptability').
label_encoder = label_encoders[columns_to_encode[-1]]

df

Unnamed: 0,price,maint,doors,persons,lug_boot,safety,acceptability
0,3,3,0,0,2,1,2
1,3,3,0,0,2,2,2
2,3,3,0,0,2,0,2
3,3,3,0,0,1,1,2
4,3,3,0,0,1,2,2
...,...,...,...,...,...,...,...
1723,1,1,3,2,1,2,1
1724,1,1,3,2,1,0,3
1725,1,1,3,2,0,1,2
1726,1,1,3,2,0,2,1


In [7]:
# Feature matrix: every column except the target, as a NumPy array.
X = df.drop(columns=['acceptability']).to_numpy()

# Target vector: the 'acceptability' column on its own.
y = df['acceptability'].to_numpy()

In [8]:
# Inspect the feature array.
X

array([[3, 3, 0, 0, 2, 1],
       [3, 3, 0, 0, 2, 2],
       [3, 3, 0, 0, 2, 0],
       ...,
       [1, 1, 3, 2, 0, 1],
       [1, 1, 3, 2, 0, 2],
       [1, 1, 3, 2, 0, 0]])

In [9]:
# Inspect the target array.
y

array([2, 2, 2, ..., 2, 1, 3])

In [10]:
# Split first (80% train / 20% test) so the test rows stay unseen while the
# scaler is being fitted. random_state is fixed for a reproducible partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Standardize the features. The mean and standard deviation are learned from
# the training rows only and then applied unchanged to the test rows; fitting
# the scaler on the full array would leak test-set statistics into
# preprocessing. X itself is left unscaled.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
# Confirm the split sizes: feature arrays are (n_samples, n_features),
# label arrays are (n_samples,).
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((1382, 6), (346, 6), (1382,), (346,))

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [13]:
# Decision tree classifier: fit on the training split, predict the test split.
dt_model = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
dt_y_pred = dt_model.predict(X_test)

# Accuracy and confusion matrix on the test split
# (y_test is passed as the true labels, dt_y_pred as the predictions).
dt_accuracy = accuracy_score(y_test, dt_y_pred)
dt_conf_matrix = confusion_matrix(y_test, dt_y_pred)

print(
    f"Decision Tree Accuracy: {dt_accuracy}",
    "Decision Tree Confusion Matrix:",
    dt_conf_matrix,
    sep="\n",
)

Decision Tree Accuracy: 0.9682080924855492
Decision Tree Confusion Matrix:
[[ 71   4   4   0]
 [  0  17   0   0]
 [  2   0 238   0]
 [  1   0   0   9]]


In [14]:
# Random forest classifier: fit on the training split, predict the test split.
rf_model = RandomForestClassifier(random_state=0).fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)

# Accuracy and confusion matrix on the test split
# (y_test is passed as the true labels, rf_y_pred as the predictions).
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_conf_matrix = confusion_matrix(y_test, rf_y_pred)

print(
    f"Random Forest Accuracy: {rf_accuracy}",
    "Random Forest Confusion Matrix:",
    rf_conf_matrix,
    sep="\n",
)

Random Forest Accuracy: 0.9653179190751445
Random Forest Confusion Matrix:
[[ 73   4   1   1]
 [  1  13   0   3]
 [  1   0 239   0]
 [  1   0   0   9]]


In [15]:
# Support vector classifier: fit on the training split, predict the test split.
svm_model = SVC(random_state=0).fit(X_train, y_train)
svm_y_pred = svm_model.predict(X_test)

# Accuracy and confusion matrix on the test split
# (y_test is passed as the true labels, svm_y_pred as the predictions).
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_conf_matrix = confusion_matrix(y_test, svm_y_pred)

print(
    f"SVM Accuracy: {svm_accuracy}",
    "SVM Confusion Matrix:",
    svm_conf_matrix,
    sep="\n",
)

SVM Accuracy: 0.9132947976878613
SVM Confusion Matrix:
[[ 73   1   4   1]
 [  7   8   0   2]
 [ 12   0 228   0]
 [  3   0   0   7]]


In [16]:
# Logistic regression classifier: fit on the training split, predict the test
# split. max_iter is set to 1000 for the solver.
lr_model = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)
lr_y_pred = lr_model.predict(X_test)

# Accuracy and confusion matrix on the test split
# (y_test is passed as the true labels, lr_y_pred as the predictions).
lr_accuracy = accuracy_score(y_test, lr_y_pred)
lr_conf_matrix = confusion_matrix(y_test, lr_y_pred)

print(
    f"Logistic Regression Accuracy: {lr_accuracy}",
    "Logistic Regression Confusion Matrix:",
    lr_conf_matrix,
    sep="\n",
)

Logistic Regression Accuracy: 0.6502890173410405
Logistic Regression Confusion Matrix:
[[  9   0  67   3]
 [  1   0  16   0]
 [ 22   0 214   4]
 [  2   0   6   2]]
