In [65]:
import requests
import zipfile
import io
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler


# 1️⃣ ZIP 파일 다운로드 및 압축 해제
url = "https://github.com/MyungKyuYi/AI-class/raw/main/archive.zip"
response = requests.get(url)

if response.status_code == 200:
    zip_data = io.BytesIO(response.content)  # 메모리 버퍼에 저장
    with zipfile.ZipFile(zip_data, "r") as zip_ref:
        zip_ref.extractall("./unzipped_files")  # 압축 해제
    print("ZIP 파일 다운로드 및 압축 해제 완료!")
else:
    print("파일 다운로드 실패. 상태 코드:", response.status_code)

# 2️⃣ CSV 파일 로드 (해당 ZIP 파일 안의 파일명을 확인해야 함)
csv_path = "./unzipped_files/car_evaluation.csv"  # 압축 해제된 파일명을 지정
df = pd.read_csv(csv_path, header=None)

# 3️⃣ 데이터 확인
display(df)  # 데이터 일부 출력
print(df.columns)  # 컬럼명 확인

ZIP 파일 다운로드 및 압축 해제 완료!


Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


Index([0, 1, 2, 3, 4, 5, 6], dtype='int64')


In [66]:
df.columns=['price', 'maint', 'doors', 'persons', 'lug_capacity', 'safety',
       'output']
display(df)

Unnamed: 0,price,maint,doors,persons,lug_capacity,safety,output
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [67]:
df.isnull().sum() # 결측치 확인

price           0
maint           0
doors           0
persons         0
lug_capacity    0
safety          0
output          0
dtype: int64

In [68]:
# 2️⃣ 모든 문자형 컬럼을 Label Encoding
label_encoders = {}  # 각 컬럼별 인코더 저장
for col in df.columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])  # 문자열을 숫자로 변환
    label_encoders[col] = le  # 변환기 저장 (필요하면 나중에 역변환 가능)

display(df)

Unnamed: 0,price,maint,doors,persons,lug_capacity,safety,output
0,3,3,0,0,2,1,2
1,3,3,0,0,2,2,2
2,3,3,0,0,2,0,2
3,3,3,0,0,1,1,2
4,3,3,0,0,1,2,2
...,...,...,...,...,...,...,...
1723,1,1,3,2,1,2,1
1724,1,1,3,2,1,0,3
1725,1,1,3,2,0,1,2
1726,1,1,3,2,0,2,1


In [69]:
df['output'].value_counts() # 레이블 확인

output
2    1210
0     384
1      69
3      65
Name: count, dtype: int64

In [71]:
# 4️⃣ 마지막 열을 레이블(y), 나머지를 특징(X)으로 설정
X = df.drop(columns=["output"])  # 특징 데이터 (output 제외)
y = df["output"]  # 레이블 (output)

In [72]:
X

Unnamed: 0,price,maint,doors,persons,lug_capacity,safety
0,3,3,0,0,2,1
1,3,3,0,0,2,2
2,3,3,0,0,2,0
3,3,3,0,0,1,1
4,3,3,0,0,1,2
...,...,...,...,...,...,...
1723,1,1,3,2,1,2
1724,1,1,3,2,1,0
1725,1,1,3,2,0,1
1726,1,1,3,2,0,2


In [73]:
y

0       2
1       2
2       2
3       2
4       2
       ..
1723    1
1724    3
1725    2
1726    1
1727    3
Name: output, Length: 1728, dtype: int64

In [74]:
# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 5️⃣ 훈련 데이터와 테스트 데이터로 분할 (8:2 비율)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1382, 6) (1382,)
(346, 6) (346,)


In [75]:
# (1) 결정 트리(Decision Tree, DT)
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print (confusion_matrix(y_test, y_pred_dt))

Decision Tree Accuracy: 0.9739884393063584
[[ 76   6   1   0]
 [  1  10   0   0]
 [  0   0 235   0]
 [  1   0   0  16]]


In [76]:
# (2) 랜덤 포레스트(Random Forest, RF)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print (confusion_matrix(y_test, y_pred_rf))

Random Forest Accuracy: 0.976878612716763
[[ 76   7   0   0]
 [  0  11   0   0]
 [  0   0 235   0]
 [  1   0   0  16]]


In [77]:
# (3) 서포트 벡터 머신(Support Vector Machine, SVM)
svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print (confusion_matrix(y_test, y_pred_svm))

SVM Accuracy: 0.6965317919075145
[[ 10   0  73   0]
 [  0   0  11   0]
 [  4   0 231   0]
 [  7   0  10   0]]


In [78]:
# (4) 로지스틱 회귀(Logistic Regression, LR)
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print (confusion_matrix(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.661849710982659
[[ 12   0  68   3]
 [  2   0   9   0]
 [ 17   0 217   1]
 [ 12   0   5   0]]
