#### 머신러닝 classification 순서 ####
데이터 불러오기 -> 데이터 전처리(결측치 확인, 불필요 열 제거 / 레이블 분포 확인(인코딩 전) / 인코딩 / 레이블 분포 확인(인코딩 후) / 데이터 X, y로 나누기)
-> 데이터분할 -> 훈련 데이터 정규화 -> 모델 선택 및 학습 -> 모델 예측 -> 모델 평가

In [1]:
# 모듈 import

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the iris dataset from a local CSV file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only
# runs as-is on a machine where this file exists.
file_path = r"C:\Users\tjdgu\CodingFiles\AI_Introduction\week1\iris.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Bare expression so the notebook renders the DataFrame as the cell output.
df

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
# Count the missing values in each column.
df.isna().sum()

SepalLength    0
SepalWidth     0
PetalLength    0
PetalWidth     0
Name           0
dtype: int64

In [4]:
# Class distribution of the 'Name' label column (before encoding).
df.loc[:, 'Name'].value_counts()

Name
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

In [5]:
# Encode the string class labels as integers.
#
# Each column gets its own fitted LabelEncoder, stored by column name in
# label_encoders, so the integer codes of any encoded column can be mapped
# back with label_encoders[column].inverse_transform(...). Re-fitting a single
# shared encoder inside the loop would keep only the mapping of the last
# column once more than one column is listed.
columns_to_encode = ['Name']  # columns to encode
label_encoders = {}

for column in columns_to_encode:
    # After the loop, label_encoder still refers to the encoder fitted on the
    # last column, which is the same final binding the shared encoder had.
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

In [6]:
# Class distribution of the 'Name' label column (after encoding).
df.loc[:, 'Name'].value_counts()

Name
0    50
1    50
2    50
Name: count, dtype: int64

In [7]:
# Separate the frame into features and target:
#   X - every column except 'Name'
#   y - the 'Name' column only
X, y = df.drop(columns='Name'), df['Name']

In [8]:
# Preview the first five rows of the feature frame.
X.iloc[:5]

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [9]:
# Preview the first five entries of the target series.
y.iloc[:5]

0    0
1    0
2    0
3    0
4    0
Name: Name, dtype: int32

In [10]:
# Split into training and test sets at a 5:5 ratio, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
# Inspect the shapes of the training and test splits.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((75, 4), (75, 4), (75,), (75,))

In [12]:
# Decision Tree classifier: train on the training split, predict the test
# split, then report accuracy and the confusion matrix.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_y_pred = dt_model.predict(X_test)

dt_accuracy = accuracy_score(y_true=y_test, y_pred=dt_y_pred)
dt_conf_matrix = confusion_matrix(y_true=y_test, y_pred=dt_y_pred)

print(
    f"Decision Tree Accuracy: {dt_accuracy}",
    "Decision Tree Confusion Matrix:",
    dt_conf_matrix,
    sep="\n",
)

Decision Tree Accuracy: 0.9066666666666666
Decision Tree Confusion Matrix:
[[29  0  0]
 [ 0 20  3]
 [ 0  4 19]]


In [13]:
# Random Forest classifier: train on the training split, predict the test
# split, then report accuracy and the confusion matrix.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)

rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_y_pred)
rf_conf_matrix = confusion_matrix(y_true=y_test, y_pred=rf_y_pred)

print(
    f"Random Forest Accuracy: {rf_accuracy}",
    "Random Forest Confusion Matrix:",
    rf_conf_matrix,
    sep="\n",
)

Random Forest Accuracy: 0.9866666666666667
Random Forest Confusion Matrix:
[[29  0  0]
 [ 0 23  0]
 [ 0  1 22]]


In [14]:
# SVM classifier: train on the training split, predict the test split, then
# report accuracy and the confusion matrix.
svm_model = SVC(random_state=42).fit(X_train, y_train)
svm_y_pred = svm_model.predict(X_test)

svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_y_pred)
svm_conf_matrix = confusion_matrix(y_true=y_test, y_pred=svm_y_pred)

print(
    f"SVM Accuracy: {svm_accuracy}",
    "SVM Confusion Matrix:",
    svm_conf_matrix,
    sep="\n",
)

SVM Accuracy: 0.9866666666666667
SVM Confusion Matrix:
[[29  0  0]
 [ 0 23  0]
 [ 0  1 22]]


In [15]:
# Logistic Regression classifier: train on the training split, predict the
# test split, then report accuracy and the confusion matrix.
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)
lr_y_pred = lr_model.predict(X_test)

lr_accuracy = accuracy_score(y_true=y_test, y_pred=lr_y_pred)
lr_conf_matrix = confusion_matrix(y_true=y_test, y_pred=lr_y_pred)

print(
    f"Logistic Regression Accuracy: {lr_accuracy}",
    "Logistic Regression Confusion Matrix:",
    lr_conf_matrix,
    sep="\n",
)

Logistic Regression Accuracy: 0.9866666666666667
Logistic Regression Confusion Matrix:
[[29  0  0]
 [ 0 23  0]
 [ 0  1 22]]
