#### 머신러닝 classification 순서 ####
데이터 불러오기 -> 데이터 전처리(결측치 처리, 불필요 열 제거 / 레이블 분포 확인 / 인코딩 / 데이터를 X, y로 나누기)
-> 데이터 분할 -> 훈련 데이터 정규화 -> 모델 선택 및 학습 -> 모델 예측 -> 모델 평가

In [1]:
# 모듈 import

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the Titanic CSV file into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running the notebook on another machine.
file_path = r"C:\Users\tjdgu\CodingFiles\AI_Introduction\week2\week2_assignment\titanic.csv"

df = pd.read_csv(filepath_or_buffer=file_path)

# Display the loaded table.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# Remove the columns that are not used in the analysis.
columns_to_delete = ['PassengerId', 'Name', 'Ticket', 'Cabin']  # unused columns
df = df.drop(labels=columns_to_delete, axis=1)

# Display the reduced table.
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [4]:
# Count the missing values in each column.
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [None]:
# Impute missing values:
#   - Age      -> mean of the column (missing entries are ignored by mean())
#   - Embarked -> most frequent value of the column
# NOTE(review): both statistics are computed on the whole table, i.e. before
# the train/test split performed later in the notebook.
mean_age = df['Age'].mean()
most_common_embarked = df['Embarked'].mode()[0]

# Fill both columns in one call; columns not named in the mapping are untouched.
df = df.fillna({'Age': mean_age, 'Embarked': most_common_embarked})

# Re-check that no missing values remain.
df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [6]:
# Inspect how the target label 'Survived' is distributed across its classes.
df.loc[:, 'Survived'].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [7]:
# Encode the categorical text columns as integer codes.
#
# Each column gets its own LabelEncoder. Refitting a single shared encoder in
# the loop would overwrite the mapping learned for the earlier column(s), so
# only the last column's codes could be mapped back to their original values.
columns_to_encode = ['Sex', 'Embarked']  # columns to encode
label_encoders = {}  # column name -> encoder fitted on that column

for column in columns_to_encode:
    # After the loop, `label_encoder` refers to the encoder of the last column.
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

# Display the encoded table.
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.000000,1,0,7.2500,2
1,1,1,0,38.000000,1,0,71.2833,0
2,1,3,0,26.000000,0,0,7.9250,2
3,1,1,0,35.000000,1,0,53.1000,2
4,0,3,1,35.000000,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,13.0000,2
887,1,1,0,19.000000,0,0,30.0000,2
888,0,3,0,29.699118,1,2,23.4500,2
889,1,1,1,26.000000,0,0,30.0000,0


In [8]:
# Separate the target from the features:
#   y -> the 'Survived' column
#   X -> every remaining column
y = df['Survived']
X = df.drop(columns=['Survived'])

In [9]:
# Preview the first five rows of the feature table.
X.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,2
3,1,0,35.0,1,0,53.1,2
4,3,1,35.0,0,0,8.05,2


In [10]:
# Preview the first five target values.
y.head(n=5)

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [11]:
# Split into training and test sets at an 80:20 ratio with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training split only,
# and that same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
# Show the shapes of the training/test features and labels.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((712, 7), (179, 7), (712,), (179,))

In [13]:
# Decision Tree: fit on the training split, predict the test split, and
# report the accuracy and confusion matrix.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_y_pred = dt_model.predict(X_test)

dt_accuracy = accuracy_score(y_test, dt_y_pred)
dt_conf_matrix = confusion_matrix(y_test, dt_y_pred)

print(f"Decision Tree Accuracy: {dt_accuracy}")
print("Decision Tree Confusion Matrix:", dt_conf_matrix, sep="\n")

Decision Tree Accuracy: 0.7877094972067039
Decision Tree Confusion Matrix:
[[84 21]
 [17 57]]


In [14]:
# Random Forest: fit on the training split, predict the test split, and
# report the accuracy and confusion matrix.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_conf_matrix = confusion_matrix(y_test, rf_y_pred)

print(f"Random Forest Accuracy: {rf_accuracy}")
print("Random Forest Confusion Matrix:", rf_conf_matrix, sep="\n")

Random Forest Accuracy: 0.8156424581005587
Random Forest Confusion Matrix:
[[91 14]
 [19 55]]


In [15]:
# SVM: fit on the training split, predict the test split, and report the
# accuracy and confusion matrix.
svm_model = SVC(random_state=42).fit(X_train, y_train)
svm_y_pred = svm_model.predict(X_test)

svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_conf_matrix = confusion_matrix(y_test, svm_y_pred)

print(f"SVM Accuracy: {svm_accuracy}")
print("SVM Confusion Matrix:", svm_conf_matrix, sep="\n")

SVM Accuracy: 0.8156424581005587
SVM Confusion Matrix:
[[93 12]
 [21 53]]


In [16]:
# Logistic Regression: fit on the training split (iteration cap raised to
# 1000), predict the test split, and report the accuracy and confusion matrix.
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)
lr_y_pred = lr_model.predict(X_test)

lr_accuracy = accuracy_score(y_test, lr_y_pred)
lr_conf_matrix = confusion_matrix(y_test, lr_y_pred)

print(f"Logistic Regression Accuracy: {lr_accuracy}")
print("Logistic Regression Confusion Matrix:", lr_conf_matrix, sep="\n")

Logistic Regression Accuracy: 0.8100558659217877
Logistic Regression Confusion Matrix:
[[90 15]
 [19 55]]


In [17]:
# K-Nearest Neighbors (default settings): fit on the training split, predict
# the test split, and report the accuracy and confusion matrix.
knn_model = KNeighborsClassifier().fit(X_train, y_train)
knn_y_pred = knn_model.predict(X_test)

knn_accuracy = accuracy_score(y_test, knn_y_pred)
knn_conf_matrix = confusion_matrix(y_test, knn_y_pred)

print(f"KNN Accuracy: {knn_accuracy}")
print("KNN Confusion Matrix:", knn_conf_matrix, sep="\n")

KNN Accuracy: 0.7988826815642458
KNN Confusion Matrix:
[[90 15]
 [21 53]]
