In [1]:
# 모듈 import
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# 1. 데이터 로딩
df = pd.read_csv(r"C:\Users\tjdgu\Programing_Study\AI_Introduction\week2\week2_assignment\titanic.csv")

df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# 2. 결측치 확인
print("결측치 확인:")
print(df.isnull().sum())

결측치 확인:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [4]:
# 3. Age 컬럼의 결측치는 평균값으로 대체
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)

df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C


In [5]:
# 4. 레이블 분포 확인 (데이터 불균형 여부 체크)
print("'Survived' 레이블 분포 (인코딩 전):")
print(df['Survived'].value_counts())

'Survived' 레이블 분포 (인코딩 전):
Survived
0    549
1    342
Name: count, dtype: int64


In [6]:
# 5. 분석에 필요 없는 컬럼 제거
columns_to_delete = ['PassengerId', 'Name', 'Ticket', 'Cabin']  # 불필요 컬럼
df = df.drop(columns=columns_to_delete)

In [7]:
# 6. 범주형 변수 인코딩 (숫자 변환)
label_encoder = LabelEncoder()
columns_to_encode = ['Sex', 'Embarked']  # 인코딩할 컬럼

for column in columns_to_encode:
    df[column] = label_encoder.fit_transform(df[column].astype(str))

# 인코딩 후 레이블 다시 확인
print("'Survived' 레이블 분포 (인코딩 후):")
print(df['Survived'].value_counts())

'Survived' 레이블 분포 (인코딩 후):
Survived
0    549
1    342
Name: count, dtype: int64


In [8]:
# 7. 학습 데이터(X), 타겟 레이블(y) 분리
X = df.drop(columns='Survived')
y = df['Survived']

In [9]:
# 8. 학습 데이터와 테스트 데이터로 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
# 9. 분류 모델 정의 (총 5개)
models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=200),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC()
}

In [11]:
# 10. 모델 학습 및 평가
for name, model in models.items():
    model.fit(X_train, y_train)  # 모델 학습
    y_pred = model.predict(X_test)  # 예측 결과
    acc = accuracy_score(y_test, y_pred)  # 정확도
    cm = confusion_matrix(y_test, y_pred)  # 혼동 행렬
    
    # 결과 출력
    print(f"[{name}]")
    print(f"정확도 (Accuracy): {acc}")
    print("Confusion Matrix :")
    print(cm)

[Random Forest]
정확도 (Accuracy): 0.8100558659217877
Confusion Matrix :
[[90 15]
 [19 55]]
[Decision Tree]
정확도 (Accuracy): 0.7821229050279329
Confusion Matrix :
[[86 19]
 [20 54]]
[Logistic Regression]
정확도 (Accuracy): 0.8100558659217877
Confusion Matrix :
[[90 15]
 [19 55]]
[K-Nearest Neighbors]
정확도 (Accuracy): 0.7206703910614525
Confusion Matrix :
[[87 18]
 [32 42]]
[Support Vector Machine]
정확도 (Accuracy): 0.659217877094972
Confusion Matrix :
[[99  6]
 [55 19]]
