In [70]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix



In [71]:
# 1. 데이터 로딩
file_path = "titanic.csv"  
df = pd.read_csv(file_path)

In [72]:
# 2. 결측치 확인
print("결측치 확인:")
print(df.isnull().sum())

결측치 확인:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [73]:
# 3. 결측치 처리 (Age는 평균으로, Cabin과 Embarked는 제거)
df['Age'] = df['Age'].fillna(df['Age'].mean())  # inplace=True 제거

df = df.drop(columns=['Cabin', 'Embarked'], errors='ignore')  # inplace=True 제거


In [74]:
# 4. 레이블 확인 (imbalanced data)
print("\n레이블 확인 (변환 전):")
print(df['Survived'].value_counts())



레이블 확인 (변환 전):
Survived
0    549
1    342
Name: count, dtype: int64


In [75]:
# 5. 불필요한 컬럼 제거
df.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True, errors='ignore')

In [76]:
# 6. 엔코딩 (숫자로 변환)
label_enc = LabelEncoder()
df['Sex'] = label_enc.fit_transform(df['Sex'])


In [77]:
# 변환 후 레이블 확인
print("\n레이블 확인 (변환 후):")
print(df['Survived'].value_counts())

# 데이터 분할
X = df.drop(columns=['Survived'])
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


레이블 확인 (변환 후):
Survived
0    549
1    342
Name: count, dtype: int64


In [78]:
# 모델 정의
models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC()
}


In [79]:
# 모델 학습 및 평가
results = []
conf_matrices = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    results.append({"Model": name, "Accuracy": acc})
    conf_matrices[name] = conf_matrix


In [80]:
# 결과 출력
results_df = pd.DataFrame(results)
print("\n모델 정확도 결과:")
print(results_df)


모델 정확도 결과:
                 Model  Accuracy
0        Random Forest  0.787709
1        Decision Tree  0.754190
2  Logistic Regression  0.810056
3                  KNN  0.698324
4                  SVM  0.653631
