In [None]:
# DT, RF, SVM, LR, K-NN 
# titanic.csv

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# 0. CSV 파일 불러오기
file = "C:/AI/titanic.csv"  # CSV 파일 경로
df = pd.read_csv(file)

# 1. 데이터 전처리 (X: 특성, y: 타겟)
X = df.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin', 'Embarked'])
y = df["Survived"]
print("나이의 평균값 : ", df['Age'].mean())
X['Age'] = df['Age'].fillna(df['Age'].mean())
X['Sex'] = df['Sex'].replace({'male': 1, 'female': 0})
print(X)

# 2. 데이터 분할 (훈련 80%, 테스트 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. 모델 학습
model_DT = DecisionTreeClassifier() 
model_RF = RandomForestClassifier()
model_SVM = SVC()
model_LR = LogisticRegression()

model_DT.fit(X_train, y_train)
model_RF.fit(X_train, y_train)
model_SVM.fit(X_train, y_train)
model_LR.fit(X_train, y_train)

# 5. 예측 및 정확도 평가
y_pred_DT = model_DT.predict(X_test)
accuracy_DT = accuracy_score(y_test, y_pred_DT)
print(f"1. Decision Tree Accuracy: {accuracy_DT:.4f}")

y_pred_RF = model_RF.predict(X_test)
accuracy_RF = accuracy_score(y_test, y_pred_RF)
print(f"2. Random Forest Accuracy: {accuracy_RF:.4f}")

y_pred_SVM = model_SVM.predict(X_test)
accuracy_SVM = accuracy_score(y_test, y_pred_SVM)
print(f"3. Support Vector Machine Accuracy: {accuracy_SVM:.4f}")

y_pred_LR = model_LR.predict(X_test)
accuracy_LR = accuracy_score(y_test, y_pred_LR)
print(f"4. Logistic Regression Accuracy: {accuracy_LR:.4f}")

# 5. K-NN 모델 학습 (K=2)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)
y_pred_class = knn_clf.predict(X_test)
print("K-NN Classification 예측 결과:", y_pred_class)

accuracy = knn_clf.score(X_test, y_test)
print(f"5. K-NN Classification 정확도: {accuracy:.2f}")

k_values = range(1, 21)  # K값 범위 설정
scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)  # K-NN 모델 생성
    knn.fit(X_train, y_train)  # 학습
    y_pred = knn.predict(X_test)  # 예측
    accuracy = accuracy_score(y_test, y_pred)  # 정확도 계산
    scores.append(accuracy)

# 최적의 K 값 찾기
best_k = k_values[np.argmax(scores)]
print(f"최적의 K 값: {best_k}")

나이의 평균값 :  29.69911764705882
     PassengerId  Pclass  Sex        Age  SibSp  Parch     Fare
0              1       3    1  22.000000      1      0   7.2500
1              2       1    0  38.000000      1      0  71.2833
2              3       3    0  26.000000      0      0   7.9250
3              4       1    0  35.000000      1      0  53.1000
4              5       3    1  35.000000      0      0   8.0500
..           ...     ...  ...        ...    ...    ...      ...
886          887       2    1  27.000000      0      0  13.0000
887          888       1    0  19.000000      0      0  30.0000
888          889       3    0  29.699118      1      2  23.4500
889          890       1    1  26.000000      0      0  30.0000
890          891       3    1  32.000000      0      0   7.7500

[891 rows x 7 columns]


  X['Sex'] = df['Sex'].replace({'male': 1, 'female': 0})
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


1. Decision Tree Accuracy: 0.7039
2. Random Forest Accuracy: 0.7821
3. Support Vector Machine Accuracy: 0.5978
4. Logistic Regression Accuracy: 0.7765
K-NN Classification 예측 결과: [0 0 0 1 0 1 1 0 1 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0
 1 1 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 1 1 1 1 1 0 1 1 0 0 1 0 0 1 1 1 1 0 1
 0 0 1 1 1 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 1
 0 1 0 0 0 0 0 1 1 1 1 1 1 0 1 1 0 1 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 1 0 0
 1 0 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 0 1 0 0 0 1 1 1 0 0 0 1 0]
5. K-NN Classification 정확도: 0.80
최적의 K 값: 1
