## ※ Q. 타이타닉 생존자 예측 데이터 세트 train.csv에 대하여 다음 사항을 수행하세요.
- 일괄 전처리 사용자 함수 transform_features(df) 작성
- dt(DecisionTreeClassifier), lr(LogisticRegression), rf(RandomForestClassifier) 모델링 및 평가(정확도)
- dt_clf에 folds=5를 적용하여 KFold 교차검증 수행
- dt_clf에 cv=5를 적용하고 cross_val_score를 이용하여 교차검증 수행
- GridSearchCV의 최적 하이퍼 파라미터로 학습된 Estimator로 예측 및 평가 수행.
  - parameters = {'max_depth':[2,3,5,10], 'min_samples_split':[2,3,5], 'min_samples_leaf':[1,5,8]}
  - dt_clf, scoring='accuracy', cv=5 적용




In [3]:
# Read train.csv into a DataFrame and echo it as the cell's output.
# NOTE(review): `pd` is used here before the `import pandas as pd` that appears
# further down; presumably it was imported in an earlier notebook cell — confirm.
titanic_df = pd.read_csv('/content/drive/MyDrive/welcome_to_my_hell/m3통계/머신러닝/train.csv')

titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the raw data; every later step starts from this frame.
titanic_df = pd.read_csv('/content/drive/MyDrive/welcome_to_my_hell/m3통계/머신러닝/train.csv')



# Bulk preprocessing function
def transform_features(df):
    """Return a model-ready copy of the Titanic frame.

    Steps:
      1. Drop the string columns ``Name``, ``Cabin``, ``Ticket`` and
         ``Embarked`` (columns that are already absent are skipped).
      2. Encode ``Sex`` as male -> 0, female -> 1. Any other value maps to
         NaN. The step is skipped when ``Sex`` is missing or already numeric.
      3. Fill NaN in every numeric column with that column's mean.

    The input frame is not modified: ``drop`` returns a new frame and all
    later assignments are made on that new frame. Because steps 1 and 2 are
    guarded, calling the function again on its own output returns the same
    data instead of raising ``KeyError`` or wiping out ``Sex``.

    NOTE(review): identifier-like columns such as ``PassengerId`` are kept
    and therefore end up as model features — confirm that this is intended.

    Args:
        df: DataFrame holding the raw (or already transformed) passenger data.

    Returns:
        A new DataFrame with the columns above dropped, ``Sex`` encoded and
        numeric NaN values replaced by column means.
    """
    # errors='ignore' keeps the call safe when some columns were already removed.
    df = df.drop(columns=['Name', 'Cabin', 'Ticket', 'Embarked'], errors='ignore')

    # Mapping an already-encoded (numeric) column would turn every value into
    # NaN, so only map while the column still holds the raw labels.
    if 'Sex' in df.columns and not pd.api.types.is_numeric_dtype(df['Sex']):
        df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

    # Restrict the mean imputation to genuinely numeric columns so that
    # mean() is never attempted on a dtype that does not support it.
    df_numeric = df.select_dtypes(include='number')
    df[df_numeric.columns] = df_numeric.fillna(df_numeric.mean())

    return df

# Shared fit / predict / score helper used for the Decision Tree,
# Logistic Regression and Random Forest runs.
def modeling_eval(model, train_features, train_target, test_features, test_target):
    """Fit ``model`` on the training split and print its test-split accuracy.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        train_features: Feature rows used for fitting.
        train_target: Labels matching ``train_features``.
        test_features: Feature rows to predict on.
        test_target: True labels matching ``test_features``.

    Returns:
        None. The accuracy is only printed, formatted to four decimals.
    """
    model.fit(train_features, train_target)

    # Score the held-out predictions directly against the true labels.
    test_accuracy = accuracy_score(test_target, model.predict(test_features))
    print(f'Model 정확도: {test_accuracy:.4f}')

# Run the preprocessing; the raw frame is replaced by its transformed version.
titanic_df = transform_features(titanic_df)

# Separate the feature columns from the label column ('Survived').
X_features = titanic_df.drop('Survived', axis=1)
y_target = titanic_df['Survived']

# Hold out 20% of the rows as a test set; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.2, random_state=1)

# Decision Tree: fit and evaluate.
# dt_clf is reused below for the KFold, cross_val_score and GridSearchCV steps.
dt_clf = DecisionTreeClassifier(random_state=1)
print('Decision Tree 모델 평가:')
modeling_eval(dt_clf, X_train, y_train, X_test, y_test)

# Logistic Regression: fit and evaluate.
lr_clf = LogisticRegression(random_state=1, max_iter=1000)
print('\nLogistic Regression 모델 평가:')
modeling_eval(lr_clf, X_train, y_train, X_test, y_test)

# Random Forest: fit and evaluate.
rf_clf = RandomForestClassifier(random_state=1)
print('\nRandom Forest 모델 평가:')
modeling_eval(rf_clf, X_train, y_train, X_test, y_test)

# Manual KFold cross-validation
def kfold_cross_validation(model, features, target, folds=5):
    """Run KFold cross-validation by hand and print the mean accuracy.

    For each of the ``folds`` splits the model is refitted on the training
    rows and scored on the held-out rows. The splitter is created with only
    ``n_splits`` set, so no shuffle or seed is requested here.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is refitted in
            place once per fold.
        features: Feature table supporting positional ``.iloc`` indexing.
        target: Labels aligned with ``features``, also indexed via ``.iloc``.
        folds: Number of splits.

    Returns:
        None. The mean accuracy is only printed, formatted to four decimals.
    """
    splitter = KFold(n_splits=folds)
    fold_scores = []

    for fit_rows, eval_rows in splitter.split(features):
        model.fit(features.iloc[fit_rows], target.iloc[fit_rows])
        fold_pred = model.predict(features.iloc[eval_rows])
        fold_scores.append(accuracy_score(target.iloc[eval_rows], fold_pred))

    print(f'{folds}-fold Cross Validation 정확도: {np.mean(fold_scores):.4f}')

# Run the manual KFold cross-validation for the decision tree.
# It uses the full feature/label set (X_features, y_target), not only the
# training split, and refits dt_clf in place on every fold.
print('\nDecision Tree KFold Cross Validation:')
kfold_cross_validation(dt_clf, X_features, y_target, folds=5)

# Cross-validation through sklearn's cross_val_score
def cross_val_score_eval(model, features, target, cv=5):
    """Print the mean of the per-fold scores returned by ``cross_val_score``.

    No ``scoring`` argument is passed, so the scorer is whatever
    ``cross_val_score`` uses by default for the given estimator.

    Args:
        model: Estimator handed to ``cross_val_score``.
        features: Feature table.
        target: Labels aligned with ``features``.
        cv: Value forwarded as ``cross_val_score``'s ``cv`` argument.

    Returns:
        None. The mean score is only printed, formatted to four decimals.
    """
    fold_scores = cross_val_score(model, features, target, cv=cv)
    print(f'{cv}-fold cross_val_score 평균 정확도: {np.mean(fold_scores):.4f}')

# Run cross_val_score-based cross-validation for the decision tree on the
# full feature/label set.
print('\nDecision Tree cross_val_score:')
cross_val_score_eval(dt_clf, X_features, y_target, cv=5)

# Hyper-parameter tuning with GridSearchCV.
# The grid has 4 * 3 * 3 = 36 parameter combinations, each evaluated with
# 5-fold CV on the training split only, scored by accuracy.
parameters = {'max_depth': [2, 3, 5, 10], 'min_samples_split': [2, 3, 5], 'min_samples_leaf': [1, 5, 8]}
grid_dt = GridSearchCV(dt_clf, param_grid=parameters, scoring='accuracy', cv=5)
grid_dt.fit(X_train, y_train)

# Predict and evaluate on the held-out test split with the best estimator.
# best_estimator_ is available because `refit` is left at its default —
# see the GridSearchCV documentation.
best_dt = grid_dt.best_estimator_
pred = best_dt.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print(f'GridSearchCV 최적 하이퍼파라미터로 학습된 Estimator 정확도: {accuracy:.4f}')


Decision Tree 모델 평가:
Model 정확도: 0.7542

Logistic Regression 모델 평가:
Model 정확도: 0.7877

Random Forest 모델 평가:
Model 정확도: 0.7765

Decision Tree KFold Cross Validation:
5-fold Cross Validation 정확도: 0.6936

Decision Tree cross_val_score:
5-fold cross_val_score 평균 정확도: 0.7611
GridSearchCV 최적 하이퍼파라미터로 학습된 Estimator 정확도: 0.8045
