# In [None]:

import pandas as pd
import numpy as np

# Load the training and test splits from CSV files in the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Report the (rows, columns) shape of each split.
for frame in (train_df, test_df):
    print(frame.shape)

# Let pandas show up to 50 columns and 50 rows before truncating output.
for option in ("display.max_columns", "display.max_rows"):
    pd.set_option(option, 50)

# First ten rows of each split.
train_df.head(10)
test_df.head(10)

# Column dtypes of the training split.
train_df.dtypes

# Summary statistics of each split.
train_df.describe()
test_df.describe()

# Value frequencies of the categorical columns.
train_df["Sex"].value_counts()
train_df["Embarked"].value_counts()
train_df["Cabin"].value_counts()

# Number of missing values per column.
train_df.isnull().sum()
test_df.isnull().sum()

import matplotlib.pyplot as plt
import seaborn as sns

# Use the ggplot look for the figures drawn below.
plt.style.use("ggplot")

# Passenger counts per embarkation port, split by outcome. After unstack()
# the columns follow the sorted Survived values: position 0 is Survived == 0
# (deaths) and position 1 is Survived == 1 (survivors).
embarked_df = (
    train_df[["Embarked", "Survived", "PassengerId"]]
    .dropna()
    .groupby(["Embarked", "Survived"])
    .count()
    .unstack()
)
embarked_df.plot.bar(stacked=True)

# Survival rate per port = survivors / (deaths + survivors).
# The numerator has to be the Survived == 1 column; dividing the
# Survived == 0 column by the total would give the death rate instead.
embarked_died = embarked_df.iloc[:, 0]
embarked_survived = embarked_df.iloc[:, 1]
embarked_df["survived_rate"] = embarked_survived / (embarked_died + embarked_survived)

# Passenger counts by sex and outcome, drawn as stacked bars.
sex_df = (
    train_df[["Sex", "Survived", "PassengerId"]]
    .dropna()
    .groupby(["Sex", "Survived"])
    .count()
    .unstack()
)
sex_df.plot.bar(stacked=True)

# Passenger counts by exact age and outcome, drawn as stacked bars.
age_df = (
    train_df[["Age", "Survived", "PassengerId"]]
    .dropna()
    .groupby(["Age", "Survived"])
    .count()
    .unstack()
)
age_df.plot.bar(stacked=True)

# Age distribution in 8 bins, with the two outcomes stacked on each other.
age_of_dead = train_df.loc[train_df["Survived"] == 0, "Age"]
age_of_survivors = train_df.loc[train_df["Survived"] == 1, "Age"]
plt.hist(
    x=[age_of_dead, age_of_survivors],
    bins=8,
    histtype="barstacked",
    label=["Death", "Survived"],
)
plt.legend()

# One-hot encode Sex (dropping the first level) and then Embarked (all levels).
train_df_corr = pd.get_dummies(
    train_df, columns=["Sex"], drop_first=True
).pipe(pd.get_dummies, columns=["Embarked"])

# Pairwise correlations over the numeric and boolean columns only.
numeric_part = train_df_corr.select_dtypes(include=["number", "bool"])
train_corr = numeric_part.corr()

# Annotated heatmap on a fixed [-1, 1] colour scale centred at 0.
plt.figure(figsize=(9, 9))
sns.heatmap(train_corr, vmax=1, vmin=-1, center=0, annot=True)

# Stack train and test into one frame with a fresh 0..n-1 index.
all_df = pd.concat([train_df, test_df], sort=False, ignore_index=True)
all_df.isnull().sum()

# Mean fare per passenger class, kept as a two-column lookup table.
Fare_mean = all_df.groupby("Pclass", as_index=False)["Fare"].mean()
Fare_mean.columns = ["Pclass", "Fare_mean"]

# Replace each missing Fare with the mean fare of that row's Pclass.
class_fare = all_df["Pclass"].map(Fare_mean.set_index("Pclass")["Fare_mean"])
all_df["Fare"] = all_df["Fare"].fillna(class_fare)

# Split Name on the first two "," / "." separators into three parts:
# family name, title (honorific) and the remaining name.
name_df = all_df["Name"].str.split("[,.]", n=2, expand=True)
name_df.columns = ["family_name", "honorific", "name"]

# Trim surrounding whitespace from every part.
for part in name_df.columns:
    name_df[part] = name_df[part].str.strip()

# Attach the name parts to the combined frame as extra columns.
all_df = pd.concat([all_df, name_df], axis=1)

# Age distribution per title.
plt.figure(figsize=(18, 5))
sns.boxplot(x="honorific", y="Age", data=all_df)

# Attach the name parts back onto the separate train / test frames.
# name_df follows the row order of all_df (train rows first, then test rows),
# so the first n_train rows belong to train_df and the rest to test_df.
n_train = len(train_df)
train_df = pd.concat(
    [train_df, name_df.iloc[:n_train].reset_index(drop=True)], axis=1
)
# Both axis=1 and reset_index are required here: without axis=1 the name rows
# are appended below the test rows, and without resetting the index the
# column-wise concat would not line up with test_df's 0-based index.
test_df = pd.concat(
    [test_df, name_df.iloc[n_train:].reset_index(drop=True)], axis=1
)

# Passenger counts by title and outcome, drawn as stacked bars.
honorific_df = (
    train_df[["honorific", "Survived", "PassengerId"]]
    .dropna()
    .groupby(["honorific", "Survived"])
    .count()
    .unstack()
)
honorific_df.plot.bar(stacked=True)

# Mean age per title, kept as a two-column lookup table.
honorific_age_mean = all_df.groupby("honorific", as_index=False)["Age"].mean()
honorific_age_mean.columns = ["honorific", "honorific_Age"]

# Replace each missing Age with the mean age of that row's title.
title_age = all_df["honorific"].map(
    honorific_age_mean.set_index("honorific")["honorific_Age"]
)
all_df["Age"] = all_df["Age"].fillna(title_age)

# Drop the columns that are not used as features.
all_df = all_df.drop(
    columns=["PassengerId", "Name", "family_name", "name", "Ticket", "Cabin"]
)

# Columns that hold text and therefore need integer encoding. Selecting
# "everything that is not numeric/bool" avoids comparing dtypes against
# "object", which would miss text columns stored in a dedicated string dtype.
categories = all_df.select_dtypes(exclude=["number", "bool"]).columns

# Collapse rare titles into "other". The four frequent titles are kept as
# their own categories; "Mrs" is one of them and must not be merged away.
common_titles = ["Mr", "Miss", "Mrs", "Master"]
all_df.loc[~all_df["honorific"].isin(common_titles), "honorific"] = "other"

# Fill missing embarkation ports. The result is assigned back to the column
# because fillna(..., inplace=True) on a selected column is chained assignment:
# it is deprecated and leaves all_df unchanged under pandas Copy-on-Write.
all_df["Embarked"] = all_df["Embarked"].fillna("missing")

# Label-encode every text column with its own encoder.
from sklearn.preprocessing import LabelEncoder
for cat in categories:
    le = LabelEncoder()
    all_df[cat] = le.fit_transform(all_df[cat])

# Rows with a Survived label are training rows; rows without one are the
# rows to predict. Survived itself is removed from both feature frames.
is_labelled = all_df["Survived"].notnull()
train_X = all_df[is_labelled].drop(columns="Survived").reset_index(drop=True)
train_Y = train_df["Survived"]
test_X = all_df[~is_labelled].drop(columns="Survived").reset_index(drop=True)

# 모델 학습 (LightGBM)
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score

# Binary-classification parameters; the seed is fixed for reproducibility.
lgbm_params = {"objective": "binary", "random_seed": 1234}

# Hold out 20% of the labelled rows for validation. The split reuses the model
# seed: without random_state the split is reshuffled on every run, so the
# validation score is not reproducible even though the model seed is fixed.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_X, train_Y, test_size=0.2, random_state=lgbm_params["random_seed"]
)

# Columns LightGBM should treat as categorical rather than ordinal.
categories = ["Embarked", "Pclass", "Sex", "honorific"]
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categories)
lgb_eval = lgb.Dataset(X_valid, y_valid, categorical_feature=categories, reference=lgb_train)

# Train for at most 100 rounds, stopping once the validation metric has not
# improved for 20 rounds; the metric is logged every 10 rounds.
model_lgb = lgb.train(lgbm_params, lgb_train, num_boost_round=100, valid_sets=[lgb_eval],
                      callbacks=[lgb.early_stopping(20), lgb.log_evaluation(10)])

# Feature importances of the trained model, least important first, drawn as
# horizontal bars.
importance = pd.DataFrame(
    model_lgb.feature_importance(),
    index=X_train.columns,
    columns=["importance"],
)
importance = importance.sort_values(by="importance", ascending=True)
importance.plot.barh()

# Predict on the hold-out rows at the best iteration, round the predictions
# to 0/1 and score them against the true labels.
y_pred = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
accuracy_score(y_valid, np.round(y_pred))

# K-fold cross-validation: train one model per fold and keep all of them.
folds = 3
kf = KFold(n_splits=folds)
accuracy_list = []
models = []

for train_index, val_index in kf.split(train_X):
    # Positional slices of features and labels for this fold.
    X_train, X_valid = train_X.iloc[train_index], train_X.iloc[val_index]
    y_train, y_valid = train_Y.iloc[train_index], train_Y.iloc[val_index]

    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categories)
    lgb_eval = lgb.Dataset(
        X_valid, y_valid, categorical_feature=categories, reference=lgb_train
    )

    # Same training setup as the hold-out run: at most 100 rounds, early
    # stopping after 20 rounds without improvement, logging every 10 rounds.
    model_lgb = lgb.train(
        lgbm_params,
        lgb_train,
        num_boost_round=100,
        valid_sets=[lgb_eval],
        callbacks=[lgb.early_stopping(20), lgb.log_evaluation(10)],
    )
    models.append(model_lgb)

    # Score this fold on its validation slice.
    y_pred = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
    acc = accuracy_score(y_valid, np.round(y_pred))
    accuracy_list.append(acc)
    print("Fold accuracy:", acc)

# Mean accuracy over the folds.
mean_accuracy = np.mean(accuracy_list)
print("Average Cross-Validation Accuracy:", mean_accuracy)

# Average the predictions of all fold models on the test features and
# threshold the mean at 0.5 to obtain 0/1 labels.
preds = [model.predict(test_X, num_iteration=model.best_iteration) for model in models]
preds_mean = np.mean(np.array(preds), axis=0)
preds_int = (preds_mean > 0.5).astype(int)

# Build the submission frame explicitly; it was never defined before this
# point, so assigning into it raised NameError. The ids are re-read from
# test.csv because PassengerId has been dropped from the feature frame, and
# test_X keeps the row order of that file.
submission = pd.read_csv("test.csv", usecols=["PassengerId"])
submission["Survived"] = preds_int
submission.to_csv("titanic_submit01.csv", index=False)
