In [None]:
import pandas as pd
import numpy as np
import random as rnd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the three Kaggle Titanic CSV files (train, test, sample submission) in one pass.
train_df, test_df, gender_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
        '/kaggle/input/titanic/gender_submission.csv',
    )
)

In [None]:
# Print the column labels of the training frame as an array.
print(train_df.columns.values)

In [None]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

In [None]:
# Dtypes and non-null counts for the train frame, a divider, then the same for test.
divider = '-' * 40
train_df.info()
print(divider)
test_df.info()

In [None]:
# Summary statistics for every column of the training frame (include='all').
train_df.describe(include='all')

In [None]:
# Summary statistics restricted to object-dtype ('O') columns of the training frame.
train_df.describe(include=['O'])

In [None]:
# Mean of `Survived` for each `Pclass`, highest mean first.
(
    train_df[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean of `Survived` for each `Sex`, highest mean first.
(
    train_df[['Sex', 'Survived']]
    .groupby(['Sex'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean of `Survived` for each `SibSp` value, highest mean first.
(
    train_df[['SibSp', 'Survived']]
    .groupby(['SibSp'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean of `Survived` for each `Parch` value, highest mean first.
(
    train_df[['Parch', 'Survived']]
    .groupby(['Parch'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# One 20-bin histogram of Age per value of Survived, laid out as grid columns.
g = sns.FacetGrid(data=train_df, col='Survived')
g.map(plt.hist, 'Age', bins=20)

In [None]:
# Histograms (20 bins each) of the training frame's columns on a 14x14 figure.
train_df.hist(bins=20, figsize=(14, 14))
plt.show()

In [None]:
# Bar plot of Survived by Sex, coloured by Sex.
sns.barplot(
    data=train_df,
    x='Sex',
    y='Survived',
    hue='Sex',
)

In [None]:
# Bar plot of Survived by Pclass, with one bar per Sex inside each class.
sns.barplot(
    data=train_df,
    x='Pclass',
    y='Survived',
    hue='Sex',
)

In [None]:
# Violin plot of Age for each Pclass, coloured by Pclass.
sns.violinplot(
    data=train_df,
    x='Pclass',
    y='Age',
    hue='Pclass',
)

In [None]:
# Pairwise correlations between the numeric columns of the training frame.
correlation_matrix = train_df.corr(numeric_only=True)

# Draw the matrix as an annotated heatmap on an explicitly created 10x8 Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap of Titanic Dataset')
plt.show()

In [None]:
# Point plots of Survived against Pclass, split by Sex, one facet row per Embarked value.
#
# FacetGrid.map calls the plotting function once per facet on that facet's subset.
# Without an explicit `order` / `hue_order`, each facet infers its own category
# ordering, so x positions and Sex colours can disagree between facets. Both orders
# are therefore fixed up front from the full training frame.
pclass_order = sorted(train_df['Pclass'].dropna().unique())
sex_order = sorted(train_df['Sex'].dropna().unique())

grid = sns.FacetGrid(train_df, row='Embarked', height=2.2, aspect=1.6)
grid.map(
    sns.pointplot, 'Pclass', 'Survived', 'Sex',
    order=pclass_order, hue_order=sex_order, palette='deep',
)
grid.add_legend()

In [None]:
# Number of missing values in each column of the training frame.
train_df.isna().sum()

In [None]:
# Number of missing values in each column of the test frame.
test_df.isna().sum()

In [None]:
# Report missing-value count and value distribution of Embarked, train first, then test.
separator = '-' * 40

print('counts of missing value (train) =>', train_df['Embarked'].isna().sum())
print(separator)
print(train_df['Embarked'].value_counts())
print(separator)

print('counts of missing value (test) =>', test_df['Embarked'].isna().sum())
print(separator)
print(test_df['Embarked'].value_counts())

In [None]:
# Encode Embarked as integers. Missing train values are filled with 'S' first;
# the test column is mapped as-is (a missing value there would make astype(int) fail).
embarked_mapping = {'S': 1, 'C': 2, 'Q': 3}
train_df['Embarked'] = (
    train_df['Embarked']
    .fillna('S')
    .map(embarked_mapping)
    .astype(int)
)
test_df['Embarked'] = test_df['Embarked'].map(embarked_mapping).astype(int)

In [None]:
print('counts of missing value (train) =>', train_df['Embarked'].isnull().sum())
print('-'*40)
print(train_df['Embarked'].value_counts())
print('-'*40)
print('counts of missing value (test) =>', test_df['Embarked'].isnull().sum())
print('-'*40)
print(test_df['Embarked'].value_counts())

In [None]:
# Extract the title from Name: the run of letters immediately followed by a period.
# The pattern is a raw string so that `\.` reaches the regex engine as an escaped dot
# instead of being an invalid Python string escape (which triggers a SyntaxWarning
# on recent Python versions).
title_pattern = r'([A-Za-z]+)\.'
train_df['Title'] = train_df['Name'].str.extract(title_pattern, expand=False)
test_df['Title'] = test_df['Name'].str.extract(title_pattern, expand=False)

# Cross-tabulate the extracted train titles against Sex.
pd.crosstab(train_df['Title'], train_df['Sex'])

In [None]:
# Collapse the listed titles into 'Rare' and fold the spelling variants
# Mlle/Ms into 'Miss' and Mme into 'Mrs', identically for both frames.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don',
               'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_variants = (('Mlle', 'Miss'), ('Ms', 'Miss'), ('Mme', 'Mrs'))

for frame in (train_df, test_df):
    frame['Title'] = frame['Title'].replace(rare_titles, 'Rare')
    for variant, canonical in title_variants:
        frame['Title'] = frame['Title'].replace([variant], canonical)

In [None]:
# Mean of Survived per Title in the training frame, drawn as a bar chart.
title_survived = train_df.groupby('Title', as_index=False)['Survived'].mean()
title_axes = title_survived.plot(x='Title', kind='bar')
title_axes.set_xlabel('Title')

In [None]:
# Encode the five title groups as integers in both frames.
# A title missing from the mapping becomes NaN and makes astype(int) raise.
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}
for frame in (train_df, test_df):
    frame['Title'] = frame['Title'].map(title_mapping).astype(int)

In [None]:
# Report missing-value count and value distribution of Title, train first, then test.
separator = '-' * 40

print('counts of missing value (train) =>', train_df['Title'].isna().sum())
print(separator)
print(train_df['Title'].value_counts())
print(separator)

print('counts of missing value (test) =>', test_df['Title'].isna().sum())
print(separator)
print(test_df['Title'].value_counts())

In [None]:
# Encode Sex as integers (male -> 1, female -> 2) in both frames.
sex_mapping = {'male': 1, 'female': 2}
for frame in (train_df, test_df):
    frame['Sex'] = frame['Sex'].map(sex_mapping).astype(int)

In [None]:
# Report missing-value count and value distribution of Sex, train first, then test.
separator = '-' * 40

print('counts of missing value (train) =>', train_df['Sex'].isna().sum())
print(separator)
print(train_df['Sex'].value_counts())
print(separator)

print('counts of missing value (test) =>', test_df['Sex'].isna().sum())
print(separator)
print(test_df['Sex'].value_counts())

In [None]:
# Median Age within each Title group of the training frame.
train_df.groupby('Title')['Age'].median()

In [None]:
# Median Age within each Title group of the test frame.
test_df.groupby('Title')['Age'].median()

In [None]:
# Fill missing Age values with the median Age of the row's Title group,
# computed separately inside each frame.
# NOTE(review): the test frame is imputed from its own medians rather than the
# train medians — confirm this is intended.
for frame in (train_df, test_df):
    title_median_age = frame.groupby('Title')['Age'].transform('median')
    frame['Age'] = frame['Age'].fillna(title_median_age)

In [None]:
# Cast Age to integer in both frames (fractional ages are truncated by astype(int)).
for frame in (train_df, test_df):
    frame['Age'] = frame['Age'].astype(int)

In [None]:
# Split train Age into 8 equal-width intervals and show the mean of Survived per interval.
train_df['AgeBand'] = pd.cut(train_df['Age'], bins=8)
(
    train_df
    .groupby('AgeBand', as_index=False, observed=False)['Survived']
    .mean()
    .sort_values(by='AgeBand', ascending=True)
)

In [None]:
# Same 8 equal-width Age bins, now labelled 0..7, followed by the mean of Survived per band.
age_band_labels = [0, 1, 2, 3, 4, 5, 6, 7]
train_df['AgeBand'] = pd.cut(train_df['Age'], bins=8, labels=age_band_labels)
(
    train_df
    .groupby('AgeBand', as_index=False, observed=False)['Survived']
    .mean()
    .sort_values(by='AgeBand', ascending=True)
)

In [None]:
# Assign test AgeBand 0..7 from fixed, right-inclusive cut points at 10, 20, ..., 70
# (everything <= 10 is band 0, everything > 70 is band 7). Stored as float64.
# NOTE(review): the cut points are hard-coded; presumably they mirror the bins
# pd.cut derived for the training frame — verify if the train data changes.
age_edges = [-np.inf, 10, 20, 30, 40, 50, 60, 70, np.inf]
test_df['AgeBand'] = pd.cut(test_df['Age'], bins=age_edges, labels=False).astype('float64')

In [None]:
# Family = SibSp + Parch + 1 in both frames.
for frame in (train_df, test_df):
    frame['Family'] = frame['SibSp'] + frame['Parch'] + 1

# Mean of Survived per Family value, ordered by Family.
(
    train_df
    .groupby('Family', as_index=False)['Survived']
    .mean()
    .sort_values(by='Family')
)

In [None]:
# Alone is 1 where Family == 1 and 0 otherwise, in both frames.
for frame in (train_df, test_df):
    frame['Alone'] = (frame['Family'] == 1).astype('int64')

# Mean of Survived for each Alone value.
train_df.groupby('Alone', as_index=False)['Survived'].mean()

In [None]:
# Split train Fare into 8 quantile-based intervals and show the mean of Survived per interval.
train_df['FareBand'] = pd.qcut(train_df['Fare'], q=8)
(
    train_df
    .groupby('FareBand', as_index=False, observed=False)['Survived']
    .mean()
    .sort_values(by='FareBand', ascending=True)
)

In [None]:
# Same 8 Fare quantile bins, now labelled 0..7, followed by the mean of Survived per band.
fare_band_labels = [0, 1, 2, 3, 4, 5, 6, 7]
train_df['FareBand'] = pd.qcut(train_df['Fare'], q=8, labels=fare_band_labels)
(
    train_df
    .groupby('FareBand', as_index=False, observed=False)['Survived']
    .mean()
    .sort_values(by='FareBand', ascending=True)
)

In [None]:
# Fill missing test fares with the median of the test Fare column.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

# Assign test FareBand 0..7 using the exact quantile edges of the training Fare
# column. The previous version hard-coded those edges rounded to three decimals,
# so a test fare lying exactly on an unrounded edge could land in a different
# band than the same fare in the training frame. Recomputing the edges with
# retbins=True keeps the train and test banding identical by construction.
_, fare_edges = pd.qcut(train_df['Fare'], q=8, retbins=True)
fare_edges = fare_edges.copy()
# Open the outer edges so fares below the train minimum / above the train maximum
# still fall into the first / last band, as the original <= and > rules did.
fare_edges[0], fare_edges[-1] = -np.inf, np.inf

# labels=False yields the integer band index; cast to float64 to keep the
# column dtype the original .loc-based assignment produced.
test_df['FareBand'] = pd.cut(test_df['Fare'], bins=fare_edges, labels=False).astype('float64')

In [None]:
# Preview the first five rows of the test frame after feature engineering.
test_df.head(n=5)

In [None]:
# Preview the first five rows of the training frame after feature engineering.
train_df.head(n=5)

In [None]:
# Column selections: the label, and the twelve model input features.
label_column = ['Survived']
feature_columns = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
    'Embarked', 'Title', 'AgeBand', 'Family', 'Alone', 'FareBand',
]

# Slice the feature and label frames out of the engineered train/test frames.
train_feature_df, test_feature_df = train_df[feature_columns], test_df[feature_columns]
train_label_df = train_df[label_column]

print('train shape = ', train_feature_df.shape, ',test shape =', test_feature_df.shape)

In [None]:
# First two rows of the training feature frame.
train_feature_df.head(n=2)

In [None]:
# First two rows of the test feature frame.
test_feature_df.head(n=2)

In [None]:
from sklearn.preprocessing import StandardScaler

feature_columns = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked',\
                   'Title','AgeBand','Family','Alone','FareBand']
scaler = StandardScaler()

# Learn the per-column mean and standard deviation from the TRAINING features
# only, and standardize the training features with them.
train_feature_df_scaled = scaler.fit_transform(train_feature_df[feature_columns])
train_feature_df_scaled = pd.DataFrame(train_feature_df_scaled, columns = feature_columns)

# Apply the already-fitted scaler to the test features. Calling fit_transform
# here (as before) would re-fit on the test set, so the same raw value would map
# to different scaled values in train and test and the model would see inputs
# on a different scale at prediction time than it was trained on.
test_feature_df_scaled = scaler.transform(test_feature_df[feature_columns])
test_feature_df_scaled = pd.DataFrame(test_feature_df_scaled, columns = feature_columns)


In [None]:
# First two rows of the scaled training features.
train_feature_df_scaled.head(n=2)

In [None]:
# Convert the scaled features and the label frame to float32 NumPy arrays.
x_train, x_test, y_train = (
    frame.to_numpy().astype('float32')
    for frame in (train_feature_df_scaled, test_feature_df_scaled, train_label_df)
)

print('x_train.shape = ', x_train.shape,'x_test.shape = ', x_test.shape,'y_train.shape = ', y_train.shape)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.utils import set_random_seed

# Seed the Python, NumPy and backend random generators so that weight
# initialisation, dropout masks and batch shuffling are repeatable across
# Restart & Run All; without it every run trains (and checkpoints) a different model.
set_random_seed(42)

# Small feed-forward classifier: input -> Dense(32, relu) -> Dropout(0.25) -> 2-way softmax.
model = Sequential()

model.add(Input(shape=(x_train.shape[1],)))  # input width = number of feature columns in x_train
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.25))

# Two output units with softmax: one probability per class.
model.add(Dense(2, activation='softmax'))

In [None]:
# Compile with Adam (learning rate 0.0005) and sparse categorical cross-entropy,
# tracking accuracy.
model.compile(
    optimizer=Adam(learning_rate=0.0005),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Save the model to "best_model.keras" only when validation accuracy reaches a new maximum.
checkpoint = ModelCheckpoint(
    "best_model.keras",
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
    verbose=1,
)

# Stop training once validation loss has not improved for 15 consecutive epochs.
# NOTE(review): early stopping watches val_loss while the checkpoint watches
# val_accuracy — confirm the two different criteria are intended.
early_stopping = EarlyStopping(monitor='val_loss', patience=15, verbose=1)

# Train for up to 200 epochs in batches of 16, validating on a 20% split of the
# training arrays, with both callbacks active.
training_callbacks = [checkpoint, early_stopping]
hist = model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    epochs=200,
    batch_size=16,
    callbacks=training_callbacks,
)

In [None]:
import matplotlib.pyplot as plt

# Pull the per-epoch training and validation curves out of the fit history.
train_loss = hist.history['loss']
val_loss = hist.history['val_loss']
train_accuracy = hist.history['accuracy']
val_accuracy = hist.history['val_accuracy']

# Two side-by-side panels on one 12x6 figure: loss on the left, accuracy on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: loss per epoch.
loss_ax.plot(train_loss, label='Train Loss', color='blue', linestyle='-')
loss_ax.plot(val_loss, label='Validation Loss', color='orange', linestyle='--')
loss_ax.set_title('Loss Over Epochs', fontsize=14)
loss_ax.set_xlabel('Epochs', fontsize=12)
loss_ax.set_ylabel('Loss', fontsize=12)
loss_ax.legend()
loss_ax.grid(True)

# Right panel: accuracy per epoch.
acc_ax.plot(train_accuracy, label='Train Accuracy', color='blue', linestyle='-')
acc_ax.plot(val_accuracy, label='Validation Accuracy', color='orange', linestyle='--')
acc_ax.set_title('Accuracy Over Epochs', fontsize=14)
acc_ax.set_xlabel('Epochs', fontsize=12)
acc_ax.set_ylabel('Accuracy', fontsize=12)
acc_ax.legend()
acc_ax.grid(True)

plt.tight_layout()
plt.show()


In [None]:
from tensorflow.keras.models import load_model

# Reload the checkpointed model from disk and predict on the test array.
best_model = load_model("best_model.keras")
survived_prediction = best_model.predict(x_test)
print(survived_prediction.shape)

# Turn each row of predictions into a class index by taking the position of the largest value.
predicted_classes = np.argmax(survived_prediction, axis=1)
print(predicted_classes)

In [None]:
# Build the submission frame: PassengerId from the test frame plus the predicted class.
submission_df = test_df[['PassengerId']].assign(Survived=predicted_classes)

# Write it as CSV without the index column.
submission_df.to_csv('submission.csv', index=False)

# Show the last rows as a quick sanity check.
print(submission_df.tail())