# 타이타닉 데이터 이진분류 정확도 경시대회

In [1]:
import seaborn as sns
dataset_names = sns.get_dataset_names()
titanic = sns.load_dataset('titanic')

In [2]:
df = sns.load_dataset('titanic')
df.head(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
seed = 2022
np.random.seed(seed)
tf.random.set_seed(seed)

In [4]:
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


# 1. 데이터 전처리

- Feature selection

In [5]:
df = df[['survived','pclass','sex','age','sibsp','parch','fare','embarked','deck']]

- 결측치 확인

In [6]:
df.isna().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
deck        688
dtype: int64

In [7]:
# age 칼럼은 평균으로 대체
df.age.fillna(df.age.mean(), inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [8]:
# embarked 컬럼은 최빈값으로 대체
df.embarked.value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [9]:
df.embarked.fillna('S', inplace =True)
df.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


survived      0
pclass        0
sex           0
age           0
sibsp         0
parch         0
fare          0
embarked      0
deck        688
dtype: int64

In [10]:
# deck 컬럼은 삭제
df.drop(columns = ['deck'], inplace = True)
df.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


survived    0
pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64

- 카테고리 값인 sex, embarked 컬럼은 숫자로 변환

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [12]:
df.sex = le.fit_transform(df.sex)
df.embarked = le.fit_transform(df.embarked)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


2. Train/Test dataset으로 분리

In [13]:
# X와 y를 넘파이 배열로
X = df.iloc[:,1:].values
y = df.survived.values
X.shape, y.shape

((891, 7), (891,))

In [14]:
# y값의 분포
df.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2022
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 7), (179, 7), (712,), (179,))

In [16]:
np.unique(y_train, return_counts=True)

(array([0, 1]), array([439, 273]))

- 모델 저장조건 설정

In [17]:
from tensorflow.keras.callbacks import ModelCheckpoint

In [18]:
import os
if not os.path.exists('model'):
    os.mkdir('model')

In [19]:
model_path = 'model/titanic.h5'
checkpoint = ModelCheckpoint(
    model_path, monitor= 'val_loss', verbose = 1, save_best_only = True
)

- X값 StandardScaler

In [20]:
from sklearn.preprocessing import StandardScaler
X_scaled = StandardScaler().fit_transform(X)
X_scaled.shape

(891, 7)

In [21]:
y.shape

(891,)

- 모델 정의/설정/학습평가

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [23]:
model = Sequential([
    Dense(30, input_dim=7, activation= 'relu'),  # Hiddem Layer
    Dense(15, activation= 'relu'),
    Dense(1, activation= 'sigmoid')              # Output Layer
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 30)                240       
                                                                 
 dense_1 (Dense)             (None, 15)                465       
                                                                 
 dense_2 (Dense)             (None, 1)                 16        
                                                                 
Total params: 721
Trainable params: 721
Non-trainable params: 0
_________________________________________________________________


- 실행 환경 설정(complie)

In [24]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(patience = 30)

In [25]:
model.compile(
    optimizer= 'adam',
    loss = 'binary_crossentropy',        
    metrics=['accuracy']                       

)

In [26]:
hist = model.fit(
    X,  y, validation_split = 0.2, epochs =300, 
    batch_size = 50, verbose =0, callbacks=[checkpoint, early_stopping]
)


Epoch 00001: val_loss improved from inf to 1.18278, saving model to model/titanic.h5

Epoch 00002: val_loss improved from 1.18278 to 0.87903, saving model to model/titanic.h5

Epoch 00003: val_loss improved from 0.87903 to 0.63671, saving model to model/titanic.h5

Epoch 00004: val_loss improved from 0.63671 to 0.55104, saving model to model/titanic.h5

Epoch 00005: val_loss improved from 0.55104 to 0.51989, saving model to model/titanic.h5

Epoch 00006: val_loss improved from 0.51989 to 0.50954, saving model to model/titanic.h5

Epoch 00007: val_loss did not improve from 0.50954

Epoch 00008: val_loss improved from 0.50954 to 0.49698, saving model to model/titanic.h5

Epoch 00009: val_loss did not improve from 0.49698

Epoch 00010: val_loss did not improve from 0.49698

Epoch 00011: val_loss did not improve from 0.49698

Epoch 00012: val_loss improved from 0.49698 to 0.47891, saving model to model/titanic.h5

Epoch 00013: val_loss did not improve from 0.47891

Epoch 00014: val_loss d

In [27]:
from tensorflow.keras.models import load_model
best_model = load_model(model_path)
best_model.evaluate(X_test, y_test)



[0.3899904489517212, 0.8491619825363159]