In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense

import seaborn as sns

# "conda install scikit-learn"을 통해 다운로드
# 문자열 형태의 예측 결과를 숫자 형태로 바꾸어주는 라이브러리
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

In [2]:
# 타이타닉에 탑승한 사람들의 신상정보를 활용하여, 승선한 사람들의 생존 여부 예측

data = sns.load_dataset('titanic')
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
list(data)

['survived',
 'pclass',
 'sex',
 'age',
 'sibsp',
 'parch',
 'fare',
 'embarked',
 'class',
 'who',
 'adult_male',
 'deck',
 'embark_town',
 'alive',
 'alone']

In [4]:
data = data[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch','fare', \
             'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone']]
data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [5]:
# 문자 형태의 원인 데이터를 숫자로 변경

dic = {
    'male' : 1, 'female' : 0,
    'First' : 1, 'Second' : 2, 'Third' : 3,
    'man' : 2, 'woman' : 1, 'child' : 0,
    True : 1, False : 0,
    'Southampton' : 0, 'Cherbourg' : 1, 'Queenstown' : 2,
    'yes' : 1, 'no' : 0 
}

for i in range(ord('A'), ord('Z') + 1):
    dic[chr(i)] = i - ord('A')

dic

{'male': 1,
 'female': 0,
 'First': 1,
 'Second': 2,
 'Third': 3,
 'man': 2,
 'woman': 1,
 'child': 0,
 True: 1,
 False: 0,
 'Southampton': 0,
 'Cherbourg': 1,
 'Queenstown': 2,
 'yes': 1,
 'no': 0,
 'A': 0,
 'B': 1,
 'C': 2,
 'D': 3,
 'E': 4,
 'F': 5,
 'G': 6,
 'H': 7,
 'I': 8,
 'J': 9,
 'K': 10,
 'L': 11,
 'M': 12,
 'N': 13,
 'O': 14,
 'P': 15,
 'Q': 16,
 'R': 17,
 'S': 18,
 'T': 19,
 'U': 20,
 'V': 21,
 'W': 22,
 'X': 23,
 'Y': 24,
 'Z': 25}

In [8]:
dataset = data.values
x = dataset[:, 1:]

for i in range(len(x)):
    for j in range(len(x[i])):
        if x[i][j] in dic:
            key = x[i][j]
            x[i][j] = dic[key]
        
        if str(x[i][j]) == 'nan':
            x[i][j] = 0

for i in x[0]:
    print(str(i), type(i))
print('=' * 30)

3 <class 'int'>
1 <class 'int'>
22.0 <class 'float'>
1 <class 'int'>
0 <class 'int'>
7.25 <class 'float'>
18 <class 'int'>
3 <class 'int'>
2 <class 'int'>
1 <class 'int'>
0 <class 'int'>
0 <class 'int'>
0 <class 'int'>
0 <class 'int'>


In [9]:
y = dataset[:, 0]
y

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
for i in x_train:
    print(i)

[1 1 51.0 0 0 26.55 18 1 2 1 4 0 1 1]
[1 0 49.0 1 0 76.7292 2 1 1 0 3 1 1 0]
[3 1 1 5 2 46.9 18 3 0 0 0 0 0 0]
[1 1 54.0 0 1 77.2875 18 1 2 1 3 0 0 0]
[3 0 0 1 0 14.4583 2 3 1 0 0 1 0 0]
[2 1 8.0 1 1 36.75 18 2 0 0 0 0 1 0]
[1 1 24.0 0 1 247.5208 2 1 2 1 1 1 0 0]
[3 1 20.0 0 0 8.05 18 3 2 1 0 0 0 1]
[3 1 30.0 0 0 8.05 18 3 2 1 0 0 0 1]
[3 1 24.0 0 0 7.1417 18 3 2 1 0 0 1 1]
[3 0 0 0 0 7.8292 16 3 1 0 0 2 1 1]
[3 0 0 0 0 7.55 18 3 1 0 0 0 0 1]
[3 1 25.0 0 0 7.25 18 3 2 1 0 0 0 1]
[3 0 0 1 0 15.5 16 3 1 0 0 2 1 0]
[1 1 0 0 0 35.0 18 1 2 1 2 0 0 1]
[1 0 48.0 0 0 25.9292 18 1 1 0 3 0 1 1]
[3 1 74.0 0 0 7.775 18 3 2 1 0 0 0 1]
[3 1 23.0 0 0 7.8958 18 3 2 1 0 0 0 1]
[3 1 17.0 1 1 7.2292 2 3 2 1 0 1 0 0]
[3 0 0 3 1 25.4667 18 3 1 0 0 0 0 0]
[2 1 52.0 0 0 13.5 18 2 2 1 0 0 0 1]
[1 1 0 0 0 52.0 18 1 2 1 0 0 0 1]
[2 1 0 0 0 13.0 18 2 2 1 0 0 1 1]
[2 1 34.0 1 0 21.0 18 2 2 1 0 0 0 0]
[3 0 0 0 2 22.3583 2 3 1 0 0 1 1 0]
[2 0 34.0 1 1 32.5 18 2 1 0 0 0 1 0]
[2 0 24.0 0 0 13.0 18 2 1 0 5 0 1 1]
[2 1

[1 0 17.0 1 0 57.0 18 1 1 0 1 0 1 0]
[3 1 0 0 0 24.15 16 3 2 1 0 2 0 1]
[3 1 25.0 0 0 7.8958 18 3 2 1 0 0 0 1]
[3 0 30.0 0 0 8.6625 18 3 1 0 0 0 0 1]
[2 1 0.83 1 1 18.75 18 2 0 0 0 0 1 0]
[3 0 45.0 0 0 7.75 18 3 1 0 0 0 0 1]
[1 1 0 0 0 26.0 18 1 2 1 0 0 0 1]
[3 1 0 0 0 7.25 18 3 2 1 0 0 0 1]
[2 0 28.0 1 0 26.0 18 2 1 0 0 0 1 0]
[3 1 28.0 0 0 7.8542 18 3 2 1 0 0 0 1]
[2 1 21.0 2 0 73.5 18 2 2 1 0 0 0 0]
[1 1 33.0 0 0 5.0 18 1 2 1 1 0 0 1]
[3 1 39.0 1 5 31.275 18 3 2 1 0 0 0 0]
[2 1 54.0 0 0 26.0 18 2 2 1 0 0 0 1]
[3 1 29.0 0 0 7.8958 2 3 2 1 0 1 1 1]
[1 0 45.0 1 1 164.8667 18 1 1 0 0 0 1 0]
[3 1 44.0 0 1 16.1 18 3 2 1 0 0 0 0]
[3 1 0 0 0 9.5 18 3 2 1 0 0 0 1]
[3 1 0 0 0 7.8958 18 3 2 1 0 0 0 1]
[3 1 32.0 0 0 7.925 18 3 2 1 0 0 0 1]
[3 1 20.0 1 0 7.925 18 3 2 1 0 0 1 0]
[1 0 58.0 0 0 146.5208 2 1 1 0 1 1 1 1]
[2 1 19.0 0 0 13.0 18 2 2 1 0 0 0 1]
[1 1 45.0 0 0 26.55 18 1 2 1 1 0 0 1]
[3 1 1 4 1 39.6875 18 3 0 0 0 0 0 0]
[3 0 21.0 0 0 7.65 18 3 1 0 0 0 1 1]
[3 1 2.0 4 1 29.125 16 3 0 0 0 2

[3 0 0 0 0 7.8792 16 3 1 0 0 2 1 1]
[1 1 42.0 0 0 26.2875 18 1 2 1 4 0 1 1]
[1 0 36.0 1 2 120.0 18 1 1 0 1 0 1 0]
[1 0 39.0 1 1 83.1583 2 1 1 0 4 1 1 0]
[3 0 19.0 1 0 7.8542 18 3 1 0 0 0 1 0]
[3 1 0 0 0 7.7333 16 3 2 1 0 2 0 1]
[3 0 36.0 1 0 17.4 18 3 1 0 0 0 1 0]
[2 1 60.0 1 1 39.0 18 2 2 1 0 0 0 0]


In [11]:
x_train.shape, y_train.shape

((623, 14), (623,))

In [15]:
model = Sequential()

# 모델의 입출력 형태 설정

model.add(tf.keras.Input(shape = (623,14)))
model.add(Dense(24, input_dim = 14, activation = 'relu'))
model.add(Dense(10, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))

model.compile(loss = 'binary_crossentropy',
             optimizer='adam',
             metrics = ['accuracy']
             )

model.fit(x_train.astype(float), y_train.astype(float), epochs=10, batch_size=5, verbose = False)
model.fit(x_train.astype(float), y_train.astype(float), epochs=1, batch_size=1)



<tensorflow.python.keras.callbacks.History at 0x20b45195910>

In [16]:
accuracy = model.evaluate(x_test.astype(float), y_test.astype(float))
print('\nAccuracy : %.4f' % accuracy[1])


Accuracy : 0.9888


In [18]:
# 모델 사용하기

data = model.predict(x_test.astype(float))
preidct = []

for i in range(len(data)):
    preidct = preidct + [1 if data[i][0] >= 0.5 else 0]

print(preidct[:10]) # 예측
print(list(y_test[:10])) # 실제 결과

[0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
[0, 0, 0, 1, 1, 1, 1, 1, 1, 1]


In [19]:
# 모델을 파일로 저장하기

model.save('models/titanic.h5')

In [20]:
del model # 제대로 된 테스트를 위해 메모리 내의 모델 삭제

In [22]:
# 저장한 모델 불러와서 사용해보기

model = load_model('models/titanic.h5')
accuracy = model.evaluate(x_test.astype(float), y_test.astype(float))
print('\nAccuracy : %.4f' % accuracy[1])

for i in accuracy:
    print(i)
    print()


Accuracy : 0.9888
0.06872429698705673

0.9888059496879578



In [23]:
data = model.predict(x_test.astype(float))
preidct = []

for i in range(len(data)):
    preidct = preidct + [1 if data[i][0] >= 0.5 else 0]

print(preidct[:10]) # 예측
print(list(y_test[:10])) # 실제 결과

[0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
[0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
