In [1]:
import numpy as np
import numpy.random as random
import scipy as sp
from pandas import Series, DataFrame
import pandas as pd

import sklearn
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import Adam
from sklearn import preprocessing


In [2]:
# Load the previously saved Keras model from disk
from keras.models import load_model

MODEL_PATH = 'dnn_for_titanic.h5'
model = load_model(MODEL_PATH)

In [3]:
# Data preprocessing function
def pre_processing(dataset):
    """Encode a raw passenger table as numeric features.

    The input frame is NOT modified; all work happens on a copy.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns ``Name``, ``Sex``, ``Age``, ``Embarked``,
        ``Fare``, ``Pclass``, ``Cabin``, ``SibSp``, ``Parch``, ``Ticket`` and
        ``PassengerId``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``Name``, ``Ticket``, ``SibSp``, ``Parch`` and
        ``PassengerId`` are removed, ``Title`` and ``FamilySize`` are added,
        and ``Sex``, ``Age``, ``Fare``, ``Cabin`` and ``Embarked`` are
        replaced by numeric codes. ``Age`` and ``Fare`` are returned as float
        bucket indices.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Work on a copy so the caller's frame survives and re-running the cell
    # on the same input gives the same result.
    features = dataset.copy()

    # Extract the honorific (the word followed by a period, e.g. Mr., Mrs., Miss.)
    titles = features['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Any title other than these three (including a missing one) is coded 3.
    title_mapping = {"Mr": 0, 'Miss': 1, 'Mrs': 2}
    features['Title'] = [title_mapping.get(title, 3) for title in titles]
    # The Name column is no longer needed once the title has been extracted.
    features = features.drop('Name', axis=1)

    # Encode sex; values outside the mapping become NaN.
    sex_mapping = {'male': 0, 'female': 1}
    features['Sex'] = features['Sex'].map(sex_mapping)

    # Fill missing ages with the median age of passengers sharing the same
    # title code. Assigning the result back (instead of a chained
    # ``inplace=True``) keeps this working under pandas Copy-on-Write.
    age_median_by_title = features.groupby('Title')['Age'].transform('median')
    features['Age'] = features['Age'].fillna(age_median_by_title)
    # Bucket ages: <=16 -> 0, (16, 26] -> 1, (26, 36] -> 2, (36, 62] -> 3, >62 -> 4.
    age_edges = [-np.inf, 16, 26, 36, 62, np.inf]
    features['Age'] = pd.cut(features['Age'], bins=age_edges, labels=False).astype(float)

    # Missing embarkation ports are filled with 'S' and then encoded.
    embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}
    features['Embarked'] = features['Embarked'].fillna('S').map(embarked_mapping)

    # Fill missing fares with the median fare of the same passenger class.
    fare_median_by_class = features.groupby('Pclass')['Fare'].transform('median')
    features['Fare'] = features['Fare'].fillna(fare_median_by_class)
    # Bucket fares: <=17 -> 0, (17, 30] -> 1, (30, 100] -> 2, >100 -> 3.
    fare_edges = [-np.inf, 17, 30, 100, np.inf]
    features['Fare'] = pd.cut(features['Fare'], bins=fare_edges, labels=False).astype(float)

    # Keep only the first letter of the cabin identifier and encode it.
    cabin_mapping = {'A': 0, 'B': 0.4, 'C': 0.8, 'D': 1.2, 'E': 1.6,
                     'F': 2, 'G': 2.4, 'T': 2.8}
    features['Cabin'] = features['Cabin'].str[:1].map(cabin_mapping)
    # Fill missing cabin codes with the median code of the same passenger
    # class. A class with no known cabin at all keeps NaN here.
    cabin_median_by_class = features.groupby('Pclass')['Cabin'].transform('median')
    features['Cabin'] = features['Cabin'].fillna(cabin_median_by_class)

    # Combine SibSp and Parch into one family-size feature (+1 for the passenger).
    family_size = features['SibSp'] + features['Parch'] + 1
    # Sizes outside 1..11 are not in the mapping and become NaN.
    family_mapping = {1: 0, 2: 0.4, 3: 0.8, 4: 1.2, 5: 1.6, 6: 2,
                      7: 2.4, 8: 2.8, 9: 3.2, 10: 3.6, 11: 4}
    features['FamilySize'] = family_size.map(family_mapping)

    # Drop the columns that are not used as model inputs.
    features_drop = ['Ticket', 'SibSp', 'Parch', 'PassengerId']
    return features.drop(features_drop, axis=1)

In [5]:
# Read the raw test set, then encode it with pre_processing
raw_test = pd.read_csv('input/test.csv')
test = pre_processing(raw_test)
test.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,FamilySize
0,3,0,2.0,0.0,2.0,2,0,0.0
1,3,1,3.0,0.0,2.0,0,2,0.4
2,2,0,3.0,0.0,2.0,2,0,0.0
3,3,0,2.0,0.0,2.0,0,0,0.0
4,3,1,1.0,0.0,2.0,0,2,0.8


In [10]:
# Check each column's dtype and non-null count after preprocessing
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      418 non-null    int64  
 1   Sex         418 non-null    int64  
 2   Age         418 non-null    float64
 3   Fare        418 non-null    float64
 4   Cabin       418 non-null    float64
 5   Embarked    418 non-null    int64  
 6   Title       418 non-null    int64  
 7   FamilySize  418 non-null    float64
dtypes: float64(4), int64(4)
memory usage: 26.2 KB


In [6]:
# Rescale every feature column to the [0, 1] range (min-max scaling)
# NOTE(review): the scaler is fitted on `test` itself. If the model was trained
# on features scaled with a scaler fitted on the training set, that same fitted
# scaler should be reused here instead -- TODO confirm against the training code.
from sklearn import preprocessing
minmax_scale = preprocessing.MinMaxScaler(feature_range = (0, 1))
scaleFeatures = minmax_scale.fit_transform(test)

In [12]:
# Predict: run the loaded model on the scaled features and display the raw output
probability = model.predict(scaleFeatures)
probability

array([[0.08448359],
       [0.29410973],
       [0.1222679 ],
       [0.08110017],
       [0.41510123],
       [0.21977091],
       [0.6295325 ],
       [0.22851619],
       [0.6052897 ],
       [0.11686662],
       [0.08110017],
       [0.23529735],
       [0.95029086],
       [0.07038307],
       [0.9576793 ],
       [0.911693  ],
       [0.17998296],
       [0.12573102],
       [0.34677452],
       [0.40347728],
       [0.30287033],
       [0.25890765],
       [0.9598245 ],
       [0.48726368],
       [0.9525358 ],
       [0.05262923],
       [0.9665159 ],
       [0.12573102],
       [0.29060876],
       [0.07619697],
       [0.09927648],
       [0.2547053 ],
       [0.3021417 ],
       [0.23251674],
       [0.38814837],
       [0.12573102],
       [0.4823275 ],
       [0.4823275 ],
       [0.11747652],
       [0.10996726],
       [0.05263063],
       [0.31501272],
       [0.05627653],
       [0.8467639 ],
       [0.95702887],
       [0.11747652],
       [0.32800364],
       [0.084

In [13]:
# Read the raw CSV again: the cells below need its PassengerId column,
# which pre_processing removes from its result.
test_dataset = pd.read_csv('input/test.csv')
test_dataset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [15]:
# Print row number, passenger id and the model output rounded to the nearest integer
for i, passenger_id in test_dataset['PassengerId'].items():
    print(i, ". ", passenger_id, " ", int(round(probability[i, 0], 0)))
                                                           

0 .  892   0
1 .  893   0
2 .  894   0
3 .  895   0
4 .  896   0
5 .  897   0
6 .  898   1
7 .  899   0
8 .  900   1
9 .  901   0
10 .  902   0
11 .  903   0
12 .  904   1
13 .  905   0
14 .  906   1
15 .  907   1
16 .  908   0
17 .  909   0
18 .  910   0
19 .  911   0
20 .  912   0
21 .  913   0
22 .  914   1
23 .  915   0
24 .  916   1
25 .  917   0
26 .  918   1
27 .  919   0
28 .  920   0
29 .  921   0
30 .  922   0
31 .  923   0
32 .  924   0
33 .  925   0
34 .  926   0
35 .  927   0
36 .  928   0
37 .  929   0
38 .  930   0
39 .  931   0
40 .  932   0
41 .  933   0
42 .  934   0
43 .  935   1
44 .  936   1
45 .  937   0
46 .  938   0
47 .  939   0
48 .  940   1
49 .  941   0
50 .  942   0
51 .  943   0
52 .  944   1
53 .  945   1
54 .  946   0
55 .  947   0
56 .  948   0
57 .  949   0
58 .  950   0
59 .  951   1
60 .  952   0
61 .  953   0
62 .  954   0
63 .  955   1
64 .  956   1
65 .  957   1
66 .  958   1
67 .  959   0
68 .  960   0
69 .  961   1
70 .  962   1
71 .  963   0
72

In [16]:
# Collect the rounded model output for every passenger row as plain ints
row_count = len(test_dataset['PassengerId'])
ar_tmp = [int(round(probability[row, 0], 0)) for row in range(row_count)]
print(ar_tmp)
     

[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 

In [18]:
# Pair each PassengerId with its predicted label and write the submission file
submission = test_dataset[['PassengerId']].assign(Survived=ar_tmp)
submission.to_csv('submission.csv', index=False)
                           