In [None]:
#匯入套件與模組
import numpy
import pandas as pd
from sklearn import preprocessing
numpy.random.seed(10)

In [None]:
#下載資料集
import urllib.request 
import os
url="http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"
filepath="titanic3.xls"
if not os.path.isfile(filepath):
    result=urllib.request.urlretrieve(url,filepath)
    print('downloaded:',result)

In [14]:
#讀取資料集
all_df = pd.read_excel("titanic3.xls")

In [15]:
all_df[:2]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"


In [16]:
cols=['survived','name','pclass' ,'sex', 'age', 'sibsp',
      'parch', 'fare', 'embarked']
all_df=all_df[cols]

In [17]:
# 依8:2比例將資料分成訓練資料與測試資料
msk = numpy.random.rand(len(all_df)) < 0.8
train_df = all_df[msk]
test_df = all_df[~msk]

In [18]:
print('total:',len(all_df),
      'train:',len(train_df),
      'test:',len(test_df),)

total: 1309 train: 1034 test: 275


In [19]:
#資料前處理
def PreprocessData(all_df):
    df=all_df.drop(['name'], axis=1)
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)
    df['sex'] = df['sex'].map({'female':0, 'male':1}).astype(int)
    x_OneHot_df = pd.get_dummies(data=df,columns=["embarked"])
    
    ndarray = x_OneHot_df.values
    Features = ndarray[:,1:]
    Label = ndarray[:,0]
    
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))
    scaledFeatures=minmax_scale.fit_transform(Features)
    
    return scaledFeatures,Label
    

In [20]:
train_Features,train_Label = PreprocessData(train_df)
test_Features,test_Label = PreprocessData(test_df)

# Build Model

In [46]:
#匯入keras模組
from keras.models import Sequential
from keras.layers import Dense,Dropout

In [156]:
#建立Keras Sequentail模型
model = Sequential()

In [157]:
#建立Dense層
#輸出的神經元個數為40
#輸入的神經元個數為9(9個features)
#設定kernel_initializer 為 uniform distibution設定之亂數，初始化weight及bias
#設定Activation Function 為relu
model.add(Dense(units=50,input_dim=9,
               kernel_initializer='uniform',
               activation='relu'))

In [158]:
#建立Dense層
#輸出的神經元個數為30
#輸入的神經元個數為 上一層之輸出層（預設）
#設定kernel_initializer 為 uniform distibution設定之亂數，初始化weight及bias
#設定Activation Function 為relu
model.add(Dense(units=50,
                kernel_initializer='uniform',
               activation='relu'))

In [159]:
#建立Dense層
#輸出的神經元個數為1(最後的結果)
#輸入的神經元個數為 上一層之輸出層（預設）
#設定kernel_initializer 為 uniform distibution設定之亂數，初始化weight及bias
#設定Activation Function 為 sigmoid
model.add(Dense(units=1,
                kernel_initializer='uniform',
               activation='sigmoid'))

In [160]:
#定義模型訓練方式：
#設定loss function
#optimizer 設定優化器
#設定評估模型的方式為 accuracy
model.compile(loss = 'binary_crossentropy',
             optimizer='adam',metrics= ['accuracy'])

In [161]:
model.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_39 (Dense)             (None, 50)                500       
_________________________________________________________________
dense_40 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_41 (Dense)             (None, 1)                 51        
Total params: 3,101
Trainable params: 3,101
Non-trainable params: 0
_________________________________________________________________


In [162]:
#設定訓練資料參數
#設定訓言資料與驗證資料比例
#設定epoch訓練週期與batch_size每一批次訓練筆數
#verbose設定顯示訓練過程
train_history = model.fit(
    x=train_Features,
    y=train_Label,
    validation_split = 0.1,
    epochs = 100,
    batch_size = 16,
    verbose=2)

Epoch 1/100
59/59 - 0s - loss: 0.6761 - accuracy: 0.6237 - val_loss: 0.5833 - val_accuracy: 0.7885
Epoch 2/100
59/59 - 0s - loss: 0.5699 - accuracy: 0.7237 - val_loss: 0.4443 - val_accuracy: 0.7981
Epoch 3/100
59/59 - 0s - loss: 0.4965 - accuracy: 0.7656 - val_loss: 0.4648 - val_accuracy: 0.7885
Epoch 4/100
59/59 - 0s - loss: 0.4848 - accuracy: 0.7720 - val_loss: 0.4342 - val_accuracy: 0.7692
Epoch 5/100
59/59 - 0s - loss: 0.4792 - accuracy: 0.7613 - val_loss: 0.4321 - val_accuracy: 0.8173
Epoch 6/100
59/59 - 0s - loss: 0.4744 - accuracy: 0.7785 - val_loss: 0.4294 - val_accuracy: 0.8173
Epoch 7/100
59/59 - 0s - loss: 0.4674 - accuracy: 0.7720 - val_loss: 0.4221 - val_accuracy: 0.8269
Epoch 8/100
59/59 - 0s - loss: 0.4662 - accuracy: 0.7828 - val_loss: 0.4221 - val_accuracy: 0.7981
Epoch 9/100
59/59 - 0s - loss: 0.4644 - accuracy: 0.7720 - val_loss: 0.4224 - val_accuracy: 0.8269
Epoch 10/100
59/59 - 0s - loss: 0.4615 - accuracy: 0.7828 - val_loss: 0.4188 - val_accuracy: 0.8269
Epoch 11/

In [163]:
#評估模型準確率
scores = model.evaluate(x = test_Features,y = test_Label)



In [164]:
scores[1]

0.8145454525947571

# Prediction

In [165]:
#匯入自定義的人物data
Jack = pd.Series([0,'Jack',3,'male',23,1,0,5.0000,'S'])
Rose = pd.Series([1,'Rose',1,'female',20,1,0,100.0000,'S'])
JR_df = pd.DataFrame([list(Jack),list(Rose)],
                    columns=['survived','name','pclass','sex','age','sibsp','parch','fare','embarked'])

AttributeError: 'DataFrame' object has no attribute 'Series'

In [166]:
all_df = pd.concat([all_df,JR_df])

AttributeError: 'DataFrame' object has no attribute 'concat'

In [167]:
all_df[-2:]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked,probability
0,0,Jack,3,male,23.0,1,0,5.0,S,0.241468
1,1,Rose,1,female,20.0,1,0,100.0,S,0.976595


In [168]:
all_Features,Label = PreprocessData(all_df)

AttributeError: 'DataFrame' object has no attribute 'get_dummies'

In [152]:
#透過剛剛訓練好的模型預測其存活率
all_probability = model.predict(all_Features)

In [153]:
all_probability[:10]

array([[0.97798055],
       [0.8275585 ],
       [0.98185694],
       [0.28852934],
       [0.98112535],
       [0.24379593],
       [0.96958303],
       [0.37462598],
       [0.97301996],
       [0.32171267]], dtype=float32)

In [154]:
pd=all_df

In [155]:
pd.insert(len(all_df.columns),
          'probability',all_probability)

ValueError: cannot insert probability, already exists

In [141]:
#最後一欄為其成果
pd[-2:]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked,probability
0,0,Jack,3,male,23.0,1,0,5.0,S,0.241468
1,1,Rose,1,female,20.0,1,0,100.0,S,0.976595
