In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing 

def prepare_data(df):
    """Clean a passenger DataFrame and turn it into scaled features plus labels.

    The frame is modified in place: missing 'age' and 'fare' values are
    replaced by the column mean, missing 'embarked' values by 'S', and the
    'sex' and 'embarked' columns are re-coded as integers.

    Args:
        df: DataFrame with 'age', 'fare', 'sex' and 'embarked' columns.
            Column 0 is treated as the label; every other column is a feature.

    Returns:
        A tuple ``(norm_features, label)``: the feature columns min-max
        scaled to the range [0, 1], and the values of column 0.
    """
    # Impute numeric gaps with the mean of the respective column.
    for numeric_col in ('age', 'fare'):
        col_mean = df[numeric_col].mean()
        df[numeric_col] = df[numeric_col].fillna(col_mean)
    # A missing embarkation port is filled with 'S'.
    df['embarked'] = df['embarked'].fillna('S')

    # Re-code the text categories as integers.
    category_codes = (
        ('sex', {'male':1, 'female':0}),
        ('embarked', {'C':0, 'Q':1,'S':2}),
    )
    for cat_col, code_of in category_codes:
        df[cat_col] = df[cat_col].map(code_of).astype(int)

    # Column 0 is the label; the remaining columns are the features.
    matrix = df.values
    label = matrix[:, 0]
    raw_features = matrix[:, 1:]

    # Scale every feature column to [0, 1]; the scaler is fitted on this data.
    scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
    return scaler.fit_transform(raw_features), label
    
    

In [2]:
# Columns kept for modelling: 'survived' first, followed by the
# columns used as features.
selected_cols = [
    'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked',
]

# Read the whole spreadsheet, then keep only the selected columns.
data_file_path = "data/titanic3.xls"
df_data = pd.read_excel(data_file_path)

selected_df_data = df_data[selected_cols]

In [3]:
# Shuffle the rows with pandas `sample`: frac=1 draws every row, in random
# order. `sample` returns a new DataFrame, so selected_df_data is unchanged.
# A fixed seed makes the shuffle, and therefore the train/test split below,
# repeatable across kernel restarts.
SHUFFLE_SEED = 42
shuffle_data = selected_df_data.sample(frac=1, random_state=SHUFFLE_SEED)
x_data, y_data = prepare_data(shuffle_data)

# Split into training and test sets: the first 80% of the shuffled rows are
# used for training, the remaining rows for testing.
train_size = int(len(x_data) * 0.8)

x_train, x_test = x_data[:train_size], x_data[train_size:]
y_train, y_test = y_data[:train_size], y_data[train_size:]

In [4]:
# Planned network: input layer -> hidden layer 1 (64 units)
#                  -> hidden layer 2 (32 units) -> output layer (1 unit)
import tensorflow as tf

# Start from an empty Keras Sequential model; the layers are added afterwards.
model = tf.keras.Sequential()

In [5]:
# The Keras layer API takes care of a lot of work that the low-level API
# would leave to us.

# First hidden layer. The input data has 7 feature columns
# (input_shape=(7,) could be used instead of input_dim=7).
hidden_layer_1 = tf.keras.layers.Dense(
    units=64,                      # number of output neurons
    input_dim=7,                   # number of input features
    use_bias=True,                 # enable the bias term
    kernel_initializer='uniform',  # weight initialisation scheme
    bias_initializer='zeros',      # biases start at 0
    activation='relu',             # ReLU activation
)

# Second hidden layer. Most arguments can keep their defaults; the input
# dimension is derived from the previous layer.
hidden_layer_2 = tf.keras.layers.Dense(units=32, activation='sigmoid')

# Output layer: a single sigmoid unit.
output_layer = tf.keras.layers.Dense(units=1, activation='sigmoid')

# Stack the layers in order.
model.add(hidden_layer_1)
model.add(hidden_layer_2)
model.add(output_layer)

# Show the model structure.
# Parameter counts include the bias terms: (7+1)*64, (64+1)*32, (32+1)*1.
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                512       
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 2,625
Trainable params: 2,625
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Configure the model for training.
# Rule of thumb: with a sigmoid output the usual loss is binary_crossentropy;
# with a softmax output, categorical_crossentropy is used instead.
adam_optimizer = tf.keras.optimizers.Adam(0.003)  # optimizer; 0.003 is passed as its first argument

model.compile(
    optimizer=adam_optimizer,
    loss='binary_crossentropy',  # loss function
    metrics=['accuracy'],        # metric monitored during training: accuracy
)



In [7]:
# Train the model.
fit_options = dict(
    validation_split=0.2,  # fraction of the training data held out for validation (80/20)
    epochs=100,
    batch_size=40,
    verbose=2,             # 0 = silent, 1 = progress bar, 2 = one line per epoch
)
train_history = model.fit(x_train, y_train, **fit_options)

# The returned object records the loss and accuracy of each training epoch,
# plus the validation values when a validation set is used.
# They can be inspected through train_history.history.

Train on 837 samples, validate on 210 samples
Epoch 1/100
837/837 - 1s - loss: 0.6529 - accuracy: 0.6129 - val_loss: 0.6288 - val_accuracy: 0.6095
Epoch 2/100
837/837 - 0s - loss: 0.5935 - accuracy: 0.6738 - val_loss: 0.5562 - val_accuracy: 0.7810
Epoch 3/100
837/837 - 0s - loss: 0.5247 - accuracy: 0.7706 - val_loss: 0.5071 - val_accuracy: 0.7667
Epoch 4/100
837/837 - 0s - loss: 0.4946 - accuracy: 0.7766 - val_loss: 0.4946 - val_accuracy: 0.7952
Epoch 5/100
837/837 - 0s - loss: 0.4848 - accuracy: 0.7790 - val_loss: 0.4864 - val_accuracy: 0.7762
Epoch 6/100
837/837 - 0s - loss: 0.4792 - accuracy: 0.7790 - val_loss: 0.4817 - val_accuracy: 0.7810
Epoch 7/100
837/837 - 0s - loss: 0.4751 - accuracy: 0.7778 - val_loss: 0.4762 - val_accuracy: 0.7857
Epoch 8/100
837/837 - 0s - loss: 0.4720 - accuracy: 0.7826 - val_loss: 0.4748 - val_accuracy: 0.7810
Epoch 9/100
837/837 - 0s - loss: 0.4682 - accuracy: 0.7897 - val_loss: 0.4777 - val_accuracy: 0.7810
Epoch 10/100
837/837 - 0s - loss: 0.4686 - ac

Epoch 81/100
837/837 - 0s - loss: 0.4335 - accuracy: 0.8041 - val_loss: 0.4407 - val_accuracy: 0.7952
Epoch 82/100
837/837 - 0s - loss: 0.4331 - accuracy: 0.8029 - val_loss: 0.4396 - val_accuracy: 0.8095
Epoch 83/100
837/837 - 0s - loss: 0.4333 - accuracy: 0.8076 - val_loss: 0.4396 - val_accuracy: 0.8048
Epoch 84/100
837/837 - 0s - loss: 0.4385 - accuracy: 0.8041 - val_loss: 0.4417 - val_accuracy: 0.8095
Epoch 85/100
837/837 - 0s - loss: 0.4327 - accuracy: 0.8100 - val_loss: 0.4418 - val_accuracy: 0.7952
Epoch 86/100
837/837 - 0s - loss: 0.4341 - accuracy: 0.8017 - val_loss: 0.4389 - val_accuracy: 0.8048
Epoch 87/100
837/837 - 0s - loss: 0.4345 - accuracy: 0.8041 - val_loss: 0.4398 - val_accuracy: 0.8048
Epoch 88/100
837/837 - 0s - loss: 0.4322 - accuracy: 0.8100 - val_loss: 0.4392 - val_accuracy: 0.8095
Epoch 89/100
837/837 - 0s - loss: 0.4318 - accuracy: 0.7993 - val_loss: 0.4395 - val_accuracy: 0.8143
Epoch 90/100
837/837 - 0s - loss: 0.4305 - accuracy: 0.8088 - val_loss: 0.4380 - v

In [8]:
# Evaluate the trained model on the held-out test set.
evaluate_result = model.evaluate(x_test, y_test)

# Print the resulting values, then the names of the model's metrics.
print(evaluate_result)
print(model.metrics_names)


[0.371098417703432, 0.82824427]
['loss', 'accuracy']


In [9]:
# Predict the survival probability of two hypothetical passengers.
# Each record follows the column order of selected_cols.
Jack_info = [0, 3, 'male', 23, 1, 0, 5.0, 'S']
Rose_info = [1, 1, 'female', 20, 1, 0, 100.0, 'S']

# Build a DataFrame holding the new passengers.
new_passenger_pd = pd.DataFrame([Jack_info, Rose_info], columns=selected_cols)

# Add the new passengers after the existing ones. DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0; pd.concat is the
# supported equivalent and, like append, keeps each frame's own index labels.
all_passenger_pd = pd.concat([selected_df_data, new_passenger_pd])

# Preprocess the combined data. prepare_data also fills and re-codes the
# columns of all_passenger_pd in place.
x_features, y_label = prepare_data(all_passenger_pd)

# Use the model to compute a survival probability for every passenger.
surv_probability = model.predict(x_features)

# Insert the probabilities as the last column and show the two new passengers.
all_passenger_pd.insert(len(all_passenger_pd.columns), 'surv_probability', surv_probability)
print(all_passenger_pd[-2:])



   survived  pclass  sex   age  sibsp  parch   fare  embarked  \
0         0       3    1  23.0      1      0    5.0         2   
1         1       1    0  20.0      1      0  100.0         2   

   surv_probability  
0          0.127346  
1          0.976578  
