In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

2.0.0
sys.version_info(major=3, minor=7, micro=6, releaselevel='final', serial=0)
matplotlib 3.1.2
numpy 1.18.1
pandas 1.0.0
sklearn 0.22.1
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


## 读取数据并转换

In [2]:
# 读取csv文件成为pandas的DataFrame
train_file = './data/titanic/train.csv'
eval_file = './data/titanic/eval.csv'

train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)

print(train_df.head())  # head函数去除前几条数据，默认前5条
print(eval_df.head())

   survived     sex   age  n_siblings_spouses  parch     fare  class     deck  \
0         0    male  22.0                   1      0   7.2500  Third  unknown   
1         1  female  38.0                   1      0  71.2833  First        C   
2         1  female  26.0                   0      0   7.9250  Third  unknown   
3         1  female  35.0                   1      0  53.1000  First        C   
4         0    male  28.0                   0      0   8.4583  Third  unknown   

   embark_town alone  
0  Southampton     n  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  
4   Queenstown     y  
   survived     sex   age  n_siblings_spouses  parch     fare   class  \
0         0    male  35.0                   0      0   8.0500   Third   
1         0    male  54.0                   0      0  51.8625   First   
2         1  female  58.0                   0      0  26.5500   First   
3         1  female  55.0                   0      0  16.0000  Second   
4         

In [3]:
y_train = train_df.pop('survived')  # pop函数可以把相应的字段从数据集里去除，并返回该字段的值
y_eval = eval_df.pop('survived')

train_df.describe()  #显示统计信息

Unnamed: 0,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0
mean,29.631308,0.545455,0.379585,34.385399
std,12.511818,1.15109,0.792999,54.59773
min,0.75,0.0,0.0,0.0
25%,23.0,0.0,0.0,7.8958
50%,28.0,0.0,0.0,15.0458
75%,35.0,1.0,0.0,31.3875
max,80.0,8.0,5.0,512.3292


### 数据处理

In [4]:
# 使用feature columns对数据做封装
# 将数据分为离散特征和连续特征
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'embark_town', 'alone']   # 离散特征
numeric_columns = ['age', 'fare']   # 连续特征
feature_columns = []
for categorical_column in categorical_columns:
    vocab = train_df[categorical_column].unique()   # unique函数获取一个属性所有可能的值
    print(categorical_column, vocab)
    feature_column = tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list(categorical_column, vocab))
    feature_columns.append(feature_column)

for numeric_column in numeric_columns:
    feature_columns.append(tf.feature_column.numeric_column(numeric_column, dtype=tf.float32))

sex ['male' 'female']
n_siblings_spouses [1 0 3 4 2 5 8]
parch [0 1 2 5 3 4]
class ['Third' 'First' 'Second']
deck ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
embark_town ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
alone ['n' 'y']


In [5]:
# 构建dataset
def make_dataset(data_df, label_df, epochs=10, shuffle=True, batch_size=32):
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset.make_one_shot_iterator().get_next()

train_dataset = make_dataset(train_df, y_train, batch_size=5)

AttributeError: 'BatchDataset' object has no attribute 'make_one_shot_iterator'

## 自定义estimator

In [None]:
output_dir = 'customized_estimator'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
def model_fn(features, labels, mode, params):
    # mode: model runtime stat: [Train, Eval, Predict]
    input_for_next_layer = tf.feature_column.input_layer(features, params['feature_column'])  # 作用与keras.layers.DenseFeatures相同
    for n_unit in params['hidden_units']:
        input_for_next_layer = tf.layers.dense(input_for_next_layer, units=n_unit, activation=tf.nn.relu)
    logits = tf.layer.dense(input_for_next_layer, units=params['n_classes'], activation=None)
    predicted_classes = tf.argmax(logits, 1)
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'class_ids': predicted_classes[:, tf.newaxis],   # 这里要求predicted_classes是一个n*1的矩阵，而原本的predicted_classes是一个向量，因此需要扩展一个维度
            'probabilities': tf.nn.softmax(logits), 
            'logits': logits
        }
        return tf.estimator.EstimatorSpec(mode, predictions = predictions) # 这里的predictions定义了session run哪些算子
    
    loss = tf.losses.sparse_categorical_crossentropy(labels = labels, logits = logits)
    
    accuracy = tf.metrics.accuracy(labels = labels, 
                                  predictions = predicted_classes, 
                                  name = 'acc_op')
    metrics = {'accuracy': accuracy}
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss = loss, eval_metric_ops = metrics)
    
    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.minimize(loss, global_step = tf.train.get_global_step())
    if mode == tf.estimator.ModeKeys.TRAIN:
        return tf.estimator.EstimatorSpec(mode, loss = loss, train_op = train_op)

estimator = tf.estimator.Estimator(
    model_fn = model_fn,
    model_dir = output_dir, 
    params = {
        'feature_columns': feature_columns,
        'hidden_units': [100, 100],
        'n_classes': 2
    }
)

estimator.train(input_fn = lambda : make_dataset(train_df, y_train, epochs = 100))