## UCI Adult数据集
### 下载地址: https://archive.ics.uci.edu/ml/datasets/adult
这是一个分类问题，我们要预测一个人的收入是否高于$50K/年，这个数据集也叫做"Census收入"数据集

In [1]:
import shutil
import math
from datetime import datetime
import multiprocessing

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import data
from tensorflow.python.feature_column import feature_column

print(tf.__version__)

1.7.0


In [2]:
MODEL_NAME = 'cenus-model-01'

TRAIN_DATA_FILES_PATTERN = 'adult_train.csv'
TEST_DATA_FILES_PATTERN = 'adult_test.csv'

RESUME_TRAINING = False
PROCESS_FEATURES = True
EXTEND_FEATURE_COLUMNS = True
MULTI_THREADING = True

## 定义数据集的信息

In [3]:
# 数据集每个字段的名称
HEADER = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
               'marital_status', 'occupation', 'relationship', 'race', 'gender',
               'capital_gain', 'capital_loss', 'hours_per_week',
               'native_country', 'income_bracket']

# 数据集默认值(数值型默认0，字符串型默认空串)
HEADER_DEFAULTS = [[0], [''], [0], [''], [0], [''], [''], [''], [''], [''],
                       [0], [0], [0], [''], ['']]
# 数值型的列
NUMERIC_FEATURE_NAMES = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

# 类别型的列，同时把列的不同取值列出来
CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY = {
    'gender': ['Female', 'Male'],
    
    'race': ['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'],
    
    'education': ['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', 
                  'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school', 
                  '5th-6th', '10th', '1st-4th', 'Preschool', '12th'],
    
    'marital_status': ['Married-civ-spouse', 'Divorced', 'Married-spouse-absent', 
                       'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'],
    
    'relationship': ['Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried', 'Other-relative'],
    
    'workclass': ['Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov', 'Local-gov', '?', 
                  'Self-emp-inc', 'Without-pay', 'Never-worked']
}

# 类别比较多的，我们做hash分桶
CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE = {
    'occupation': 50,
    'native_country' : 100
}

# 类别型的列名
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.keys()) + list(CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE.keys())

# 总的列名
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

# 目标列名
TARGET_NAME = 'income_bracket'

# 目标不同类别的取值
TARGET_LABELS = ['<=50K', '>50K']

# 权重列
WEIGHT_COLUMN_NAME = 'fnlwgt'

# 没有用到的列
UNUSED_FEATURE_NAMES = list(set(HEADER) - set(FEATURE_NAMES) - {TARGET_NAME} - {WEIGHT_COLUMN_NAME})


print("全部列名: {}".format(HEADER))
print("数值型的特征: {}".format(NUMERIC_FEATURE_NAMES))
print("类别型的特征: {}".format(CATEGORICAL_FEATURE_NAMES))
print("目标列: {} - 不同的分类结果: {}".format(TARGET_NAME, TARGET_LABELS))
print("没有用到的列: {}".format(UNUSED_FEATURE_NAMES))

全部列名: ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income_bracket']
数值型的特征: ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
类别型的特征: ['gender', 'race', 'education', 'marital_status', 'relationship', 'workclass', 'occupation', 'native_country']
目标列: income_bracket - 不同的分类结果: ['<=50K', '>50K']
没有用到的列: []


## 做一点数据探索
这里数据探索还是用的pandas，毕竟数据分析和自带的可视化，用这个会方便一点。

In [4]:
train_data = pd.read_csv(TRAIN_DATA_FILES_PATTERN, header=None, names=HEADER )
train_data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education_num     32561 non-null int64
marital_status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
gender            32561 non-null object
capital_gain      32561 non-null int64
capital_loss      32561 non-null int64
hours_per_week    32561 non-null int64
native_country    32561 non-null object
income_bracket    32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
train_data.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [7]:
TRAIN_DATA_SIZE = train_data.shape[0]
test_data = pd.read_csv(TEST_DATA_FILES_PATTERN, skiprows=1)
TEST_DATA_SIZE = test_data.shape[0]

### Compute Scaling Statistics for Numeric Columns

In [8]:
means = train_data[NUMERIC_FEATURE_NAMES].mean(axis=0)
stdvs = train_data[NUMERIC_FEATURE_NAMES].std(axis=0)
maxs = train_data[NUMERIC_FEATURE_NAMES].max(axis=0)
mins = train_data[NUMERIC_FEATURE_NAMES].min(axis=0)
df_stats = pd.DataFrame({"mean":means, "stdv":stdvs, "max":maxs, "min":mins})
df_stats.head(15)

Unnamed: 0,max,mean,min,stdv
age,90,38.581647,17,13.640433
education_num,16,10.080679,1,2.57272
capital_gain,99999,1077.648844,0,7385.292085
capital_loss,4356,87.30383,0,402.960219
hours_per_week,99,40.437456,1,12.347429


### 存储统计分析数据

In [9]:
df_stats.to_csv(path_or_buf="adult.stats.csv", header=True, index=True)

## 整体结构

![](http://7xo0y8.com1.z0.glb.clouddn.com/tf/estimator.png?imageView2/2/w/400)

## 定义数据输入函数

### a. 解析csv与预处理逻辑

In [10]:
def parse_csv_row(csv_row):
    # help(tf.decode_csv)
    columns = tf.decode_csv(csv_row, record_defaults=HEADER_DEFAULTS)
    # 把tensor和对应的列名打包成字典
    features = dict(zip(HEADER, columns))
    # 去除无用的列
    for column in UNUSED_FEATURE_NAMES:
        features.pop(column)
    # 取出目标列
    target = features.pop(TARGET_NAME)
    # 返回 字典+target序列形式
    return features, target

# 处理特征
def process_features(features):
    # 判断，字典中新的key capital_indicator也同样对应一个tensor
    capital_indicator = features['capital_gain'] > features['capital_loss']
    features['capital_indicator'] = tf.cast(capital_indicator, dtype=tf.int32)
    # 返回feature字典
    return features

### b. 数据输入函数

In [11]:
# 输入到estimator的数据解析函数
def csv_input_fn(file_names, mode=tf.estimator.ModeKeys.EVAL, 
                 skip_header_lines=0, 
                 num_epochs=None, 
                 batch_size=200):
    
    # 训练阶段数据要shuffle，测试阶段不用
    shuffle = True if mode == tf.estimator.ModeKeys.TRAIN else False
    # 多线程
    num_threads = multiprocessing.cpu_count() if MULTI_THREADING else 1
    # 输出信息
    print("")
    print("数据输入函数input_fn:")
    print("================")
    print("输入文件: {}".format(file_names))
    print("Batch size: {}".format(batch_size))
    print("Epoch Count: {}".format(num_epochs))
    print("模式: {}".format(mode))
    print("Thread Count: {}".format(num_threads))
    print("Shuffle: {}".format(shuffle))
    print("================")
    print("")

    
    #file_names = tf.matching_files(files_name_pattern)
    dataset = data.TextLineDataset(filenames=file_names)
    # 跳过第一行
    dataset = dataset.skip(skip_header_lines)
    # 乱序
    if shuffle:
        dataset = dataset.shuffle(buffer_size=2 * batch_size + 1)
    # 取一个batch
    dataset = dataset.batch(batch_size)
    # 对数据进行解析
    dataset = dataset.map(lambda csv_row: parse_csv_row(csv_row), 
                          num_parallel_calls=num_threads)
    # 如果做更多处理，添加新列
    if PROCESS_FEATURES:
        dataset = dataset.map(lambda features, target: (process_features(features), target), 
                              num_parallel_calls=num_threads)
    # 每个epoch完成后，重启dataset  
    dataset = dataset.repeat(num_epochs)
    iterator = dataset.make_one_shot_iterator()
    # 取出满足 特征字典+结果序列 的值
    features, target = iterator.get_next()
    return features, target

In [12]:
features, target = csv_input_fn(file_names=["./adult_train.csv"])
print("CSV文件的特征: {}".format(list(features.keys())))
print("CSV文件的标签: {}".format(target))


数据输入函数input_fn:
输入文件: ['./adult_train.csv']
Batch size: 200
Epoch Count: None
模式: eval
Thread Count: 8
Shuffle: False

CSV文件的特征: ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'capital_indicator']
CSV文件的标签: Tensor("IteratorGetNext:15", shape=(?,), dtype=string)


## 定义特征列
### a. 对数值型变量做幅度缩放

In [13]:
df_stats = pd.read_csv("./adult.stats.csv", header=0, index_col=0)
df_stats['feature_name'] = NUMERIC_FEATURE_NAMES
df_stats.head(10)

Unnamed: 0,max,mean,min,stdv,feature_name
age,90,38.581647,17,13.640433,age
education_num,16,10.080679,1,2.57272,education_num
capital_gain,99999,1077.648844,0,7385.292085,capital_gain
capital_loss,4356,87.30383,0,402.960219,capital_loss
hours_per_week,99,40.437456,1,12.347429,hours_per_week


### b. 构建不同的特征列(特征工程)

In [14]:
# 使用tf构建的高级特征
def extend_feature_columns(feature_columns, hparams):
    
    # 年龄分桶
    age_buckets = tf.feature_column.bucketized_column(
      feature_columns['age'], boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    
    # 特征交叉组合并hash分桶
    education_X_occupation = tf.feature_column.crossed_column(
     ['education', 'occupation'], hash_bucket_size=int(1e4))
    
    # 特征交叉组合并hash分桶
    age_buckets_X_race = tf.feature_column.crossed_column(
     [age_buckets, feature_columns['race']], hash_bucket_size=int(1e4))
    
    # 特征交叉组合并hash分桶
    native_country_X_occupation = tf.feature_column.crossed_column(
          ['native_country', 'occupation'], hash_bucket_size=int(1e4))
    
    # 对类别型特征做embedding
    native_country_embedded = tf.feature_column.embedding_column(
          feature_columns['native_country'], dimension=hparams['embedding_size'])
    
    # 对类别型特征做embedding
    occupation_embedded = tf.feature_column.embedding_column(
          feature_columns['occupation'], dimension=hparams['embedding_size'])
    
    # 同上
    education_X_occupation_embedded = tf.feature_column.embedding_column(
          education_X_occupation, dimension=hparams['embedding_size'])
    
    # 同上
    native_country_X_occupation_embedded = tf.feature_column.embedding_column(
          native_country_X_occupation, dimension=hparams['embedding_size'])
    
    # 构建feature columns
    feature_columns['age_buckets'] = age_buckets
    feature_columns['education_X_occupation'] = education_X_occupation
    feature_columns['age_buckets_X_race'] = age_buckets_X_race
    feature_columns['native_country_X_occupation'] = native_country_X_occupation
    feature_columns['native_country_embedded'] = native_country_embedded
    feature_columns['occupation_embedded'] = occupation_embedded
    feature_columns['education_X_occupation_embedded'] = education_X_occupation_embedded
    feature_columns['native_country_X_occupation_embedded'] = native_country_X_occupation_embedded
    
    # 返回feature_columns字典
    return feature_columns

# 标准化
def standard_scaler(x, mean, stdv):
    return (x-mean)/(stdv)

# 最大最小值幅度缩放
def maxmin_scaler(x, max_value, min_value):
    return (x-min_value)/(max_value-min_value)  

# 全部的特征
def get_feature_columns(hparams):
    
    # 数值型的列
    numeric_columns = {}
    # 对数值型的列做幅度缩放(scaling)
    for feature_name in NUMERIC_FEATURE_NAMES:

        feature_mean = df_stats[df_stats.feature_name == feature_name]['mean'].values[0]
        feature_stdv = df_stats[df_stats.feature_name == feature_name]['stdv'].values[0]
        normalizer_fn = lambda x: standard_scaler(x, feature_mean, feature_stdv)
        
        numeric_columns[feature_name] = tf.feature_column.numeric_column(feature_name, 
                                                                         normalizer_fn=normalizer_fn)
    # 新构建列(这里没有)                                                                  
    CONSTRUCTED_NUMERIC_FEATURES_NAMES = []
    
    if PROCESS_FEATURES:
        for feature_name in CONSTRUCTED_NUMERIC_FEATURES_NAMES:
            numeric_columns[feature_name] = tf.feature_column.numeric_column(feature_name)
    
    # 对类别型的列做独热向量编码
    categorical_column_with_vocabulary = \
        {item[0]: tf.feature_column.categorical_column_with_vocabulary_list(item[0], item[1])
         for item in CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.items()}
    
    # indicator列，multi-hot编码
    CONSTRUCTED_INDICATOR_FEATURES_NAMES = ['capital_indicator']
    
    categorical_column_with_identity = {}
    
    for feature_name in CONSTRUCTED_INDICATOR_FEATURES_NAMES: 
        categorical_column_with_identity[feature_name] = tf.feature_column.categorical_column_with_identity(feature_name, 
                                                                                                              num_buckets=2,
                                                                                                            default_value=0)
    # 类别型进行hash分桶映射                                                                                                          
    categorical_column_with_hash_bucket = \
        {item[0]: tf.feature_column.categorical_column_with_hash_bucket(item[0], item[1], dtype=tf.string)
         for item in CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE.items()}
        
    feature_columns = {}

    # 更新数值列                                                                                                        
    if numeric_columns is not None:
        feature_columns.update(numeric_columns)

    # 更新独热向量编码列
    if categorical_column_with_vocabulary is not None:
        feature_columns.update(categorical_column_with_vocabulary)
    
    # 更新label encoder列
    if categorical_column_with_identity is not None:
        feature_columns.update(categorical_column_with_identity)
                                                                                                            
    # 更新类别型hash分桶列    
    if categorical_column_with_hash_bucket is not None:
        feature_columns.update(categorical_column_with_hash_bucket)
                                                                                                            
    # 扩充tf产出的高级列
    if EXTEND_FEATURE_COLUMNS:
        feature_columns = extend_feature_columns(feature_columns, hparams)
                                                                                                            
    # 返回feature columns   
    return feature_columns

feature_columns = get_feature_columns(hparams={"num_buckets":5,"embedding_size":3})
print("Feature Columns: {}".format(feature_columns))

Feature Columns: {'age': _NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x10243c0d0>), 'education_num': _NumericColumn(key='education_num', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x10243c048>), 'capital_gain': _NumericColumn(key='capital_gain', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x10243cbf8>), 'capital_loss': _NumericColumn(key='capital_loss', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x10243c2f0>), 'hours_per_week': _NumericColumn(key='hours_per_week', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x10243c510>), 'gender': _VocabularyListCategoricalColumn(key='gender', vocabulary_list=('Female', 'Male

## 定义一个DNN  Estimator

### a. 获取宽度和深度的特征列

In [15]:
# 模型当中需要的宽度和深度特征
def get_wide_deep_columns():
    # 所有列名
    feature_columns = list(get_feature_columns(hparams).values())
    # 过滤出深度部分的特征
    dense_columns = list(
        filter(lambda column: isinstance(column, feature_column._NumericColumn) |
                              isinstance(column, feature_column._EmbeddingColumn),
               feature_columns
        )
    )
    # 过滤出类别型的特征
    categorical_columns = list(
        filter(lambda column: isinstance(column, feature_column._VocabularyListCategoricalColumn) |
                              isinstance(column, feature_column._IdentityCategoricalColumn) |
                              isinstance(column, feature_column._BucketizedColumn),
                   feature_columns)
    )
    # 稀疏特征(也是在wide部分的)
    sparse_columns = list(
        filter(lambda column: isinstance(column,feature_column._HashedCategoricalColumn) |
                              isinstance(column, feature_column._CrossedColumn),
               feature_columns)
    )
    # 指示列特征
    indicator_columns = list(
            map(lambda column: tf.feature_column.indicator_column(column),
                categorical_columns)
    )
    # 明确deep和wide部分需要的特征列
    deep_feature_columns = dense_columns + indicator_columns
    wide_feature_columns = categorical_columns + sparse_columns
    
    # 返回deep和wide部分的特征列
    return wide_feature_columns, deep_feature_columns

### b. 定义estimator

In [16]:
def create_DNNComb_estimator(run_config, hparams, print_desc=False):
    
    # 取到返回的特征列
    wide_feature_columns, deep_feature_columns = get_wide_deep_columns()
    
    # 构建宽度深度模型
    estimator = tf.estimator.DNNLinearCombinedClassifier(
        
        # 指定分类类别的个数
        n_classes=len(TARGET_LABELS),
        # 如果类别不是从0到n-1的n个连续整数，则需要指定不同类别(用一个list)
        label_vocabulary=TARGET_LABELS,
        
        # 定义宽度和深度列
        dnn_feature_columns = deep_feature_columns,
        linear_feature_columns = wide_feature_columns,
        
        # 定义样本权重列
        weight_column=WEIGHT_COLUMN_NAME,
        
        # 关于DNN隐层的一些设定
        dnn_hidden_units= hparams["hidden_units"],
        # 优化器的选择
        dnn_optimizer= tf.train.AdamOptimizer(),
        # 激活函数的选择
        dnn_activation_fn= tf.nn.relu,
        
        # 配置
        config= run_config
    )
    
    
    if print_desc:
        print("")
        print("预估器类型:")
        print("================")
        print(type(estimator))
        print("")
        print("深度部分的列名:")
        print("==============")
        print(deep_feature_columns)
        print("")
        print("宽度部分的列名:")
        print("=============")
        print(wide_feature_columns)
        print("")
    
    return estimator

## 6. 构建实验

### a. 设定参数与运行参数

In [17]:
TRAIN_SIZE = TRAIN_DATA_SIZE
NUM_EPOCHS = 100
BATCH_SIZE = 500
EVAL_AFTER_SEC = 60
TOTAL_STEPS = (TRAIN_SIZE/BATCH_SIZE)*NUM_EPOCHS

hparams  = {
    "num_epochs" : NUM_EPOCHS,
    "batch_size" : BATCH_SIZE,
    "embedding_size" : 4,
    "hidden_units" : [64, 32, 16],
    "max_steps" : TOTAL_STEPS}

model_dir = 'trained_models/{}'.format(MODEL_NAME)

run_config = tf.estimator.RunConfig(
    log_step_count_steps=5000,
    tf_random_seed=201805,
    model_dir=model_dir
)

print(hparams)
print("模型目录:", run_config.model_dir)
print("")
print("数据集大小:", TRAIN_SIZE)
print("Batch大小:", BATCH_SIZE)
print("每个Epoch的迭代次数:",TRAIN_SIZE/BATCH_SIZE)
print("总迭代次数:", TOTAL_STEPS)

{'num_epochs': 100, 'batch_size': 500, 'embedding_size': 4, 'hidden_units': [64, 32, 16], 'max_steps': 6512.2}
模型目录: trained_models/cenus-model-01

数据集大小: 32561
Batch大小: 500
每个Epoch的迭代次数: 65.122
总迭代次数: 6512.2


### b. 定义train_and_eval 需要的配置TrainSpec和EvaluSpec

In [18]:
train_spec = tf.estimator.TrainSpec(
    # 给的第一个是input_fn
    input_fn = lambda: csv_input_fn(
        TRAIN_DATA_FILES_PATTERN,
        mode = tf.estimator.ModeKeys.TRAIN,
        num_epochs=hparams["num_epochs"],
        batch_size=hparams["batch_size"]
    ),
    # 最大迭代次数
    max_steps=hparams["max_steps"],
    # 可以附加一些其他功能
    hooks=None
)

eval_spec = tf.estimator.EvalSpec(
    input_fn = lambda: csv_input_fn(
        TRAIN_DATA_FILES_PATTERN,
        mode=tf.estimator.ModeKeys.EVAL,
        num_epochs=1,
        batch_size=hparams["batch_size"],
            
    ),
    throttle_secs = EVAL_AFTER_SEC,
    steps=None
)

### c. 通过train_and_evaluate跑实验

In [19]:
if not RESUME_TRAINING:
    print("清除之前训练的结果...")
    shutil.rmtree(model_dir, ignore_errors=True)
else:
    print("从上一次训练重新加载继续训练...") 

    
tf.logging.set_verbosity(tf.logging.INFO)

time_start = datetime.now() 
print("训练与验证开始于{}".format(time_start.strftime("%H:%M:%S")))
print(".......................................") 

estimator = create_DNNComb_estimator(run_config, hparams, True)

tf.estimator.train_and_evaluate(
    estimator=estimator,
    train_spec=train_spec, 
    eval_spec=eval_spec
)

time_end = datetime.now() 
print(".......................................")
print("训练与验证结束于{}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("训练和验证实验总耗时{}秒".format(time_elapsed.total_seconds()))
    

清除之前训练的结果...
训练与验证开始于10:45:58
.......................................
INFO:tensorflow:Using config: {'_model_dir': 'trained_models/cenus-model-01', '_tf_random_seed': 201805, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 5000, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1023d7400>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

预估器类型:
<class 'tensorflow.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifier'>

深度部分的列名:
[_NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x1023ba6a8>), _NumericColumn(key='education_num', shape=(1,), defaul

INFO:tensorflow:loss = 27724134.0, step = 201 (0.505 sec)
INFO:tensorflow:loss = 26221192.0, step = 301 (0.500 sec)
INFO:tensorflow:loss = 30562312.0, step = 401 (0.500 sec)
INFO:tensorflow:loss = 26799344.0, step = 501 (0.493 sec)
INFO:tensorflow:loss = 28504848.0, step = 601 (0.489 sec)
INFO:tensorflow:loss = 31572938.0, step = 701 (0.495 sec)
INFO:tensorflow:loss = 34452524.0, step = 801 (0.493 sec)
INFO:tensorflow:loss = 24246322.0, step = 901 (0.518 sec)
INFO:tensorflow:loss = 24602954.0, step = 1001 (0.562 sec)
INFO:tensorflow:loss = 29845566.0, step = 1101 (0.509 sec)
INFO:tensorflow:loss = 26672250.0, step = 1201 (0.510 sec)
INFO:tensorflow:loss = 27849432.0, step = 1301 (0.512 sec)
INFO:tensorflow:loss = 27291776.0, step = 1401 (0.527 sec)
INFO:tensorflow:loss = 24043900.0, step = 1501 (0.555 sec)
INFO:tensorflow:loss = 30703954.0, step = 1601 (0.553 sec)
INFO:tensorflow:loss = 27262016.0, step = 1701 (0.515 sec)
INFO:tensorflow:loss = 26068582.0, step = 1801 (0.516 sec)
INFO:

## 评估模型

In [20]:
TRAIN_SIZE = TRAIN_DATA_SIZE
TEST_SIZE = TEST_DATA_SIZE

train_input_fn = lambda: csv_input_fn(file_names= TRAIN_DATA_FILES_PATTERN, 
                                      mode= tf.estimator.ModeKeys.EVAL,
                                      batch_size= TRAIN_SIZE)

test_input_fn = lambda: csv_input_fn(file_names= TEST_DATA_FILES_PATTERN, 
                                      mode= tf.estimator.ModeKeys.EVAL,
                                      batch_size= TEST_SIZE)

estimator = create_DNNComb_estimator(run_config, hparams)

train_results = estimator.evaluate(input_fn=train_input_fn, steps=1)
print()
print("######################################################################################")
print("# 训练结果指标: {}".format(train_results))
print("######################################################################################")

test_results = estimator.evaluate(input_fn=test_input_fn, steps=1)
print()
print("######################################################################################")
print("# 测试结果指标: {}".format(test_results))
print("######################################################################################")

INFO:tensorflow:Using config: {'_model_dir': 'trained_models/cenus-model-01', '_tf_random_seed': 201805, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 5000, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1023d7400>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

数据输入函数input_fn:
输入文件: adult_train.csv
Batch size: 32561
Epoch Count: None
模式: eval
Thread Count: 8
Shuffle: False

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-05-18-02:46:46
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from trained_models/cenus-model-01/model.ckpt-6513
INFO:tensorflow:Run

## 预测

In [21]:
import itertools

# 输入
predict_input_fn = lambda: csv_input_fn(TEST_DATA_FILES_PATTERN, 
                                      mode= tf.estimator.ModeKeys.PREDICT,
                                      batch_size= 10)


predictions = list(itertools.islice(estimator.predict(input_fn=predict_input_fn),10))

print("")
print("预测的类别: {}".format(list(map(lambda item: item["class_ids"][0]
    ,predictions))))

print("预测概率为: {}".format(list(map(lambda item: list(item["probabilities"])
    ,predictions))))


数据输入函数input_fn:
输入文件: adult_test.csv
Batch size: 10
Epoch Count: None
模式: infer
Thread Count: 8
Shuffle: False

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from trained_models/cenus-model-01/model.ckpt-6513
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.

预测的类别: [0, 0, 0, 1, 0, 0, 0, 1, 0, 0]
预测概率为: [[0.99999297, 6.992759e-06], [0.9547619, 0.045238085], [0.86959815, 0.13040185], [0.21073815, 0.7892619], [0.9999995, 5.3270975e-07], [0.99999905, 9.3406624e-07], [0.99990857, 9.1373935e-05], [0.42117104, 0.57882893], [0.9976412, 0.0023588594], [0.9993327, 0.00066724705]]
