# 基于Width & Depth Model的人群收入分类预测
标签为收入分类，特征包括年龄、工作类型（公立私立）、教育程度（本科、硕士）、受教育年份、婚姻状况、家庭关系、人种、性别、每周工作时长...

## 1、引入工具库

In [1]:
# 高级文件处理模块
import shutil
import math
from datetime import datetime
# 多进程模块
import multiprocessing

import pandas as pd 
import numpy as np

import tensorflow as tf 
from tensorflow import data
from tensorflow.python.feature_column import feature_column

print(tf.__version__)

1.11.0


## 2、基本设定与数据读写

In [2]:
MODEL_NAME = 'D&W Model'
# 训练集、测试集文件名
TRAIN_DATA_FILES_PATTERN = 'adult_train.csv'
TEST_DATA_FILES_PATTERN = 'adult_test.csv'

# 一些开关设置    恢复训练 特征处理 扩展特征 多线程
RESUME_TRAINING = False
PROCESS_FEATURES = True
EXTEND_FEATURE_COLUMNS = True
MULTI_THREADING = True

In [3]:
data = pd.read_csv('adult_train.csv',header = None)
data.columns =['age','workclass','fnlwgt','education','education_num','marital_status',
              'occupation','relationship','race','gender','capital_gain',
              'capital_loss','hours_per_week','native_country','income_bracket']

In [4]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## 3、定义数据集的信息

In [5]:
# 数据集每个字段的名称
HEADER = ['age','workclass','fnlwgt','education','education_num','marital_status',
              'occupation','relationship','race','gender','capital_gain',
              'capital_loss','hours_per_week','native_country','income_bracket']
# 数据集默认值（数值型默认0，字符串型默认空串）
HEADER_DEFAULTS = [[0],[''],[0],[''],[0],[''],[''],[''],
                   [''],[''],[0],[0],[0],[''],['']]
# I 数值型的列
NUMERIC_FEATURE_NAMES = ['age','education_num','capital_gain','capital_loss',
                         'hours_per_week']
# II 类别型的列，同时把列的不同值列出来
# print(list(data['workclass'].unique()))
CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY = {
    'gender':list(data['gender'].unique()),
    'race':list(data['race'].unique()),
    'education':list(data['education'].unique()),
    'marital_status':list(data['marital_status'].unique()),
    'relationship':list(data['relationship'].unique()),
    'workclass':list(data['workclass'].unique())
}
print(CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY)

# III 分桶列，类别比较多时使用
CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE = {
    'occupation':50,
    'native_country':100
}

# 类别型列名list
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.keys())+list(CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE.keys())
    
# 总的列名list
FEATURE_NAMES  = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

# 目标列名
TARGET_NAME = 'income_bracket'

# 目标不同类别的取值
TARGET_LABELS = ['<=50K','>50K']

# 权重列
WEIGHT_COLUMN_NAME = 'fnlwgt'

# 没有用到的列
UNUSED_FEATURE_NAMES = list(set(HEADER) - set(FEATURE_NAMES) - 
                            {TARGET_NAME}-{WEIGHT_COLUMN_NAME})

print('全部列名：{}'.format(HEADER))
print('数值型的特征:{}'.format(NUMERIC_FEATURE_NAMES))
print('类别型的特征:{}'.format(CATEGORICAL_FEATURE_NAMES))
print('目标列：{}-不同的分类结果：{}'.format(TARGET_NAME,TARGET_LABELS))
print('没有用到的列:{}'.format(UNUSED_FEATURE_NAMES))

{'gender': ['Male', 'Female'], 'workclass': ['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov', 'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'], 'relationship': ['Not-in-family', 'Husband', 'Wife', 'Own-child', 'Unmarried', 'Other-relative'], 'marital_status': ['Never-married', 'Married-civ-spouse', 'Divorced', 'Married-spouse-absent', 'Separated', 'Married-AF-spouse', 'Widowed'], 'race': ['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other'], 'education': ['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', 'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school', '5th-6th', '10th', '1st-4th', 'Preschool', '12th']}
全部列名：['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income_bracket']
数值型的特征:['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
类别型的特征:['gen

## 4、做一些数据探索

In [6]:
train_data = pd.read_csv(TRAIN_DATA_FILES_PATTERN,header=None,names=HEADER)
train_data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education_num     32561 non-null int64
marital_status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
gender            32561 non-null object
capital_gain      32561 non-null int64
capital_loss      32561 non-null int64
hours_per_week    32561 non-null int64
native_country    32561 non-null object
income_bracket    32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [8]:
train_data.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [9]:
TRAIN_DATA_SIZE = train_data.shape[0]
test_data = pd.read_csv(TEST_DATA_FILES_PATTERN,skiprows=1)
TEST_DATA_SIZE = test_data.shape[0]

#### 计算数值列的统计指标用于变换

In [10]:
# 方便接下来做幅度缩放
means = train_data[NUMERIC_FEATURE_NAMES].mean(axis=0)
stdvs = train_data[NUMERIC_FEATURE_NAMES].std(axis=0)
maxs = train_data[NUMERIC_FEATURE_NAMES].max(axis=0)
mins = train_data[NUMERIC_FEATURE_NAMES].min(axis=0)
df_stats = pd.DataFrame({'mean':means,'stdv':stdvs,'max':maxs,'min':mins})
df_stats.head(15)

Unnamed: 0,max,mean,min,stdv
age,90,38.581647,17,13.640433
education_num,16,10.080679,1,2.57272
capital_gain,99999,1077.648844,0,7385.292085
capital_loss,4356,87.30383,0,402.960219
hours_per_week,99,40.437456,1,12.347429


#### 存储统计分析数据

In [11]:
df_stats.to_csv(path_or_buf = 'adult.stats.csv',header=True,index=True)

##  5、定义数据输入函数

###  a、解析csv与预处理逻辑

In [12]:
def parse_csv_row(csv_row):
    columns = tf.decode_csv(csv_row,record_defaults=HEADER_DEFAULTS)
    # 把tensor和对应的列名打包成字典
    features = dict(zip(HEADER,columns))
    # 取出无用的列
    for colum in UNUSED_FEATURE_NAMES:
        features.pop(column)
    # 取出目标列
    target = features.pop(TARGET_NAME)
    # 返回 字典+target序列形式
    return features,target

# 处理特征 butai ok
def process_features(features):
    # 判断，字典中新的key capital_indicator也同样对应一个tensor
    capital_indicator = features['capital_gain'] > features['capital_loss']
    features['capital_indicator'] = tf.cast(capital_indicator, dtype=tf.int32)
    # 返回feature字典
    return features

### b、数据输入函数

In [13]:
# 输入到estimator的数据解析函数
def csv_input_fn(file_names,mode=tf.estimator.ModeKeys.EVAL,
                skip_header_lines=0,
                num_epochs=None,
                batch_size=200):
    # 训练阶段用shuffle
    shuffle = True if mode == tf.estimator.ModeKeys.TRAIN else False
    # 多线程
    num_threads = multiprocessing.cpu_count() if MULTI_THREADING else 1
    # 输出信息
    print('\n')
    print('数据输入函数input_fn:')
    print('====================')
    print('输入文件：{}'.format(file_names))
    print('一批大小：{}'.format(batch_size))
    print('迭代的轮次：{}'.format(num_epochs))
    print('模式：{}'.format(mode))
    print('Thread Count:{}'.format(num_threads))
    print('Shuffle:{}'.format(shuffle))
    print('===================')
    
    dataset = tf.data.TextLineDataset(filenames=file_names)
    dataset = dataset.skip(skip_header_lines)
    # 乱序
    if shuffle:
        dataset  = dataset.shuffle(buffer_size = 2 * batch_size + 1)
    # 取一个batch    
    dataset = dataset.batch(batch_size)
    # 对数据进行解析
    dataset = dataset.map(lambda csv_row:parse_csv_row(csv_row),num_parallel_calls = num_threads)
    print('/n',dataset)
    # 如果需要多处理
    if PROCESS_FEATURES:
        dataset  = dataset.map(lambda features,target:(process_features(features),
                                            target),num_parallel_calls = num_threads)
    # 每个轮次完成后，重启dataset
    dataset  = dataset.repeat(num_epochs)
    iterator = dataset.make_one_shot_iterator()
    # 取出满足 特征字典+ 结果标签的值
    features,target = iterator.get_next()
    return features,target

In [14]:
features,target = csv_input_fn(file_names = ['adult_train.csv'])
print('csv文件的特征：{}'.format(list(features.keys())))
print('csv文件的标签：{}'.format(target))



数据输入函数input_fn:
输入文件：['adult_train.csv']
一批大小：200
迭代的轮次：None
模式：eval
Thread Count:4
Shuffle:False
/n <ParallelMapDataset shapes: ({relationship: (?,), gender: (?,), marital_status: (?,), native_country: (?,), education_num: (?,), education: (?,), fnlwgt: (?,), workclass: (?,), capital_loss: (?,), occupation: (?,), hours_per_week: (?,), capital_gain: (?,), race: (?,), age: (?,)}, (?,)), types: ({relationship: tf.string, gender: tf.string, marital_status: tf.string, native_country: tf.string, education_num: tf.int32, education: tf.string, fnlwgt: tf.int32, workclass: tf.string, capital_loss: tf.int32, occupation: tf.string, hours_per_week: tf.int32, capital_gain: tf.int32, race: tf.string, age: tf.int32}, tf.string)>
csv文件的特征：['relationship', 'gender', 'marital_status', 'capital_loss', 'native_country', 'education_num', 'education', 'fnlwgt', 'workclass', 'capital_indicator', 'occupation', 'hours_per_week', 'capital_gain', 'race', 'age']
csv文件的标签：Tensor("IteratorGetNext:15", shape=(?,)

### 6、定义特征列

#### a、对数值型变量做福都缩放

In [15]:
df_stats = pd.read_csv('adult.stats.csv',header=0,index_col=0)
# df_stats.head(10)
df_stats['feature_name'] = NUMERIC_FEATURE_NAMES
df_stats.head(10)

Unnamed: 0,max,mean,min,stdv,feature_name
age,90,38.581647,17,13.640433,age
education_num,16,10.080679,1,2.57272,education_num
capital_gain,99999,1077.648844,0,7385.292085,capital_gain
capital_loss,4356,87.30383,0,402.960219,capital_loss
hours_per_week,99,40.437456,1,12.347429,hours_per_week


#### b、构建不同的特征列（特征工程）

In [16]:
# 使用tf构建 高级特征
def extend_feature_columns(feature_columns,hparams):
    # 分桶：年龄
    age_buckets = tf.feature_column.bucketized_column(
        feature_columns['age'],boundaries=[18,25,30,35,40,45,50,55,60,65])
    # 特征交叉组合并hash分桶1:教育与职业
    education_x_occupation  = tf.feature_column.crossed_column(
        ['education','occupation'],hash_bucket_size = int(1e4))
    # 特征交叉组合并hash分桶：年龄与种族
    age_buckets_x_race  =  tf.feature_column.crossed_column(
        [age_buckets,feature_columns['race']],hash_bucket_size= int(1e4))
    # 特征交叉组合并hash分桶：国家与职业
    native_country_x_occupation = tf.feature_column.crossed_column(
        ['native_country','occupation'],hash_bucket_size = int(1e4))
    # 对类别类型做embedding:国家
    native_country_embedded = tf.feature_column.embedding_column(
        feature_columns['native_country'],dimension = hparams['embedding_size']
    )
    # 对类别类型做embedding:职业
    occupation_embedded = tf.feature_column.embedding_column(
        feature_columns['occupation'],dimension = hparams['embedding_size']
    )
    # 对交叉组合并分桶过后的进行embedding
    education_x_occupation_embedded = tf.feature_column.embedding_column(
        education_x_occupation,dimension=hparams['embedding_size']
    )
    native_country_x_occupation_embedded  = tf.feature_column.embedding_column(
        native_country_x_occupation,dimension = hparams['embedding_size']
    )
    # 构建feature columns
    feature_columns['age_buckets'] = age_buckets
    feature_columns['education_x_occupation']= education_x_occupation
    feature_columns['age_buckets_x_race'] = age_buckets_x_race
    feature_columns['native_country_x_occupation'] = native_country_x_occupation
    feature_columns['native_country_embedded'] = native_country_embedded
    feature_columns['occupation_embedded'] = occupation_embedded
    feature_columns['education_x_occupatio_embedded'] = education_x_occupation_embedded
    feature_columns['native_country_x_occupation_embedded'] = native_country_x_occupation_embedded
    #返回 feature_column字典
    return feature_columns
# 标准化
def standard_scaler(x,mean,stdv):
    return (x-mean)/(stdv)
# 最大最小值幅度缩放
def maxmin_scaler(x,max_value,min_value):
    return (x-min_value)/(max_value-min_value)
# 全部特征
def get_feature_columns(hparams):
    # 数值型的列
    numeric_columns = {}
    # 对数值型的列做幅度缩放（scaling）
    for feature_name in NUMERIC_FEATURE_NAMES:
        feature_mean = df_stats[df_stats.feature_name == feature_name]['mean'].values[0]
        feature_stdv = df_stats[df_stats.feature_name == feature_name]['stdv'].values[0]
        normalizer_fn = lambda x:standard_scaler(x,feature_mean,feature_stdv)
        numeric_columns[feature_name] = tf.feature_column.numeric_column(feature_name,
                                                        normalizer_fn = normalizer_fn)
    # 新构建列（这里没有）
    CONSTRUCTED_NUMERIC_FEATURES_NAMES = []
    
    if PROCESS_FEATURES:
        for feature_name in CONSTRUCTED_NUMERIC_FEATURES_NAMES:
            numeric_columns[feature_name] = tf.feature_column.numeric_column(feature_name)
        
    
    # 对类别型的列做独热向量编码 k、v
    categorical_column_with_vocabulary =\
    { item[0]:tf.feature_column.categorical_column_with_vocabulary_list(item[0],item[1])
      for item in CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.items()     
    }
    # indicator列 multi-hot编码
    CONSTRUCTED_INDICATOR_FEATURES_NAMES = ['capital_indicator']
    
    categorical_column_with_identity = {}
    
    for feature_name in CONSTRUCTED_INDICATOR_FEATURES_NAMES:
        categorical_column_with_identity[feature_name] = tf.feature_column.categorical_column_with_identity(
        feature_name,num_buckets=2,default_value=0)
    # 类别性进行hash分桶映射
    categorical_column_with_hash_bucket = \
        {item[0]:tf.feature_column.categorical_column_with_hash_bucket(item[0],item[1],
                    dtype=tf.string)for item in CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE.items()
        }
    feature_columns= {}
    # 更新数值列
    if numeric_columns is not None:
        feature_columns.update(numeric_columns)
    
    # 更新独热向量编码列
    if categorical_column_with_vocabulary is not None:
        feature_columns.update(categorical_column_with_vocabulary)
    
    # 更新label encoder列
    if categorical_column_with_identity is not None:
        feature_columns.update(categorical_column_with_identity)
    
    # 更新类别性hash分桶列
    if categorical_column_with_hash_bucket is not None:
        feature_columns.update(categorical_column_with_hash_bucket)
    
    # 扩充tf产出的高级列
    if EXTEND_FEATURE_COLUMNS:
        feature_columns = extend_feature_columns(feature_columns,hparams)
    # 返回feature columns
    return feature_columns

feature_columns = get_feature_columns(hparams={"num_buckets":5,"embedding_size":3})
print('Feature Columns:{}'.format(feature_columns))        
    

Feature Columns:{'native_country_x_occupation_embedded': _EmbeddingColumn(categorical_column=_CrossedColumn(keys=('native_country', 'occupation'), hash_bucket_size=10000, hash_key=None), dimension=3, combiner='mean', layer_creator=<function embedding_column.<locals>._creator at 0x7f56d108abf8>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True), 'marital_status': _VocabularyListCategoricalColumn(key='marital_status', vocabulary_list=('Never-married', 'Married-civ-spouse', 'Divorced', 'Married-spouse-absent', 'Separated', 'Married-AF-spouse', 'Widowed'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'education_num': _NumericColumn(key='education_num', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x7f56d108a9d8>), 'hours_per_week': _NumericColumn(key='hours_per_week', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at

### 7、定义一个DNN Estimator

#### a、获取宽度和深度的特征列

In [17]:
# 模型当中需要的宽度和深度特征
def get_wide_deep_columns():
    # 取出所有列名
    feature_columns = list(get_feature_columns(hparams).values())
    # 过滤出深度部分的特征
    dense_columns = list(
        filter(lambda column:isinstance(column,feature_column._NumericColumn)
              | isinstance(column,feature_column._EmbeddingColumn),
               feature_columns
              )
    )
    # 过滤类别型的特征
    categorical_columns = list(
        filter(lambda column:isinstance(column,feature_column._VocabularyListCategoricalColumn)
              | isinstance(column,feature_column._IdentityCategoricalColumn)
              | isinstance(column,feature_column._BucketizedColumn),
               feature_columns
              )
    )
    # 稀疏特征（wide）
    sparse_columns = list(
        filter(lambda column:isinstance(column,feature_column._HashedCategoricalColumn)
              | isinstance(column,feature_column._CrossedColumn),
               feature_columns
              )
    )
    # 指示器特征
    indicator_columns = list(
        map(lambda column: tf.feature_column.indicator_column(column),
            categorical_columns)
    )
    # 明确deep和wide部分需要的特征列
    deep_feature_columns = dense_columns + indicator_columns
    wide_feature_columns = categorical_columns + sparse_columns
    
    # 返回deep和wide部分的特征列
    return wide_feature_columns,deep_feature_columns

#### b、定义estimator

In [18]:
def create_DNNComb_estimator(run_config,hparams,print_desc=False):
    # 取得返回的特征列
    wide_feature_columns,deep_feature_columns = get_wide_deep_columns()
    
    # 构建宽度深度模型
    estimator = tf.estimator.DNNLinearCombinedClassifier(
        # 指定分类类别的个数
        n_classes = len(TARGET_LABELS),
        # 如果类别不是0 dao n-1 的n个连续的整数，则需要指定一下一个list
        label_vocabulary = TARGET_LABELS,
        
        # 定义宽度和深度列
        dnn_feature_columns = deep_feature_columns,
        linear_feature_columns = wide_feature_columns,
        # 定义样本权重列
        weight_column = WEIGHT_COLUMN_NAME,
        # 关于DNN隐层的一些设定
        dnn_hidden_units = hparams["hidden_units"],
        # 优化器选择
        dnn_optimizer = tf.train.AdamOptimizer(),
        # 激活函数的选择
        dnn_activation_fn = tf.nn.relu,
        # 配置
        config = run_config
    )
    
    if print_desc:
        print("")
        print("预估器类型：")
        print("================")
        print(type(estimator))
        print("")
        print("深度部分的列名：")
        print("================")
        print(deep_feature_columns)
        print("")
        print("宽度部分的列名：")
        print("================")
        print(wide_feature_columns)
        print("")
    return estimator

### 8、训练模型

#### a、设定参数与运行参数

In [19]:
TRAIN_SIZE = TRAIN_DATA_SIZE
NUM_EPOCHS = 100
BATCH_SIZE = 500
EVAL_AFTER_SEC = 60
TOTAL_STEPS =  (TRAIN_SIZE/BATCH_SIZE)*NUM_EPOCHS

hparams = {
    'num_epochs':NUM_EPOCHS,
    'batch_size':BATCH_SIZE,
    'embedding_size':4,
    'hidden_units':[64,32,16],
    'max_steps' : TOTAL_STEPS
}
model_dir = '{}'.format(MODEL_NAME)

run_config = tf.estimator.RunConfig(
    log_step_count_steps = 5000,
    tf_random_seed = 201805,
    model_dir = model_dir
)

print(hparams)
print('模型目录：',run_config.model_dir)
print('')
print('数据集大小：',TRAIN_SIZE)
print('Batch大小：',BATCH_SIZE)
print('每个轮次迭代的次数',TRAIN_SIZE/BATCH_SIZE)
print("总迭代次数：",TOTAL_STEPS)


{'num_epochs': 100, 'hidden_units': [64, 32, 16], 'max_steps': 6512.2, 'batch_size': 500, 'embedding_size': 4}
模型目录： D&W Model

数据集大小： 32561
Batch大小： 500
每个轮次迭代的次数 65.122
总迭代次数： 6512.2


###  b、定义train_and_eval 需要的配置TrainSpec和Evalue Spec 规范

In [20]:
train_spec = tf.estimator.TrainSpec(
    input_fn = lambda:csv_input_fn(
        TRAIN_DATA_FILES_PATTERN,
        mode = tf.estimator.ModeKeys.TRAIN,
        num_epochs = hparams['num_epochs'],
        batch_size = hparams['batch_size']
    ),
    # 最大迭代次数
    max_steps = hparams['max_steps'],
    # 其他功能，暂无
    hooks = None
)

eval_spec = tf.estimator.EvalSpec(
        input_fn = lambda:csv_input_fn(
            TRAIN_DATA_FILES_PATTERN,
            mode = tf.estimator.ModeKeys.EVAL,
            num_epochs = 1,
            batch_size = hparams['batch_size'],
        ),
        throttle_secs = EVAL_AFTER_SEC,
        steps = None
)

#### c、通过train_and_evaluate跑实验

In [22]:
if not RESUME_TRAINING:
    print('终止之前的训练以及结果...清除文件')
    shutil.rmtree(model_dir,ignore_errors=True)
else :
    print('回复，重新加载上次训练，继续进行...')

tf.logging.set_verbosity(tf.logging.INFO)

time_start = datetime.now()
print('训练与验证开始于{}'.format(time_start.strftime("%H:%M:%S")))
print('........')

estimator = create_DNNComb_estimator(run_config,hparams,True)

tf.estimator.train_and_evaluate(
    estimator = estimator,
    train_spec = train_spec,
    eval_spec = eval_spec
)

time_end = datetime.now()

print("..........................")
print('训练与验证结束于{}'.format(time_end.strftime('%H:%M:%S')))
print('')
time_elapsed = time_end - time_start
print('训练和验证实验耗时{}秒'.format(time_elapsed.total_seconds()))

终止之前的训练以及结果...清除文件
训练与验证开始于10:19:41
........
INFO:tensorflow:Using config: {'_keep_checkpoint_every_n_hours': 10000, '_service': None, '_master': '', '_protocol': None, '_model_dir': 'D&W Model', '_device_fn': None, '_evaluation_master': '', '_task_id': 0, '_global_id_in_cluster': 0, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_log_step_count_steps': 5000, '_save_summary_steps': 100, '_experimental_distribute': None, '_train_distribute': None, '_num_worker_replicas': 1, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_num_ps_replicas': 0, '_eval_distribute': None, '_tf_random_seed': 201805, '_task_type': 'worker', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f566da02748>}

预估器类型：
<class 'tensorflow.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifier'>

深度部分的列名：
[_EmbeddingColumn(categoric

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into D&W Model/model.ckpt.
INFO:tensorflow:loss = 126933144.0, step = 0
INFO:tensorflow:global_step/sec: 35.3353
INFO:tensorflow:loss = 26500464.0, step = 5000 (141.503 sec)
INFO:tensorflow:Saving checkpoints for 6513 into D&W Model/model.ckpt.


数据输入函数input_fn:
输入文件：adult_train.csv
一批大小：500
迭代的轮次：1
模式：eval
Thread Count:4
Shuffle:False
/n <ParallelMapDataset shapes: ({relationship: (?,), gender: (?,), marital_status: (?,), native_country: (?,), education_num: (?,), education: (?,), fnlwgt: (?,), workclass: (?,), capital_loss: (?,), occupation: (?,), hours_per_week: (?,), capital_gain: (?,), race: (?,), age: (?,)}, (?,)), types: ({relationship: tf.string, gender: tf.string, marital_status: tf.string, native_countr

### 9、评估模型

In [26]:
TRAIN_SIZE = TRAIN_DATA_SIZE
TEST_SIZE = TEST_DATA_SIZE

train_input_fn = lambda:csv_input_fn(file_names = TRAIN_DATA_FILES_PATTERN,
                                    mode = tf.estimator.ModeKeys.EVAL,
                                    batch_size = TRAIN_SIZE)
test_input_fn = lambda: csv_input_fn(file_names = TEST_DATA_FILES_PATTERN,
                                    mode = tf.estimator.ModeKeys.EVAL,
                                    batch_size = TEST_SIZE)

estimator = create_DNNComb_estimator(run_config,hparams)

train_results = estimator.evaluate(input_fn = train_input_fn,steps=1)
print("\n")
print("#################################")
print('#训练结果指标：{}'.format(train_results))
print('######################################')

test_results = estimator.evaluate(input_fn=test_input_fn,steps=1)
print('')
print("####################################")

test_results = estimator.evaluate(input_fn = test_input_fn,steps =1)
print('')
print('##########################################')
print('#测试结果指标：{}'.format(test_results))
print('##########################################')

INFO:tensorflow:Using config: {'_keep_checkpoint_every_n_hours': 10000, '_service': None, '_master': '', '_protocol': None, '_model_dir': 'D&W Model', '_device_fn': None, '_evaluation_master': '', '_task_id': 0, '_global_id_in_cluster': 0, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_log_step_count_steps': 5000, '_save_summary_steps': 100, '_experimental_distribute': None, '_train_distribute': None, '_num_worker_replicas': 1, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_num_ps_replicas': 0, '_eval_distribute': None, '_tf_random_seed': 201805, '_task_type': 'worker', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f56c83976a0>}


数据输入函数input_fn:
输入文件：adult_train.csv
一批大小：32561
迭代的轮次：None
模式：eval
Thread Count:4
Shuffle:False
/n <ParallelMapDataset shapes: ({relationship: (?,), gender: (?,), marital_status: (?

### 10、预测

In [28]:
import itertools
# 输入
predict_input_fn = lambda:csv_input_fn(TEST_DATA_FILES_PATTERN,
                                      mode = tf.estimator.ModeKeys.PREDICT,
                                      batch_size = 10)
predictions = list(itertools.islice(estimator.predict(input_fn = predict_input_fn),10))
print('')
print('预测的类别：{}'.format(list(map(lambda item: item['class_ids'][0],predictions))))
print('预测概率为：{}'.format(list(map(lambda item: list(item['probabilities']),predictions))))



数据输入函数input_fn:
输入文件：adult_test.csv
一批大小：10
迭代的轮次：None
模式：infer
Thread Count:4
Shuffle:False
/n <ParallelMapDataset shapes: ({relationship: (?,), gender: (?,), marital_status: (?,), native_country: (?,), education_num: (?,), education: (?,), fnlwgt: (?,), workclass: (?,), capital_loss: (?,), occupation: (?,), hours_per_week: (?,), capital_gain: (?,), race: (?,), age: (?,)}, (?,)), types: ({relationship: tf.string, gender: tf.string, marital_status: tf.string, native_country: tf.string, education_num: tf.int32, education: tf.string, fnlwgt: tf.int32, workclass: tf.string, capital_loss: tf.int32, occupation: tf.string, hours_per_week: tf.int32, capital_gain: tf.int32, race: tf.string, age: tf.int32}, tf.string)>
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from D&W Model/model.ckpt-6513
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.

预测的类别：[0, 0, 0, 