## UCI Adult Data Set 
### Dataset URL: https://archive.ics.uci.edu/ml/datasets/adult
Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset.

In [1]:
import shutil
import math
from datetime import datetime
import multiprocessing

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import data
from tensorflow.python.feature_column import feature_column

print(tf.__version__)

  return f(*args, **kwds)


1.4.1


In [2]:
MODEL_NAME = 'cenus-model-01'

TRAIN_DATA_FILES_PATTERN = 'data/adult.data.csv'
TEST_DATA_FILES_PATTERN = 'data/adult.test.csv'

RESUME_TRAINING = False
PROCESS_FEATURES = True
EXTEND_FEATURE_COLUMNS = True
MULTI_THREADING = True

## Define Dataset Metadata

In [3]:
HEADER = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
               'marital_status', 'occupation', 'relationship', 'race', 'gender',
               'capital_gain', 'capital_loss', 'hours_per_week',
               'native_country', 'income_bracket']

HEADER_DEFAULTS = [[0], [''], [0], [''], [0], [''], [''], [''], [''], [''],
                       [0], [0], [0], [''], ['']]

NUMERIC_FEATURE_NAMES = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY = {
    'gender': ['Female', 'Male'],
    
    'race': ['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'],
    
    'education': ['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', 
                  'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school', 
                  '5th-6th', '10th', '1st-4th', 'Preschool', '12th'],
    
    'marital_status': ['Married-civ-spouse', 'Divorced', 'Married-spouse-absent', 
                       'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'],
    
    'relationship': ['Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried', 'Other-relative'],
    
    'workclass': ['Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov', 'Local-gov', '?', 
                  'Self-emp-inc', 'Without-pay', 'Never-worked']
}

CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE = {
    'occupation': 50,
    'native_country' : 100
}

CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.keys()) + list(CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE.keys())

FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

TARGET_NAME = 'income_bracket'

TARGET_LABELS = ['<=50K', '>50K']

WEIGHT_COLUMN_NAME = 'fnlwgt'

UNUSED_FEATURE_NAMES = list(set(HEADER) - set(FEATURE_NAMES) - {TARGET_NAME})


print("Header: {}".format(HEADER))
print("Numeric Features: {}".format(NUMERIC_FEATURE_NAMES))
print("Categorical Features: {}".format(CATEGORICAL_FEATURE_NAMES))
print("Target: {} - labels: {}".format(TARGET_NAME, TARGET_LABELS))
print("Unused Features: {}".format(UNUSED_FEATURE_NAMES))

Header: ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income_bracket']
Numeric Features: ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
Categorical Features: ['gender', 'race', 'education', 'marital_status', 'relationship', 'workclass', 'occupation', 'native_country']
Target: income_bracket - labels: ['<=50K', '>50K']
Unused Features: []


## Load and Analyse Dataset

In [4]:
TRAIN_DATA_SIZE = 32561
TEST_DATA_SIZE = 16278

train_data = pd.read_csv(TRAIN_DATA_FILES_PATTERN, header=None, names=HEADER )
train_data.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [5]:
train_data.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


### Compute Scaling Statistics for Numeric Columns

In [6]:
means = train_data[NUMERIC_FEATURE_NAMES].mean(axis=0)
stdvs = train_data[NUMERIC_FEATURE_NAMES].std(axis=0)
maxs = train_data[NUMERIC_FEATURE_NAMES].max(axis=0)
mins = train_data[NUMERIC_FEATURE_NAMES].min(axis=0)
df_stats = pd.DataFrame({"mean":means, "stdv":stdvs, "max":maxs, "min":mins})
df_stats.head(15)

Unnamed: 0,max,mean,min,stdv
age,90,38.581647,17,13.640433
fnlwgt,1484705,189778.366512,12285,105549.977697
education_num,16,10.080679,1,2.57272
capital_gain,99999,1077.648844,0,7385.292085
capital_loss,4356,87.30383,0,402.960219
hours_per_week,99,40.437456,1,12.347429


### Save Scaling Statistics

In [7]:
df_stats.to_csv(path_or_buf="data/adult.stats.csv", header=True, index=True)

## Define Data Input Function


### a. Parsing and preprocessing logic

In [8]:
def parse_csv_row(csv_row):
    
    columns = tf.decode_csv(csv_row, record_defaults=HEADER_DEFAULTS)
    features = dict(zip(HEADER, columns))
    
    for column in UNUSED_FEATURE_NAMES:
        features.pop(column)
    
    target = features.pop(TARGET_NAME)

    return features, target

def process_features(features):

    capital_indicator = features['capital_gain'] > features['capital_loss']
    features['capital_indicator'] = tf.cast(capital_indicator, dtype=tf.int32)
    
    return features

### b. Data pipeline input function

In [9]:
def parse_label_column(label_string_tensor):
    table = tf.contrib.lookup.index_table_from_tensor(tf.constant(TARGET_LABELS))
    return table.lookup(label_string_tensor)

def csv_input_fn(files_name_pattern, mode=tf.estimator.ModeKeys.EVAL, 
                 skip_header_lines=0, 
                 num_epochs=None, 
                 batch_size=200):
    
    shuffle = True if mode == tf.estimator.ModeKeys.TRAIN else False
        
    num_threads = multiprocessing.cpu_count() if MULTI_THREADING else 1
     
    print("")
    print("* data input_fn:")
    print("================")
    print("Input file(s): {}".format(files_name_pattern))
    print("Batch size: {}".format(batch_size))
    print("Epoch Count: {}".format(num_epochs))
    print("Mode: {}".format(mode))
    print("Thread Count: {}".format(num_threads))
    print("Shuffle: {}".format(shuffle))
    print("================")
    print("")

    file_names = tf.matching_files(files_name_pattern)
    dataset = data.TextLineDataset(filenames=file_names)
    
    dataset = dataset.skip(skip_header_lines)
    
    if shuffle:
        dataset = dataset.shuffle(buffer_size=2 * batch_size + 1)
    
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(lambda csv_row: parse_csv_row(csv_row), 
                          num_parallel_calls=num_threads)
    
    if PROCESS_FEATURES:
        dataset = dataset.map(lambda features, target: (process_features(features), target), 
                              num_parallel_calls=num_threads)
        
    dataset = dataset.repeat(num_epochs)
    iterator = dataset.make_one_shot_iterator()
    
    features, target = iterator.get_next()
    return features, parse_label_column(target)

In [10]:
features, target = csv_input_fn(files_name_pattern="")
print("Features in CSV: {}".format(list(features.keys())))
print("Target in CSV: {}".format(target))


* data input_fn:
Input file(s): 
Batch size: 200
Epoch Count: None
Mode: eval
Thread Count: 4
Shuffle: False

Features in CSV: ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'capital_indicator']
Target in CSV: Tensor("hash_table_Lookup:0", shape=(?,), dtype=int64)


## Define Feature Columns
### a. Load scaling params

In [11]:
df_stats = pd.read_csv("data/adult.stats.csv", header=0, index_col=0)
df_stats['feature_name'] = NUMERIC_FEATURE_NAMES
df_stats.head(10)

Unnamed: 0,max,mean,min,stdv,feature_name
age,90,38.581647,17,13.640433,age
fnlwgt,1484705,189778.366512,12285,105549.977697,fnlwgt
education_num,16,10.080679,1,2.57272,education_num
capital_gain,99999,1077.648844,0,7385.292085,capital_gain
capital_loss,4356,87.30383,0,402.960219,capital_loss
hours_per_week,99,40.437456,1,12.347429,hours_per_week


### b. Create feature columns

In [12]:
def extend_feature_columns(feature_columns, hparams):
    
    age_buckets = tf.feature_column.bucketized_column(
      feature_columns['age'], boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    
    education_X_occupation = tf.feature_column.crossed_column(
     ['education', 'occupation'], hash_bucket_size=int(1e4))
    
    age_buckets_X_race = tf.feature_column.crossed_column(
     [age_buckets, feature_columns['race']], hash_bucket_size=int(1e4))
    
    native_country_X_occupation = tf.feature_column.crossed_column(
          ['native_country', 'occupation'], hash_bucket_size=int(1e4))
    
    native_country_embedded = tf.feature_column.embedding_column(
          feature_columns['native_country'], dimension=hparams.embedding_size)
    
    occupation_embedded = tf.feature_column.embedding_column(
          feature_columns['occupation'], dimension=hparams.embedding_size)
    
    education_X_occupation_embedded = tf.feature_column.embedding_column(
          education_X_occupation, dimension=hparams.embedding_size)
    
    native_country_X_occupation_embedded = tf.feature_column.embedding_column(
          native_country_X_occupation, dimension=hparams.embedding_size)
    
    
    feature_columns['age_buckets'] = age_buckets
    feature_columns['education_X_occupation'] = education_X_occupation
    feature_columns['age_buckets_X_race'] = age_buckets_X_race
    feature_columns['native_country_X_occupation'] = native_country_X_occupation
    feature_columns['native_country_embedded'] = native_country_embedded
    feature_columns['occupation_embedded'] = occupation_embedded
    feature_columns['education_X_occupation_embedded'] = education_X_occupation_embedded
    feature_columns['native_country_X_occupation_embedded'] = native_country_X_occupation_embedded
    
    return feature_columns

def standard_scaler(x, mean, stdv):
    return (x-mean)/(stdv)

def maxmin_scaler(x, max_value, min_value):
    return (x-min_value)/(max_value-min_value)  

def get_feature_columns(hparams):
    
    
    numeric_columns = {}
    
    for feature_name in NUMERIC_FEATURE_NAMES:

        feature_mean = df_stats[df_stats.feature_name == feature_name]['mean'].values[0]
        feature_stdv = df_stats[df_stats.feature_name == feature_name]['stdv'].values[0]
        normalizer_fn = lambda x: standard_scaler(x, feature_mean, feature_stdv)
        
        numeric_columns[feature_name] = tf.feature_column.numeric_column(feature_name, 
                                                                         normalizer_fn=normalizer_fn
                                                                        )
    CONSTRUCTED_NUMERIC_FEATURES_NAMES = []
    
    if PROCESS_FEATURES:
        for feature_name in CONSTRUCTED_NUMERIC_FEATURES_NAMES:
            numeric_columns[feature_name] = tf.feature_column.numeric_column(feature_name)
    
    categorical_column_with_vocabulary = \
        {item[0]: tf.feature_column.categorical_column_with_vocabulary_list(item[0], item[1])
         for item in CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.items()}
        
    CONSTRUCTED_INDICATOR_FEATURES_NAMES = ['capital_indicator']
    
    categorical_column_with_identity = {}
    
    for feature_name in CONSTRUCTED_INDICATOR_FEATURES_NAMES: 
        categorical_column_with_identity[feature_name] = tf.feature_column.categorical_column_with_identity(feature_name, 
                                                                                                              num_buckets=2,
                                                                                                              default_value=0)
    categorical_column_with_hash_bucket = \
        {item[0]: tf.feature_column.categorical_column_with_hash_bucket(item[0], item[1], dtype=tf.string)
         for item in CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE.items()}
        
    feature_columns = {}

    if numeric_columns is not None:
        feature_columns.update(numeric_columns)

    if categorical_column_with_vocabulary is not None:
        feature_columns.update(categorical_column_with_vocabulary)
        
    if categorical_column_with_identity is not None:
        feature_columns.update(categorical_column_with_identity)
        
    if categorical_column_with_hash_bucket is not None:
        feature_columns.update(categorical_column_with_hash_bucket)
    
    if EXTEND_FEATURE_COLUMNS:
        feature_columns = extend_feature_columns(feature_columns, hparams)
        
    return feature_columns

feature_columns = get_feature_columns(tf.contrib.training.HParams(num_buckets=5,embedding_size=3))
print("Feature Columns: {}".format(feature_columns))

Feature Columns: {'age': _NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x11f196510>), 'fnlwgt': _NumericColumn(key='fnlwgt', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x11f196378>), 'education_num': _NumericColumn(key='education_num', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x11f196598>), 'capital_gain': _NumericColumn(key='capital_gain', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x11f196620>), 'capital_loss': _NumericColumn(key='capital_loss', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x11f1966a8>), 'hours_per_week': _NumericColumn(key='hours_per_week', shape=(1,), default_value=None, dtype=tf.float32, 

## Define a DNN  Estimator Creation Function

### a. Get wide and deep feature columns

In [13]:
def get_wide_deep_columns():
    
    feature_columns = list(get_feature_columns(hparams).values())
    
    dense_columns = list(
        filter(lambda column: isinstance(column, feature_column._NumericColumn) |
                              isinstance(column, feature_column._EmbeddingColumn),
               feature_columns
        )
    )

    categorical_columns = list(
        filter(lambda column: isinstance(column, feature_column._VocabularyListCategoricalColumn) |
                              isinstance(column, feature_column._IdentityCategoricalColumn) |
                              isinstance(column, feature_column._BucketizedColumn),
                   feature_columns)
    )
    
    sparse_columns = list(
        filter(lambda column: isinstance(column,feature_column._HashedCategoricalColumn) |
                              isinstance(column, feature_column._CrossedColumn),
               feature_columns)
    )

    indicator_columns = list(
            map(lambda column: tf.feature_column.indicator_column(column),
                categorical_columns)
    )
    
    deep_feature_columns = dense_columns + indicator_columns
    wide_feature_columns = categorical_columns + sparse_columns
    
    return wide_feature_columns, deep_feature_columns

### b. Define the estimator

In [14]:
def create_DNNComb_estimator(run_config, hparams, print_desc=False):
    
    wide_feature_columns, deep_feature_columns = get_wide_deep_columns()
    
    estimator = tf.estimator.DNNLinearCombinedClassifier(
        
        n_classes=len(TARGET_LABELS),
        
        dnn_feature_columns = deep_feature_columns,
        linear_feature_columns = wide_feature_columns,
        
        weight_column=WEIGHT_COLUMN_NAME,
        
        dnn_hidden_units= hparams.hidden_units,
        
        dnn_optimizer= tf.train.AdamOptimizer(),
        
        dnn_activation_fn= tf.nn.relu,
        
        config= run_config
    )
    
    
    if print_desc:
        print("")
        print("*Estimator Type:")
        print("================")
        print(type(estimator))
        print("")
        print("*deep columns:")
        print("==============")
        print(deep_feature_columns)
        print("")
        print("wide columns:")
        print("=============")
        print(wide_feature_columns)
        print("")
    
    return estimator

## 6. Run Experiment

### a. Set HParam and RunConfig

In [15]:
TRAIN_SIZE = TRAIN_DATA_SIZE
NUM_EPOCHS = 1000
BATCH_SIZE = 500
EVAL_AFTER_SEC = 60
TOTAL_STEPS = (TRAIN_SIZE/BATCH_SIZE)*NUM_EPOCHS

hparams  = tf.contrib.training.HParams(
    num_epochs = NUM_EPOCHS,
    batch_size = BATCH_SIZE,
    embedding_size = 4,
    hidden_units= [64, 32, 16],
    max_steps = TOTAL_STEPS
)

model_dir = 'trained_models/{}'.format(MODEL_NAME)

run_config = tf.estimator.RunConfig(
    log_step_count_steps=5000,
    tf_random_seed=19830610,
    model_dir=model_dir
)

print(hparams)
print("Model Directory:", run_config.model_dir)
print("")
print("Dataset Size:", TRAIN_SIZE)
print("Batch Size:", BATCH_SIZE)
print("Steps per Epoch:",TRAIN_SIZE/BATCH_SIZE)
print("Total Steps:", TOTAL_STEPS)
print("That is 1 evaluation step after each",EVAL_AFTER_SEC," training seconds")

[('batch_size', 500), ('embedding_size', 4), ('hidden_units', [64, 32, 16]), ('max_steps', 65122.0), ('num_epochs', 1000)]
Model Directory: trained_models/cenus-model-01

Dataset Size: 32561
Batch Size: 500
Steps per Epoch: 65.122
Total Steps: 65122.0
That is 1 evaluation step after each 60  training seconds


### b. Define TrainSpec and EvaluSpec

In [16]:
train_spec = tf.estimator.TrainSpec(
    input_fn = lambda: csv_input_fn(
        TRAIN_DATA_FILES_PATTERN,
        mode = tf.estimator.ModeKeys.TRAIN,
        num_epochs=hparams.num_epochs,
        batch_size=hparams.batch_size
    ),
    max_steps=hparams.max_steps,
    hooks=None
)

eval_spec = tf.estimator.EvalSpec(
    input_fn = lambda: csv_input_fn(
        TRAIN_DATA_FILES_PATTERN,
        mode=tf.estimator.ModeKeys.EVAL,
        num_epochs=1,
        batch_size=hparams.batch_size,
            
    ),
    throttle_secs = EVAL_AFTER_SEC,
    steps=None
)

### c. Run Experiment via train_and_evaluate

In [17]:
if not RESUME_TRAINING:
    print("Removing previous artifacts...")
    shutil.rmtree(model_dir, ignore_errors=True)
else:
    print("Resuming training...") 

    
tf.logging.set_verbosity(tf.logging.INFO)

time_start = datetime.utcnow() 
print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................") 

estimator = create_DNNComb_estimator(run_config, hparams, True)

tf.estimator.train_and_evaluate(
    estimator=estimator,
    train_spec=train_spec, 
    eval_spec=eval_spec
)

time_end = datetime.utcnow() 
print(".......................................")
print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))
    

Removing previous artifacts...
Experiment started at 23:22:17
.......................................
INFO:tensorflow:Using config: {'_model_dir': 'trained_models/cenus-model-01', '_tf_random_seed': 19830610, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 5000, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11f1dd748>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

*Estimator Type:
<class 'tensorflow.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifier'>

*deep columns:
[_NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x11f1c28c8>), _NumericColumn(key='fnlwgt', shape=(1,), default_value=Non

INFO:tensorflow:loss = 1.04785e+08, step = 301 (0.639 sec)
INFO:tensorflow:loss = 5.595e+08, step = 401 (0.616 sec)
INFO:tensorflow:loss = 1.12901e+08, step = 501 (0.616 sec)
INFO:tensorflow:loss = 1.31533e+08, step = 601 (0.613 sec)
INFO:tensorflow:loss = 2.23135e+08, step = 701 (0.617 sec)
INFO:tensorflow:loss = 6.42206e+07, step = 801 (0.617 sec)
INFO:tensorflow:loss = 1.84873e+08, step = 901 (0.620 sec)
INFO:tensorflow:loss = 1.06171e+08, step = 1001 (0.613 sec)
INFO:tensorflow:loss = 2.0627e+08, step = 1101 (0.617 sec)
INFO:tensorflow:loss = 5.64006e+07, step = 1201 (0.633 sec)
INFO:tensorflow:loss = 9.15162e+07, step = 1301 (0.632 sec)
INFO:tensorflow:loss = 4.70419e+07, step = 1401 (0.631 sec)
INFO:tensorflow:loss = 5.90472e+07, step = 1501 (0.632 sec)
INFO:tensorflow:loss = 1.31396e+08, step = 1601 (0.629 sec)
INFO:tensorflow:loss = 2.99965e+08, step = 1701 (0.635 sec)
INFO:tensorflow:loss = 1.0285e+08, step = 1801 (0.633 sec)
INFO:tensorflow:loss = 1.35575e+08, step = 1901 (0.

INFO:tensorflow:loss = 3.00118e+07, step = 11796 (0.624 sec)
INFO:tensorflow:loss = 2.64916e+07, step = 11896 (0.617 sec)
INFO:tensorflow:loss = 3.18369e+07, step = 11996 (0.630 sec)
INFO:tensorflow:loss = 2.80614e+07, step = 12096 (0.618 sec)
INFO:tensorflow:global_step/sec: 157.031
INFO:tensorflow:loss = 2.5901e+07, step = 12196 (0.619 sec)
INFO:tensorflow:loss = 2.57176e+07, step = 12296 (0.616 sec)
INFO:tensorflow:loss = 2.73206e+07, step = 12396 (0.631 sec)
INFO:tensorflow:loss = 3.02439e+07, step = 12496 (0.628 sec)
INFO:tensorflow:loss = 2.61246e+07, step = 12596 (0.627 sec)
INFO:tensorflow:loss = 3.55542e+07, step = 12696 (0.616 sec)
INFO:tensorflow:loss = 3.5754e+07, step = 12796 (0.618 sec)
INFO:tensorflow:loss = 3.29661e+07, step = 12896 (0.617 sec)
INFO:tensorflow:loss = 3.8542e+07, step = 12996 (0.627 sec)
INFO:tensorflow:loss = 2.43692e+07, step = 13096 (0.619 sec)
INFO:tensorflow:loss = 2.99774e+07, step = 13196 (0.620 sec)
INFO:tensorflow:loss = 3.36655e+07, step = 1329

INFO:tensorflow:loss = 2.90322e+07, step = 20816 (0.626 sec)
INFO:tensorflow:loss = 2.78443e+07, step = 20916 (0.626 sec)
INFO:tensorflow:loss = 3.10652e+07, step = 21016 (0.743 sec)
INFO:tensorflow:loss = 3.27701e+07, step = 21116 (0.635 sec)
INFO:tensorflow:loss = 3.01198e+07, step = 21216 (0.628 sec)
INFO:tensorflow:loss = 2.8235e+07, step = 21316 (0.629 sec)
INFO:tensorflow:loss = 3.09576e+07, step = 21416 (0.624 sec)
INFO:tensorflow:loss = 3.06293e+07, step = 21516 (0.628 sec)
INFO:tensorflow:loss = 3.51966e+07, step = 21616 (0.621 sec)
INFO:tensorflow:loss = 3.16252e+07, step = 21716 (0.631 sec)
INFO:tensorflow:loss = 2.76478e+07, step = 21816 (0.622 sec)
INFO:tensorflow:loss = 2.93657e+07, step = 21916 (0.625 sec)
INFO:tensorflow:loss = 2.75563e+07, step = 22016 (0.623 sec)
INFO:tensorflow:loss = 3.05096e+07, step = 22116 (0.623 sec)
INFO:tensorflow:loss = 3.09714e+07, step = 22216 (0.620 sec)
INFO:tensorflow:loss = 3.63719e+07, step = 22316 (0.626 sec)
INFO:tensorflow:loss = 3.

INFO:tensorflow:loss = 2.61099e+07, step = 32102 (0.677 sec)
INFO:tensorflow:loss = 2.92053e+07, step = 32202 (0.695 sec)
INFO:tensorflow:loss = 2.97349e+07, step = 32302 (0.685 sec)
INFO:tensorflow:global_step/sec: 137.785
INFO:tensorflow:loss = 3.16498e+07, step = 32402 (0.690 sec)
INFO:tensorflow:loss = 2.7229e+07, step = 32502 (0.687 sec)
INFO:tensorflow:loss = 2.74572e+07, step = 32602 (0.701 sec)
INFO:tensorflow:loss = 3.11997e+07, step = 32702 (0.682 sec)
INFO:tensorflow:loss = 2.26085e+07, step = 32802 (0.714 sec)
INFO:tensorflow:loss = 3.19536e+07, step = 32902 (0.723 sec)
INFO:tensorflow:loss = 3.48984e+07, step = 33002 (0.686 sec)
INFO:tensorflow:loss = 3.47855e+07, step = 33102 (0.688 sec)
INFO:tensorflow:loss = 3.46503e+07, step = 33202 (0.705 sec)
INFO:tensorflow:loss = 2.96276e+07, step = 33302 (0.686 sec)
INFO:tensorflow:loss = 3.19864e+07, step = 33402 (0.695 sec)
INFO:tensorflow:loss = 3.73658e+07, step = 33502 (0.735 sec)
INFO:tensorflow:loss = 2.58759e+07, step = 33

INFO:tensorflow:Saving checkpoints for 41276 into trained_models/cenus-model-01/model.ckpt.
INFO:tensorflow:loss = 3.15346e+07, step = 41276
INFO:tensorflow:loss = 3.07187e+07, step = 41376 (1.262 sec)
INFO:tensorflow:loss = 3.25912e+07, step = 41476 (0.679 sec)
INFO:tensorflow:loss = 2.87063e+07, step = 41576 (0.678 sec)
INFO:tensorflow:loss = 2.66451e+07, step = 41676 (0.668 sec)
INFO:tensorflow:loss = 2.71033e+07, step = 41776 (0.828 sec)
INFO:tensorflow:loss = 2.96147e+07, step = 41876 (0.792 sec)
INFO:tensorflow:loss = 2.77691e+07, step = 41976 (0.750 sec)
INFO:tensorflow:loss = 2.71483e+07, step = 42076 (0.748 sec)
INFO:tensorflow:loss = 3.08104e+07, step = 42176 (0.739 sec)
INFO:tensorflow:loss = 3.14236e+07, step = 42276 (0.721 sec)
INFO:tensorflow:loss = 2.94911e+07, step = 42376 (0.679 sec)
INFO:tensorflow:loss = 2.97291e+07, step = 42476 (0.674 sec)
INFO:tensorflow:loss = 2.98922e+07, step = 42576 (0.707 sec)
INFO:tensorflow:loss = 2.6375e+07, step = 42676 (0.788 sec)
INFO:t

INFO:tensorflow:global_step/sec: 144.835
INFO:tensorflow:loss = 2.43618e+07, step = 52415 (0.676 sec)
INFO:tensorflow:loss = 2.48532e+07, step = 52515 (0.703 sec)
INFO:tensorflow:loss = 2.52918e+07, step = 52615 (0.685 sec)
INFO:tensorflow:loss = 3.10356e+07, step = 52715 (0.678 sec)
INFO:tensorflow:loss = 2.21702e+07, step = 52815 (0.698 sec)
INFO:tensorflow:loss = 3.20212e+07, step = 52915 (0.671 sec)
INFO:tensorflow:loss = 2.72997e+07, step = 53015 (0.697 sec)
INFO:tensorflow:loss = 3.22148e+07, step = 53115 (0.675 sec)
INFO:tensorflow:loss = 3.44765e+07, step = 53215 (0.674 sec)
INFO:tensorflow:loss = 2.34586e+07, step = 53315 (0.674 sec)
INFO:tensorflow:loss = 3.06289e+07, step = 53415 (0.676 sec)
INFO:tensorflow:loss = 3.4327e+07, step = 53515 (0.682 sec)
INFO:tensorflow:loss = 2.49605e+07, step = 53615 (0.676 sec)
INFO:tensorflow:loss = 2.92301e+07, step = 53715 (0.688 sec)
INFO:tensorflow:loss = 3.29257e+07, step = 53815 (0.664 sec)
INFO:tensorflow:loss = 2.91582e+07, step = 53

INFO:tensorflow:loss = 2.88309e+07, step = 61509 (0.633 sec)
INFO:tensorflow:loss = 2.54539e+07, step = 61609 (0.620 sec)
INFO:tensorflow:loss = 2.73787e+07, step = 61709 (0.625 sec)
INFO:tensorflow:loss = 2.63387e+07, step = 61809 (0.621 sec)
INFO:tensorflow:loss = 2.84528e+07, step = 61909 (0.626 sec)
INFO:tensorflow:loss = 2.86117e+07, step = 62009 (0.623 sec)
INFO:tensorflow:loss = 2.8486e+07, step = 62109 (0.627 sec)
INFO:tensorflow:loss = 2.67e+07, step = 62209 (0.619 sec)
INFO:tensorflow:loss = 2.44365e+07, step = 62309 (0.626 sec)
INFO:tensorflow:loss = 2.4249e+07, step = 62409 (0.623 sec)
INFO:tensorflow:loss = 2.71835e+07, step = 62509 (0.744 sec)
INFO:tensorflow:loss = 2.85429e+07, step = 62609 (0.704 sec)
INFO:tensorflow:loss = 2.65406e+07, step = 62709 (1.085 sec)
INFO:tensorflow:loss = 3.06317e+07, step = 62809 (0.985 sec)
INFO:tensorflow:loss = 2.47769e+07, step = 62909 (0.687 sec)
INFO:tensorflow:loss = 2.6125e+07, step = 63009 (0.644 sec)
INFO:tensorflow:loss = 3.04107

## Evaluate the Model

In [18]:
TRAIN_SIZE = TRAIN_DATA_SIZE
TEST_SIZE = TEST_DATA_SIZE

train_input_fn = lambda: csv_input_fn(files_name_pattern= TRAIN_DATA_FILES_PATTERN, 
                                      mode= tf.estimator.ModeKeys.EVAL,
                                      batch_size= TRAIN_SIZE)

test_input_fn = lambda: csv_input_fn(files_name_pattern= TEST_DATA_FILES_PATTERN, 
                                      mode= tf.estimator.ModeKeys.EVAL,
                                      batch_size= TEST_SIZE)

estimator = create_DNNComb_estimator(run_config, hparams)

train_results = estimator.evaluate(input_fn=train_input_fn, steps=1)
print()
print("######################################################################################")
print("# Train Measures: {}".format(train_results))
print("######################################################################################")

test_results = estimator.evaluate(input_fn=test_input_fn, steps=1)
print()
print("######################################################################################")
print("# Test Measures: {}".format(test_results))
print("######################################################################################")

INFO:tensorflow:Using config: {'_model_dir': 'trained_models/cenus-model-01', '_tf_random_seed': 19830610, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 5000, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11f1dd748>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

* data input_fn:
Input file(s): data/adult.data.csv
Batch size: 32561
Epoch Count: None
Mode: eval
Thread Count: 4
Shuffle: False

INFO:tensorflow:Starting evaluation at 2017-12-18-23:34:38
INFO:tensorflow:Restoring parameters from trained_models/cenus-model-01/model.ckpt-65122
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2017-12-18-23:34:41
INFO:tensorflow:Saving dict for global step 65122: accuracy = 0.863087, 

## Prediction

In [19]:
import itertools

predict_input_fn = lambda: csv_input_fn(TEST_DATA_FILES_PATTERN, 
                                      mode= tf.estimator.ModeKeys.PREDICT,
                                      batch_size= 10)


predictions = list(itertools.islice(estimator.predict(input_fn=predict_input_fn),10))

print("")
print("* Predicted Classes: {}".format(list(map(lambda item: item["class_ids"][0]
    ,predictions))))

print("* Predicted Probabilities: {}".format(list(map(lambda item: list(item["probabilities"])
    ,predictions))))


* data input_fn:
Input file(s): data/adult.test.csv
Batch size: 10
Epoch Count: None
Mode: infer
Thread Count: 4
Shuffle: False

INFO:tensorflow:Restoring parameters from trained_models/cenus-model-01/model.ckpt-65122

* Predicted Classes: [0, 0, 0, 1, 0, 0, 0, 1, 0, 0]
* Predicted Probabilities: [[0.99751943, 0.002480583], [0.93063581, 0.06936422], [0.68528271, 0.31471729], [0.096927784, 0.90307224], [0.99360067, 0.006399354], [0.99461657, 0.0053833956], [0.99565083, 0.0043491907], [0.37247515, 0.62752485], [0.9979977, 0.0020023116], [0.94563544, 0.054364558]]
