In [2]:
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Example code for TensorFlow Wide & Deep Tutorial using TF.Learn API."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys
import tempfile

import pandas as pd
from six.moves import urllib
import tensorflow as tf

# Absolute Windows paths of the training and validation splits.
# Raw string literals keep every backslash literal, so nothing in the path can
# be read as an escape sequence (e.g. "\n") and no invalid-escape warnings are
# raised; the resulting path strings are unchanged.
TRAIN_FILE = r'D:\Study\Ostfold\MachineLearning\git\data\normalized_continous_data.train'
VAL_FILE = r'D:\Study\Ostfold\MachineLearning\git\data\normalized_continous_data.val'

# Column names assigned positionally when a data file is parsed; the last one,
# "income_bracket", is the column prepare_data derives the labels from.
CSV_COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket"
]

def prepare_data(file):
    """Load one space-separated data file and derive binary income labels.

    Args:
        file: Path of the file to read; it is opened through
            ``tf.gfile.Open``.

    Returns:
        A ``(df_data, labels)`` tuple. ``df_data`` is the parsed DataFrame
        using the ``CSV_COLUMNS`` names, with every row that contains a NaN
        removed (the ``income_bracket`` column is still part of it).
        ``labels`` is an integer Series aligned with ``df_data``: 1 where
        ``income_bracket`` contains ">50K", 0 otherwise.
    """
    # The context manager closes the file handle as soon as pandas has parsed
    # it, instead of leaving it open until it happens to be garbage collected.
    with tf.gfile.Open(file) as data_file:
        df_data = pd.read_csv(
            data_file,
            names=CSV_COLUMNS,
            skipinitialspace=True,
            engine="python",
            sep=' ',
            # The first line of the file is a header row; it is replaced by
            # the names given above.
            header=0,
        )
    # Remove every row that has at least one NaN element.
    df_data = df_data.dropna(how="any", axis=0)
    labels = df_data["income_bracket"].apply(lambda x: ">50K" in x).astype(int)

    return df_data, labels

# Parse the training and validation splits into (features, labels) pairs.
train_x, train_y = prepare_data(TRAIN_FILE)
val_x, val_y = prepare_data(VAL_FILE)

# Preview the first five training rows followed by their labels.
display(train_x.head(5), train_y.head(5))


fc = tf.feature_column  # short alias for the feature-column constructors

# Categorical base columns with a fixed, known vocabulary.
gender = fc.categorical_column_with_vocabulary_list(
    "gender", vocabulary_list=["Female", "Male"])
education = fc.categorical_column_with_vocabulary_list(
    "education",
    vocabulary_list=[
        "Bachelors", "HS-grad", "11th", "Masters", "9th",
        "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
        "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
        "Preschool", "12th",
    ])
marital_status = fc.categorical_column_with_vocabulary_list(
    "marital_status",
    vocabulary_list=[
        "Married-civ-spouse", "Divorced", "Married-spouse-absent",
        "Never-married", "Separated", "Married-AF-spouse", "Widowed",
    ])
relationship = fc.categorical_column_with_vocabulary_list(
    "relationship",
    vocabulary_list=[
        "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
        "Other-relative",
    ])
workclass = fc.categorical_column_with_vocabulary_list(
    "workclass",
    vocabulary_list=[
        "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
        "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked",
    ])
race = fc.categorical_column_with_vocabulary_list(
    "race",
    vocabulary_list=[
        "White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black",
    ])

# Categorical base columns that are hashed into buckets instead of using a
# vocabulary (kept as an example of hashing; any of the vocabulary columns
# above could be declared the same way with hash_bucket_size=1000).
occupation = fc.categorical_column_with_hash_bucket(
    "occupation", hash_bucket_size=1000)
native_country = fc.categorical_column_with_hash_bucket(
    "native_country", hash_bucket_size=1000)

# Continuous base columns.
age, fnlwgt, education_num, capital_gain, capital_loss, hours_per_week = (
    fc.numeric_column(column_name)
    for column_name in (
        "age", "fnlwgt", "education_num",
        "capital_gain", "capital_loss", "hours_per_week",
    )
)

# Transformations.
# NOTE(review): these boundaries are expressed in years, while the data preview
# in this notebook shows standardized ages (e.g. 0.395, -1.287) - confirm the
# buckets are still meaningful for the normalized files.
age_buckets = fc.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

# Wide columns: sparse base columns plus feature crosses.
base_columns = [
    gender,
    education,
    marital_status,
    relationship,
    workclass,
    occupation,
    native_country,
    age_buckets,
]

crossed_columns = [
    fc.crossed_column(
        ["education", "occupation"], hash_bucket_size=1000),
    fc.crossed_column(
        [age_buckets, "education", "occupation"], hash_bucket_size=1000),
    fc.crossed_column(
        ["native_country", "occupation"], hash_bucket_size=1000),
]

# Deep columns: an 8-dimensional embedding of every categorical column,
# followed by the raw continuous columns.
deep_columns = [
    fc.embedding_column(categorical, dimension=8)
    for categorical in (
        workclass, marital_status, education, gender,
        relationship, race, native_country, occupation,
    )
] + [
    fnlwgt,
    age,
    education_num,
    capital_gain,
    capital_loss,
    hours_per_week,
]


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,0.395073,Self-emp-inc,-1.188562,Doctorate,2.295684,Married-civ-spouse,Prof-specialty,Husband,White,Male,-0.146129,-0.216157,1.588513,United-States,<=50K
1,0.02949,Local-gov,-1.164275,HS-grad,-0.416172,Never-married,Exec-managerial,Own-child,Amer-Indian-Eskimo,Female,-0.146129,-0.216157,-0.033129,United-States,<=50K
2,-1.286609,Private,-0.681046,HS-grad,-0.416172,Never-married,Farming-fishing,Own-child,White,Male,-0.146129,-0.216157,-1.330443,United-States,<=50K
3,0.833773,State-gov,-1.568646,Some-college,-0.028764,Married-civ-spouse,Adm-clerical,Husband,White,Male,-0.146129,-0.216157,-0.033129,United-States,>50K
4,-0.043626,Self-emp-not-inc,-0.239004,Prof-school,1.908276,Never-married,Prof-specialty,Not-in-family,White,Male,-0.146129,6.778748,0.372281,United-States,>50K


0    0
1    0
2    0
3    1
4    1
Name: income_bracket, dtype: int32

In [24]:
def input_fn(df_data, labels, num_epochs, shuffle):
    """Build an estimator input function that feeds a pandas DataFrame.

    Args:
        df_data: DataFrame of features, forwarded as ``x``.
        labels: Series of labels, forwarded as ``y``.
        num_epochs: Number of passes over the data; ``None`` yields an
            endless stream.
        shuffle: Whether the rows are shuffled while being read.

    Returns:
        The callable created by ``tf.estimator.inputs.pandas_input_fn``,
        configured for batches of 100 rows.
    """
    # Several reader threads are only used for shuffled (training) input.
    # The pandas_input_fn documentation asks for num_threads=1 whenever a
    # predictable, repeatable reading order is needed, as in evaluation and
    # prediction, so ordered input is read by a single thread.
    num_threads = 5 if shuffle else 1
    return tf.estimator.inputs.pandas_input_fn(
        x=df_data,
        y=labels,
        batch_size=100,
        num_epochs=num_epochs,
        shuffle=shuffle,
        num_threads=num_threads)

In [22]:
from numpy import random

# Random hyper-parameter search: every iteration samples a configuration,
# trains a fresh DNNClassifier for 1000 steps and prints its validation and
# training accuracy.

# Only FATAL TensorFlow messages are logged so the printed results stay readable.
tf.logging.set_verbosity(tf.logging.FATAL)
max_count = 100  # number of random configurations to try
hidden_set = [2048, 1024, 512, 256, 128, 64, 32, 16]  # candidate layer widths
for count in range(max_count):
    # Log-uniform learning rate between 1e-6 and 1e-2. The bounds are given as
    # (low, high): numpy documents the result as undefined when high < low.
    lr = 10**random.uniform(-6, -2)
    # Log-uniform dropout probability between 0.1 and 1.
    dropout = 10**random.uniform(-1, 0)
    # One to three hidden layers (the upper bound of randint is exclusive).
    layers = random.randint(1, 4)
    # Each layer width is drawn independently (with replacement) from
    # hidden_set, then the layers are ordered from widest to narrowest.
    # Values are converted to plain ints so they print and serialize cleanly.
    hidden_units = sorted(
        (int(units) for units in random.choice(hidden_set, size=layers)),
        reverse=True)

    m = tf.estimator.DNNClassifier(
        feature_columns=deep_columns,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        hidden_units=hidden_units,
        config=tf.estimator.RunConfig(tf_random_seed=1234),
        dropout=dropout,
        # One model directory per configuration; a raw string keeps the
        # backslashes of the Windows path literal.
        model_dir=r"D:\Study\Ostfold\MachineLearning\tmp\abcddfddfdf" + str(count))
    # set num_epochs to None to get infinite stream of data.
    m.train(input_fn=input_fn(train_x, train_y, num_epochs=None, shuffle=True), steps=1000)
    # set steps to None to run evaluation until all data consumed.
    print('({}) lr: {}, dropout: {}, hidden_units {},'.format(count, lr, dropout, hidden_units))
    results = m.evaluate(input_fn=input_fn(val_x, val_y, num_epochs=1, shuffle=False), steps=None)
    print("val_accuracy: %s" % results["accuracy"])
    results1 = m.evaluate(input_fn=input_fn(train_x, train_y, num_epochs=1, shuffle=False), steps=None)
    print("train_accuracy: %s" % results1["accuracy"])

(0) lr: 0.005500564618187616, dropout: 0.13038436512453885, hidden_units [64, 16],
val_accuracy: 0.846898
train_accuracy: 0.8639383
(1) lr: 0.00014841726138673487, dropout: 0.1176881205422854, hidden_units [256, 256],
val_accuracy: 0.84812653
train_accuracy: 0.8589473
(2) lr: 1.6675329836961138e-05, dropout: 0.19835961937703753, hidden_units [1024, 16],
val_accuracy: 0.8306204
train_accuracy: 0.84201634
(3) lr: 1.9868641213153382e-05, dropout: 0.11611540066668811, hidden_units [64],
val_accuracy: 0.75767815
train_accuracy: 0.7616232
(4) lr: 0.0002831476203614318, dropout: 0.12231881678598244, hidden_units [512, 256, 128],
val_accuracy: 0.8478194
train_accuracy: 0.86121243
(5) lr: 1.4934348368510973e-06, dropout: 0.48583768295412344, hidden_units [256, 32],
val_accuracy: 0.73341525
train_accuracy: 0.7344416
(6) lr: 1.4244590388285676e-06, dropout: 0.5547050320124399, hidden_units [64],
val_accuracy: 0.75184274
train_accuracy: 0.75455904
(7) lr: 0.000562287616651841, dropout: 0.265296077

val_accuracy: 0.8363022
train_accuracy: 0.8488886
(61) lr: 0.0004980944496446797, dropout: 0.34123847812827746, hidden_units [1024],
val_accuracy: 0.85181206
train_accuracy: 0.8630552
(62) lr: 0.0013069369457546155, dropout: 0.4496748813600915, hidden_units [256],
val_accuracy: 0.8467445
train_accuracy: 0.8627097
(63) lr: 3.4574133938590794e-05, dropout: 0.7306449594190022, hidden_units [256, 128],
val_accuracy: 0.79929364
train_accuracy: 0.8024341
(64) lr: 3.179279388927927e-05, dropout: 0.2227321939603044, hidden_units [128],
val_accuracy: 0.8114251
train_accuracy: 0.82055515
(65) lr: 5.7449310541126625e-05, dropout: 0.27632888143391315, hidden_units [256, 256, 128],
val_accuracy: 0.83937347
train_accuracy: 0.85219026
(66) lr: 1.4646983971802451e-05, dropout: 0.21623751174503245, hidden_units [2048, 512, 256],
val_accuracy: 0.8389128
train_accuracy: 0.8540331
(67) lr: 0.0008248508778177342, dropout: 0.39500441909634715, hidden_units [1024, 256, 16],
val_accuracy: 0.85058355
train_acc

In [25]:
from numpy import random

# Shorter random hyper-parameter search (10 configurations): every iteration
# samples a configuration, trains a fresh DNNClassifier for 1000 steps and
# prints its validation and training accuracy.

# Only FATAL TensorFlow messages are logged so the printed results stay readable.
tf.logging.set_verbosity(tf.logging.FATAL)
max_count = 10  # number of random configurations to try
hidden_set = [2048, 1024, 512, 256, 128, 64, 32, 16]  # candidate layer widths
for count in range(max_count):
    # Log-uniform learning rate between 1e-6 and 1e-2. The bounds are given as
    # (low, high): numpy documents the result as undefined when high < low.
    lr = 10**random.uniform(-6, -2)
    # Log-uniform dropout probability between 0.1 and 1.
    dropout = 10**random.uniform(-1, 0)
    # One to three hidden layers (the upper bound of randint is exclusive).
    layers = random.randint(1, 4)
    # Each layer width is drawn independently (with replacement) from
    # hidden_set, then the layers are ordered from widest to narrowest.
    # Values are converted to plain ints so they print and serialize cleanly.
    hidden_units = sorted(
        (int(units) for units in random.choice(hidden_set, size=layers)),
        reverse=True)

    m = tf.estimator.DNNClassifier(
        feature_columns=deep_columns,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        hidden_units=hidden_units,
        config=tf.estimator.RunConfig(tf_random_seed=1234),
        dropout=dropout,
        # One model directory per configuration; a raw string keeps the
        # backslashes of the Windows path literal.
        model_dir=r"D:\Study\Ostfold\MachineLearning\tmp\abcbmfdf" + str(count))
    # set num_epochs to None to get infinite stream of data.
    m.train(input_fn=input_fn(train_x, train_y, num_epochs=None, shuffle=True), steps=1000)
    # set steps to None to run evaluation until all data consumed.
    print('({}) lr: {}, dropout: {}, hidden_units {},'.format(count, lr, dropout, hidden_units))
    results = m.evaluate(input_fn=input_fn(val_x, val_y, num_epochs=1, shuffle=False), steps=None)
    print("val_accuracy: %s" % results["accuracy"])
    results1 = m.evaluate(input_fn=input_fn(train_x, train_y, num_epochs=1, shuffle=False), steps=None)
    print("train_accuracy: %s" % results1["accuracy"])

(0) lr: 0.001976463290256069, dropout: 0.25928638718890495, hidden_units [2048, 2048, 2048],
val_accuracy: 0.847973
train_accuracy: 0.86578107
(1) lr: 0.0001945002079770521, dropout: 0.2387614123625215, hidden_units [512],
val_accuracy: 0.8462838
train_accuracy: 0.8562598
(2) lr: 9.085397243827349e-05, dropout: 0.10471742458566484, hidden_units [256, 128, 16],
val_accuracy: 0.847973
train_accuracy: 0.85856336
(3) lr: 0.0012149627464687865, dropout: 0.2953217688392041, hidden_units [2048],
val_accuracy: 0.8452088
train_accuracy: 0.8620954
(4) lr: 0.0014362700672145502, dropout: 0.4096269831049758, hidden_units [16],
val_accuracy: 0.8461302
train_accuracy: 0.8562598
(5) lr: 0.0013669998998767994, dropout: 0.2619143853982457, hidden_units [1024, 512, 128],
val_accuracy: 0.8496622
train_accuracy: 0.8667409
(6) lr: 0.005257755576877597, dropout: 0.14011608172384438, hidden_units [64, 16],
val_accuracy: 0.8485872
train_accuracy: 0.86539716
(7) lr: 2.455762093774032e-06, dropout: 0.4877939555

In [17]:
# Raw string literal: the backslashes of the Windows path stay literal and no
# invalid-escape warnings are raised; the path itself is unchanged.
# NOTE(review): TEST_FILE points at the ".val" split, i.e. the same file as the
# validation data, which matches the "val_accuracy" label printed below -
# confirm whether a separate held-out test file was intended.
TEST_FILE = r'D:\Study\Ostfold\MachineLearning\git\data\normalized_continous_data.val'

test_x, test_y = prepare_data(TEST_FILE)
# NOTE(review): `m` is whichever estimator is currently bound in the kernel
# (in the visible cells, the last one built by a search loop), so this result
# depends on which cell was executed most recently.
results = m.evaluate(input_fn=input_fn(test_x, test_y, num_epochs=1, shuffle=False), steps=None)
print("val_accuracy: %s" % results["accuracy"])

INFO:tensorflow:Starting evaluation at 2018-03-07-22:08:05
INFO:tensorflow:Restoring parameters from D:\Study\Ostfold\MachineLearning\tmp\abcdddfhr0\model.ckpt-3000
INFO:tensorflow:Finished evaluation at 2018-03-07-22:08:09
INFO:tensorflow:Saving dict for global step 3000: accuracy = 0.84428746, accuracy_baseline = 0.7566032, auc = 0.8988572, auc_precision_recall = 0.756032, average_loss = 0.37128368, global_step = 3000, label/mean = 0.2433968, loss = 37.082806, prediction/mean = 0.25394177
val_accuracy: 0.84428746
