In [24]:
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Example code for TensorFlow Wide & Deep Tutorial using TF.Learn API."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys
import tempfile

import pandas as pd
from six.moves import urllib
import tensorflow as tf


# Column names applied positionally by `input_fn` when it reads the CSV data
# (passed as `names=CSV_COLUMNS`), so the order here must match the column
# order in the files. "income_bracket" is the column `input_fn` converts into
# the label.
CSV_COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket"
]

# Categorical base columns with an explicit vocabulary: each listed string is
# a known category for the CSV column named by the first argument.
gender = tf.feature_column.categorical_column_with_vocabulary_list(
    "gender", ["Female", "Male"])
education = tf.feature_column.categorical_column_with_vocabulary_list(
    "education", [
        "Bachelors", "HS-grad", "11th", "Masters", "9th",
        "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
        "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
        "Preschool", "12th"
    ])
marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
    "marital_status", [
        "Married-civ-spouse", "Divorced", "Married-spouse-absent",
        "Never-married", "Separated", "Married-AF-spouse", "Widowed"
    ])
relationship = tf.feature_column.categorical_column_with_vocabulary_list(
    "relationship", [
        "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
        "Other-relative"
    ])
# The vocabulary includes "?", so that marker is treated as its own category.
workclass = tf.feature_column.categorical_column_with_vocabulary_list(
    "workclass", [
        "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
        "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
    ])
race = tf.feature_column.categorical_column_with_vocabulary_list(
    "race", [
        "White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"
    ])

# To show an example of hashing: no vocabulary is listed for these two columns;
# their string values are hashed into 1000 buckets instead.
occupation = tf.feature_column.categorical_column_with_hash_bucket(
    "occupation", hash_bucket_size=1000)
native_country = tf.feature_column.categorical_column_with_hash_bucket(
    "native_country", hash_bucket_size=1000)

# Continuous base columns.
age = tf.feature_column.numeric_column("age")
# NOTE: fnlwgt is defined here but is not part of any column list in this file
# (its entry in deep_columns is commented out).
fnlwgt = tf.feature_column.numeric_column("fnlwgt")
education_num = tf.feature_column.numeric_column("education_num")
capital_gain = tf.feature_column.numeric_column("capital_gain")
capital_loss = tf.feature_column.numeric_column("capital_loss")
hours_per_week = tf.feature_column.numeric_column("hours_per_week")

# Transformations.
# Splits the numeric age into ranges at the listed boundaries, giving a
# categorical view of age that is used in base_columns and in a feature cross.
age_buckets = tf.feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

# Wide columns and deep columns.
# base_columns: categorical columns fed (together with crossed_columns) to the
# linear "wide" model in build_estimator. Note that `race` is not in this list.
base_columns = [
    gender, education, marital_status, relationship, workclass, occupation,
    native_country, age_buckets,
]

# crossed_columns: feature crosses, each hashed into 1000 buckets. String
# entries refer to raw CSV column names; age_buckets is passed as a column
# object so the bucketized form of age is crossed.
crossed_columns = [
    tf.feature_column.crossed_column(
        ["education", "occupation"], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
        [age_buckets, "education", "occupation"], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
        ["native_country", "occupation"], hash_bucket_size=1000)
]

# deep_columns: dense inputs for the DNN models in build_estimator.
# Vocabulary-based categorical columns are wrapped in indicator_column, the
# hashed ones in 8-dimensional embedding_column, and the numeric columns are
# passed through unchanged.
deep_columns = [
    tf.feature_column.indicator_column(workclass),
    tf.feature_column.indicator_column(marital_status),
    tf.feature_column.indicator_column(education),
    tf.feature_column.indicator_column(gender),
    tf.feature_column.indicator_column(relationship),
    tf.feature_column.indicator_column(race),
    # To show an example of embedding
    tf.feature_column.embedding_column(native_country, dimension=8),
    tf.feature_column.embedding_column(occupation, dimension=8),
    # NOTE: fnlwgt is currently disabled as a deep feature:
#     fnlwgt,
    age,
    education_num,
    capital_gain,
    capital_loss,
    hours_per_week,
]

def _download_to_temp(url):
  """Downloads `url` into a new temporary file and returns that file's path.

  Args:
    url: Address of the file to fetch.

  Returns:
    Path of the temporary file holding the downloaded content. The file is
    not deleted automatically.

  Raises:
    Whatever `urllib.request.urlretrieve` raises; the temporary file is
    removed before the error propagates.
  """
  import os  # Local import: only needed to clean up after a failed download.

  # NamedTemporaryFile is used only to reserve a unique path. Close the handle
  # before downloading so urlretrieve is the sole writer of the file.
  temp_file = tempfile.NamedTemporaryFile(delete=False)
  temp_file.close()
  try:
    urllib.request.urlretrieve(url, temp_file.name)
  except Exception:
    # Do not leave an empty or partial file behind when the download fails.
    os.remove(temp_file.name)
    raise
  return temp_file.name


def maybe_download(train_data, test_data):
  """Maybe downloads the data sets and returns train and test file names.

  Args:
    train_data: Path to an existing training file. If falsy (e.g. ""), the
      UCI `adult.data` file is downloaded to a temporary file instead.
    test_data: Path to an existing test file. If falsy (e.g. ""), the UCI
      `adult.test` file is downloaded to a temporary file instead.

  Returns:
    A `(train_file_name, test_file_name)` tuple of paths.
  """
  # pylint: disable=line-too-long
  train_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
  test_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
  # pylint: enable=line-too-long

  if train_data:
    train_file_name = train_data
  else:
    train_file_name = _download_to_temp(train_url)
    print("Training data is downloaded to %s" % train_file_name)

  if test_data:
    test_file_name = test_data
  else:
    test_file_name = _download_to_temp(test_url)
    print("Test data is downloaded to %s" % test_file_name)

  return train_file_name, test_file_name


def build_estimator(model_dir, model_type):
  """Creates the estimator selected by `model_type`.

  Args:
    model_dir: Directory handed to the estimator as its `model_dir`.
    model_type: "wide" builds a `LinearClassifier`, "deep" builds a
      `DNNClassifier`; every other value builds a
      `DNNLinearCombinedClassifier`.

  Returns:
    The constructed (untrained) estimator.
  """
  if model_type == "wide":
    # Linear model over the categorical base columns plus the feature crosses.
    wide_columns = base_columns + crossed_columns
    return tf.estimator.LinearClassifier(
        feature_columns=wide_columns, model_dir=model_dir)

  if model_type == "deep":
    # Only this variant pins the TensorFlow random seed.
    # Earlier experiments left disabled here: an Adam optimizer
    # (learning_rate=0.001) and dropout of 0.4.
    seeded_config = tf.estimator.RunConfig(tf_random_seed=1234)
    return tf.estimator.DNNClassifier(
        hidden_units=[100, 75, 50, 25],
        feature_columns=deep_columns,
        model_dir=model_dir,
        config=seeded_config)

  # Fallback for any other model_type: combined wide-and-deep model whose
  # linear part uses only the crossed columns.
  return tf.estimator.DNNLinearCombinedClassifier(
      dnn_hidden_units=[100, 50],
      dnn_feature_columns=deep_columns,
      linear_feature_columns=crossed_columns,
      model_dir=model_dir)


def input_fn(data_file, num_epochs, shuffle):
  """Builds an estimator input function from a CSV file.

  Args:
    data_file: Path of the CSV file to read (opened through `tf.gfile`).
    num_epochs: Number of passes over the data, forwarded to
      `pandas_input_fn`.
    shuffle: Whether the rows are shuffled, forwarded to `pandas_input_fn`.

  Returns:
    The callable produced by `tf.estimator.inputs.pandas_input_fn`, yielding
    batches of 100 rows with an integer label that is 1 when the
    "income_bracket" value contains ">50K" and 0 otherwise.
  """
  # Read inside a `with` block so the file handle is closed as soon as the
  # data has been parsed instead of being leaked.
  with tf.gfile.Open(data_file) as csv_file:
    df_data = pd.read_csv(
        csv_file,
        names=CSV_COLUMNS,
        skipinitialspace=True,
        engine="python",
        # The first line of every file is dropped.
        # NOTE(review): presumably meant for a non-data first line; for a file
        # without one this discards the first record -- confirm.
        skiprows=1)
  # remove NaN elements
  df_data = df_data.dropna(how="any", axis=0)
  labels = df_data["income_bracket"].apply(lambda x: ">50K" in x).astype(int)
  # The feature frame still contains "income_bracket"; none of the feature
  # columns defined in this file reference it.
  return tf.estimator.inputs.pandas_input_fn(
      x=df_data,
      y=labels,
      batch_size=100,
      num_epochs=num_epochs,
      shuffle=shuffle,
      num_threads=5)


def train_and_eval(model_dir, model_type, train_steps, train_data, test_data):
  """Trains the selected model, then evaluates it on test and training data.

  Args:
    model_dir: Directory for model output; a fresh temporary directory is
      created when this is falsy (e.g. "").
    model_type: Model selector passed to `build_estimator`.
    train_steps: Number of training steps.
    train_data: Path to the training data, or falsy to download it.
    test_data: Path to the test data, or falsy to download it.

  Returns:
    A `(test_results, train_results)` tuple holding what `evaluate` returned
    for the test file and for the training file, so callers can inspect the
    metrics instead of having them discarded.
  """
  train_file_name, test_file_name = maybe_download(train_data, test_data)
  if not model_dir:
    model_dir = tempfile.mkdtemp()

  m = build_estimator(model_dir, model_type)
  # set num_epochs to None to get infinite stream of data.
  m.train(
      input_fn=input_fn(train_file_name, num_epochs=None, shuffle=True),
      steps=train_steps)
  # set steps to None to run evaluation until all data consumed.
  results = m.evaluate(
      input_fn=input_fn(test_file_name, num_epochs=1, shuffle=False),
      steps=None)
  # Evaluate on the training file as well, so the two can be compared.
  results_train = m.evaluate(
      input_fn=input_fn(train_file_name, num_epochs=1, shuffle=False),
      steps=None)
  return results, results_train


# Holder for parsed command-line flags. It stays None in this notebook because
# the argparse / `main` entry point below is commented out.
FLAGS = None


# def main(_):
#   train_and_eval(FLAGS.model_dir, FLAGS.model_type, FLAGS.train_steps,
#                  FLAGS.train_data, FLAGS.test_data)


# if __name__ == "__main__":
#   parser = argparse.ArgumentParser()
#   parser.register("type", "bool", lambda v: v.lower() == "true")
#   parser.add_argument(
#       "--model_dir",
#       type=str,
#       default="",
#       help="Base directory for output models."
#   )
#   parser.add_argument(
#       "--model_type",
#       type=str,
#       default="wide_n_deep",
#       help="Valid model types: {'wide', 'deep', 'wide_n_deep'}."
#   )
#   parser.add_argument(
#       "--train_steps",
#       type=int,
#       default=2000,
#       help="Number of training steps."
#   )
#   parser.add_argument(
#       "--train_data",
#       type=str,
#       default="",
#       help="Path to the training data."
#   )
#   parser.add_argument(
#       "--test_data",
#       type=str,
#       default="",
#       help="Path to the test data."
#   )
#   FLAGS, unparsed = parser.parse_known_args()
#   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)


In [25]:
# Local data directory. A raw string is used so the backslashes are taken
# literally; sequences such as "\S" or "\d" in a normal string literal are
# invalid escapes that Python warns about and will eventually reject.
# NOTE(review): this is an absolute, machine-specific Windows path.
DATA_DIR = r'D:\Study\Ostfold\MachineLearning\git\data'

TRAIN_FILE = DATA_DIR + '\\splitted_data.train'
VAL_FILE = DATA_DIR + '\\splitted_data.val'
TEST_FILE = DATA_DIR + '\\splitted_data.test'

In [26]:
# Deep model, 10000 steps; empty paths make train_and_eval download the data
# and use a temporary model directory.
train_and_eval(
    model_dir="",
    model_type="deep",
    train_steps=10000,
    train_data="",
    test_data="")

Training data is downloaded to C:\Users\ADMIN\AppData\Local\Temp\tmp3o52_0t6
Test data is downloaded to C:\Users\ADMIN\AppData\Local\Temp\tmpixtersyb
INFO:tensorflow:Using config: {'_save_summary_steps': 100, '_num_ps_replicas': 0, '_keep_checkpoint_every_n_hours': 10000, '_master': '', '_task_id': 0, '_save_checkpoints_secs': 600, '_keep_checkpoint_max': 5, '_model_dir': 'C:\\Users\\ADMIN\\AppData\\Local\\Temp\\tmpiede82s_', '_task_type': 'worker', '_log_step_count_steps': 100, '_service': None, '_save_checkpoints_steps': None, '_session_config': None, '_tf_random_seed': 1234, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000274D0413E10>, '_num_worker_replicas': 1}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\ADMIN\AppData\Local\Temp\tmpiede82s_\model.ckpt.
INFO:tensorflow:step = 1, loss = 876.6583
INFO:tensorflow:global_step/sec: 164.737
INFO:tensorflow:step = 101, loss = 40.2

INFO:tensorflow:global_step/sec: 198.923
INFO:tensorflow:step = 7501, loss = 35.709274 (0.510 sec)
INFO:tensorflow:global_step/sec: 197.871
INFO:tensorflow:step = 7601, loss = 28.936262 (0.495 sec)
INFO:tensorflow:global_step/sec: 197.158
INFO:tensorflow:step = 7701, loss = 31.582924 (0.506 sec)
INFO:tensorflow:global_step/sec: 213.027
INFO:tensorflow:step = 7801, loss = 31.568047 (0.471 sec)
INFO:tensorflow:global_step/sec: 199.318
INFO:tensorflow:step = 7901, loss = 24.28786 (0.503 sec)
INFO:tensorflow:global_step/sec: 170.148
INFO:tensorflow:step = 8001, loss = 34.43012 (0.586 sec)
INFO:tensorflow:global_step/sec: 217.641
INFO:tensorflow:step = 8101, loss = 30.641726 (0.471 sec)
INFO:tensorflow:global_step/sec: 182.36
INFO:tensorflow:step = 8201, loss = 37.40615 (0.538 sec)
INFO:tensorflow:global_step/sec: 214.168
INFO:tensorflow:step = 8301, loss = 35.108215 (0.472 sec)
INFO:tensorflow:global_step/sec: 218.009
INFO:tensorflow:step = 8401, loss = 27.450172 (0.467 sec)
INFO:tensorflo

In [28]:
# Combined wide-and-deep model, 2000 steps, on the local train/test split;
# the empty model_dir selects a temporary model directory.
train_and_eval(
    model_dir="",
    model_type="wide_n_deep",
    train_steps=2000,
    train_data=TRAIN_FILE,
    test_data=TEST_FILE)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_num_ps_replicas': 0, '_task_id': 0, '_keep_checkpoint_every_n_hours': 10000, '_num_worker_replicas': 1, '_master': '', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001B979C74AC8>, '_tf_random_seed': None, '_log_step_count_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_model_dir': 'C:\\Users\\ADMIN\\AppData\\Local\\Temp\\tmp_o07q5nm', '_save_summary_steps': 100, '_keep_checkpoint_max': 5, '_service': None, '_is_chief': True, '_task_type': 'worker'}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\ADMIN\AppData\Local\Temp\tmp_o07q5nm\model.ckpt.
INFO:tensorflow:step = 1, loss = 3060.7476
INFO:tensorflow:global_step/sec: 139.723
INFO:tensorflow:step = 101, loss = 53.771553 (0.731 sec)
INFO:tensorflow:global_step/sec: 204.522
INFO:tensorflow:step = 201, loss = 50.86369 (0.486 s