## Feature Engineering using Tensorflow

source: https://github.com/tensorflow/workshops/blob/master/extras/archive/07_structured_data.ipynb

youtube: https://www.youtube.com/watch?v=d12ra3b_M-0&list=PLOU2XLYxmsIIuiBfYad6rFYQU_jL2ryal&index=9

In [2]:
from __future__ import print_function
import numpy as np
import pandas as pd
import tensorflow as tf
print('you have: ', tf.__version__)

you have:  2.1.0


In [5]:
#loading data
census_train_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
census_test_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
census_train_path = tf.keras.utils.get_file('census.train', census_train_url)
census_test_path = tf.keras.utils.get_file('census.test', census_test_url)

Downloading data from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
Downloading data from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test


In [6]:
#from names file, get the feature names
column_names = [
  'age', 'workclass', 'fnlwgt', 'education', 'education-num',
  'marital-status', 'occupation', 'relationship', 'race', 'gender',
  'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
  'income'
]

In [10]:
#reading using pandas
# Notes
# 1) We provide the header from above.
# 2) The test file has a line we want to disgard at the top, so we include the parameter 'skiprows=1'
census_train = pd.read_csv(census_train_path, index_col=False, names=column_names) 
census_test = pd.read_csv(census_test_path, skiprows=1, index_col=False, names=column_names) 

# Drop any rows that have missing elements
# Of course there are other ways to handle missing data, but we'll
# take the simplest approach here.
census_train = census_train.dropna(how="any", axis=0)
census_test = census_test.dropna(how="any", axis=0)

census_train.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [11]:
# Separate the label we want to predict into its own object 
# At the same time, we'll convert it into true/false to fix the formatting error
census_train_label = census_train.pop('income').apply(lambda x: ">50K" in x)
census_test_label = census_test.pop('income').apply(lambda x: ">50K" in x)

In [12]:
print ("Training examples: %d" % census_train.shape[0])
print ("Training labels: %d" % census_train_label.shape[0])
print()
print ("Test examples: %d" % census_test.shape[0])
print ("Test labels: %d" % census_test_label.shape[0])

Training examples: 32561
Training labels: 32561

Test examples: 16281
Test labels: 16281


In [15]:
census_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [16]:
census_train_label.head()

0    False
1    False
2    False
3    False
4    False
Name: income, dtype: bool

## Estimators and input functions

In [37]:
def create_train_input_fn():
    return tf.compat.v1.estimator.inputs.pandas_input_fn(
    x = census_train,
    y = census_train_label,
    batch_size = 32,
    num_epochs = None, #repeat forever
    shuffle = True)
def create_test_input_fn():
    return tf.compat.v1.estimator.inputs.pandas_input_fn(
    x = census_test,
    y = census_test_label,
    num_epochs = 1, #just one epoch
    shuffle = False) #don't need to shuffle

In [38]:
#feature engineering
#A list of feature columns we'll use to train the linear model
feature_columns = []

In [39]:
#first we use the raw numeric value of age
age = tf.feature_column.numeric_column('age')
feature_columns.append(age)

age_buckets = tf.feature_column.bucketized_column(
    age, 
    boundaries = [31,46,60,75,90]) #specify ranges
#or
#age_buckets = tf.feature_column.bucketized_column(
#   age, list(range(10))
#   )

In [40]:
#here's a categorical column
#we're specifying the possible values
education = tf.feature_column.categorical_column_with_vocabulary_list(
    "education", [
        "Bachelors", "HS-grad", "11th", "Masters", "9th",
        "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
        "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
        "Preschool", "12th"      
    ])
feature_columns.append(education)

In [41]:
#A categorical feature with a possibly large number of values
#and the vocabulary not specified in advance
native_country = tf.feature_column.categorical_column_with_hash_bucket(
    'native-country', 1000)
feature_columns.append(native_country)

In [42]:
#now create a crossed column for age and education
age_cross_education = tf.feature_column.crossed_column(
    [age_buckets, education],
    hash_bucket_size = int(1e4)
)
feature_columns.append(age_cross_education)

In [43]:
#train a canned linear estimator
train_input_fn = create_train_input_fn()
estimator = tf.estimator.LinearClassifier(feature_columns, n_classes=2)
estimator.train(train_input_fn, steps = 1000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\amahbub\\AppData\\Local\\Temp\\1\\tmpu5b2w428', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x21356efa8c8>

In [44]:
#evaluate
test_input_fn = create_test_input_fn()
estimator.evaluate(test_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-07-01T09:22:45Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\amahbub\AppData\Local\Temp\1\tmpu5b2w428\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 1.13124s
INFO:tensorflow:Finished evaluation at 2020-07-01-09:22:46
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.76377374, accuracy_baseline = 0.76377374, auc = 0.6807592, auc_precision_recall = 0.35556182, average_loss = 0.4970679, global_step = 1000, label/mean = 0.23622628, loss = 0.4963402, precision = 0.0, prediction/mean = 0.20434779, recall = 0.0
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: C:\Users\amahbub\AppData\Local\Temp\1\tmpu5b2w428\model.ckpt-1000


{'accuracy': 0.76377374,
 'accuracy_baseline': 0.76377374,
 'auc': 0.6807592,
 'auc_precision_recall': 0.35556182,
 'average_loss': 0.4970679,
 'label/mean': 0.23622628,
 'loss': 0.4963402,
 'precision': 0.0,
 'prediction/mean': 0.20434779,
 'recall': 0.0,
 'global_step': 1000}

In [49]:
#prediction comparison
#reinitialize the input function
test_input_fn = create_test_input_fn()

predictions = estimator.predict(test_input_fn)
i = 0
for prediction in predictions:
    true_label = census_test_label[i]
    predicted_label = prediction['class_ids'][0]
    #print(prediction)
    print("Example %d. Actual: %d, Predicted: %d" % (i, true_label,
                                                    predicted_label))
    i += 1
    if i == 5: break

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\amahbub\AppData\Local\Temp\1\tmpu5b2w428\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Example 0. Actual: 0, Predicted: 0
Example 1. Actual: 0, Predicted: 0
Example 2. Actual: 1, Predicted: 0
Example 3. Actual: 1, Predicted: 0
Example 4. Actual: 0, Predicted: 0
