## Binary Clossification Example

#### Step 1) Data importing

In [2]:
import tensorflow as tf
import pandas as pd

In [3]:
## Define path data
COLUMNS = ['age','workclass', 'fnlwgt', 'education', 'education_num', 'marital',
           'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
           'hours_week', 'native_country', 'label']
PATH = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
PATH_test = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"

In [4]:
# already ddivided between a trian and test set
df_train = pd.read_csv(PATH, skipinitialspace=True, names = COLUMNS, index_col=False)
df_test = pd.read_csv(PATH_test,skiprows = 1, skipinitialspace=True, names = COLUMNS, index_col=False)

In [5]:
print(df_train.shape, df_test.shape)
print(df_train.dtypes)

(32561, 15) (16281, 15)
age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital           object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_week         int64
native_country    object
label             object
dtype: object


In [6]:
# we need to convert the value to numerical 
label = {'<=50K':0, '>50K': 1}
df_train.label = [label[item] for item in df_train.label]
label_t = {'<=50K.':0,'>50K.':1}
df_test.label = [label_t[item] for item in df_test.label]


In [7]:
print(df_train["label"].value_counts())
### the model will be correct in atleast 70% of the case
print(df_test["label"].value_counts())
## Unbalanced label
print(df_train.dtypes)


0    24720
1     7841
Name: label, dtype: int64
0    12435
1     3846
Name: label, dtype: int64
age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital           object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_week         int64
native_country    object
label              int64
dtype: object


#### Step 2) Data Conversion

In [8]:
## Add features to the bucket:
### Define continuous list
CONTI_FEATURES  = ['age', 'fnlwgt','capital_gain', 'education_num', 'capital_loss', 'hours_week']
### Define the categorical list
CATE_FEATURES = ['workclass', 'education', 'marital', 'occupation', 'relationship', 'race', 'sex', 'native_country']

In [9]:
def print_transformation(feature = "age", continuous = True, size = 2):
    # X = fc.numeric_column(feature)
    ## Create feature name
    feature_names = [feature]
    
    ## Create dict with the data
    d = dict(zip(feature_names, [df_train[feature]]))
    
    ## Convert age
    if continuous == True:
        c = tf.feature_column.numeric_column(feature)
        feature_columns = [c]
    else:
        c = tf.feature_column.categorical_column_with_hash_bucket(feature, hash_bucket_size=size)
        c_indicator = tf.feature_column.indicator_column(c)
        feature_columns = [c_indicator]
    ## Use input_layer to print the value
    input_layer = tf.feature_column.input_layer(
        features=d,
        feature_columns=feature_columns
    )
    ## Create lookup table
    zero = tf.constant(0, dtype=tf.float32)
    where = tf.not_equal(input_layer, zero)
    
    ## Return lookup table
    indices = tf.where(where)
    values = tf.gather_nd(input_layer, indices)
    
    # Initiate graph
    sess = tf.Session()
    
    # print value 
    print(sess.run(input_layer))
    
print_transformation(feature="age", continuous= True)
        

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
[[39.]
 [50.]
 [38.]
 ...
 [58.]
 [22.]
 [52.]]


In [10]:
continuous_features = [tf.feature_column.numeric_column(k) for k in CONTI_FEATURES]

In [11]:
print_transformation(feature="sex", continuous=False, size = 2)

relationship = tf.feature_column.categorical_column_with_vocabulary_list(
    'relationship', [
        'Husband', 'Not-in-family','Wife','Own-child','Unmarried','Other-relative'
    ]

)

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [0. 1.]
 [1. 0.]
 [0. 1.]]


In [12]:
categorical_features = [tf.feature_column.categorical_column_with_hash_bucket(k, hash_bucket_size=1000) for k in CATE_FEATURES]

#### Step 3) Train the classifier

In [13]:
model = tf.estimator.LinearClassifier(
    n_classes=2,
    model_dir="ongoing/train",
    feature_columns=categorical_features+continuous_features
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'ongoing/train', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f6a4ade28d0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [14]:
FEATURES_imp = ['age','workclass', 'education', 'education_num', 'marital',
                'occupation', 'relationship', 'race', 'sex', 'native_country', 'new']

def get_input_fn(data_set, num_epochs=None, n_batch = 128, shuffle=True):
    return tf.estimator.inputs.pandas_input_fn(
       x=pd.DataFrame({k: data_set[k].values for k in FEATURES_imp}),
       y = pd.Series(data_set[LABEL].values),
       batch_size=n_batch,   
       num_epochs=num_epochs,
       shuffle=shuffle)

In [15]:
model.train(input_fn=get_input_fn(df_train,
                                 num_epochs=None,
                                 n_batch = 128,
                                 shuffle=False),
           steps=1000)

KeyError: 'new'

In [None]:
model.evaluate(input_fn=get_input_fn(df_test,
                                    num_epochs = 128,
                                    n_batch = 128,
                                    shuffle=False),
              steps=1000)

#### Step 4) Improve the model

In [12]:
def square_var(df_t, df_te, var_name="age"):
    df_t['new'] = df_t[var_name].pow(2)
    df_te['new'] = df_te[var_name].pow(2)
    return df_t , df_te

In [22]:
df_train_new, df_test_new = square_var(df_train, df_test, var_name="age")

NameError: name 'square_var' is not defined

In [14]:
print(df_train_new.shape, df_test_new.shape)

(32561, 16) (16281, 16)


In [15]:
CONTI_FEATURES_NEW  = ['age', 'fnlwgt','capital_gain', 'education_num', 'capital_loss', 'hours_week', 'new']
continuous_features_new = [tf.feature_column.numeric_column(k) for k in CONTI_FEATURES_NEW]

In [16]:
model_1 = tf.estimator.LinearClassifier(
    model_dir="ongoing/train1", 
    feature_columns=categorical_features+ continuous_features_new)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'ongoing/train1', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f3726978b70>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [26]:
FEATURES_NEW = ['age','workclass', 'fnlwgt', 'education', 'education_num', 'marital', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_week', 'native_country', 'new']
def get_input_fn(data_set, num_epochs=None, n_batch=128, shuffle=True):
    return tf.estimator.inputs.pandas_input_fn(
        x=pd.DataFrame({k:data_set[k].values for k in FEATURES_NEW}),
        y = pd.Series(data_set[LABEL].values),
        batch_size=n_batch,
        num_epochs=num_epochs,
        shuffle=shuffle)

In [27]:
model_1.train(input_fn=get_input_fn(df_train,
                                   num_epochs=None,
                                   n_batch = 128,
                                   shuffle=False),
              steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Saving checkpoints for 0 into ongoing/train1/model.ckpt.
INFO:tensorflow:loss = 88.722855, step = 1
INFO:tensorflow:global_step/sec: 68.869
INFO:tensorflow:loss = 70077.66, step = 101 (1.453 sec)
INFO:tensorflow:global_step/sec: 116.44
INFO:tensorflow:loss = 49522.082, step = 201 (0.860 sec)
INFO:tensorflow:global_step/sec: 106.13
INFO:tensorflow:loss = 107120.57, step = 301 (0.943 sec)
INFO:tensorflow:global_step/sec: 100.417
INFO:tensorflow:loss = 12814.152, step = 401 (0.995 sec)
INFO:tensorflow:global_step/sec: 117.491
INFO:tensorflow:loss = 19573.898, step = 501 (0.851 sec)
INFO:tensorflow:global_step/sec: 111.676
INFO:tensorflow:loss = 26381.986, ste

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifier at 0x7f3726978710>

In [28]:
model_1.evaluate(input_fn=get_input_fn(df_test_new,
                                      num_epochs=1,
                                      n_batch =128,
                                      shuffle=False),
                steps=1000)

INFO:tensorflow:Calling model_fn.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-06-26T15:01:26Z
INFO:tensorflow:Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ongoing/train1/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [100/1000]
INFO:tensorflow:Finished evaluation at 2019-06-26-15:01:28
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.7920889, accuracy_baseline = 0.76377374, auc = 0.6060022, auc_precision_recall = 0.5424621, average_loss = 128.78566, global_step = 1000, label/mean = 0.23622628, loss = 16380.932, precision = 0.65627116, prediction/mean = 0.09059174, recall = 0.25169006
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: ongoing/train1/model.ck

{'accuracy': 0.7920889,
 'accuracy_baseline': 0.76377374,
 'auc': 0.6060022,
 'auc_precision_recall': 0.5424621,
 'average_loss': 128.78566,
 'label/mean': 0.23622628,
 'loss': 16380.932,
 'precision': 0.65627116,
 'prediction/mean': 0.09059174,
 'recall': 0.25169006,
 'global_step': 1000}

### Bucketization and interaction

In [17]:
# adding Bucketization to improve the learning for age value
age = tf.feature_column.numeric_column('age')
age_buckets = tf.feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
)

In [18]:
# feature crossing
education_x_occupation = [tf.feature_column.crossed_column(
    ['education','occupation'], hash_bucket_size=1000)]
age_buckets_x_education_x_occupation = [tf.feature_column.crossed_column(
    [age_buckets, 'education','occupation'], hash_bucket_size=1000)]

In [19]:
base_columns = [
    age_buckets,
]

model_imp = tf.estimator.LinearClassifier(
    model_dir="ongoing/train3", 
    feature_columns=categorical_features+base_columns+education_x_occupation+age_buckets_x_education_x_occupation)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'ongoing/train3', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f6a4ad89828>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [20]:
FEATURES = ['age','workclass', 'fnlwgt', 'education', 'education_num', 'marital', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_week', 'native_country']
LABEL= 'label'
def get_input_fn(data_set, num_epochs=None, n_batch = 128, shuffle=True):
    return tf.estimator.inputs.pandas_input_fn(
       x=pd.DataFrame({k: data_set[k].values for k in FEATURES}),
       y = pd.Series(data_set[LABEL].values),
       batch_size=n_batch,   
       num_epochs=num_epochs,
       shuffle=shuffle)

In [21]:
model_imp.train(input_fn=get_input_fn(df_train_new, 
                                      num_epochs=None,
                                      n_batch = 128,
                                      shuffle=False),
                                      steps=1000)

NameError: name 'df_train_new' is not defined