In [1]:
import pandas as pd

In [3]:
# Load the census records from the CSV file in the notebook's working directory.
census_data = pd.read_csv(filepath_or_buffer='census_data.csv')

In [4]:
# Preview the first five rows to check the columns and value formats.
census_data.head(n=5)

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
# Separate the predictors (every column except the label) from the label column.
X = census_data.drop(labels=['income_bracket'], axis='columns')
y = census_data['income_bracket']

In [9]:
# Turn the categorical income bracket into a numeric indicator: one-hot encode
# it and drop the first level, which leaves a single `>50K` column
# (see the `y.head()` output in the next cell).
y = pd.get_dummies(data=y, drop_first=True)

In [10]:
y.head()

Unnamed: 0,>50K
0,0
1,0
2,0
3,0
4,0


### Split the Data into Train and Test Sets

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

### Feature Columns for tf.estimator

In [13]:
import tensorflow as tf

  from ._conv import register_converters as _register_converters


#### Feature Columns for Categorical Values

In [14]:
# `gender` gets an explicit vocabulary list.
gender = tf.feature_column.categorical_column_with_vocabulary_list(
    key='gender',
    vocabulary_list=['Female', 'Male'],
)
# `occupation` is hashed into 1000 buckets rather than enumerating its values.
occupation = tf.feature_column.categorical_column_with_hash_bucket(
    key='occupation',
    hash_bucket_size=1000,
)

In [15]:
# The remaining categorical features are each hashed into 1000 buckets.
# The generator is unpacked in the same order as the names on the left.
marital_status, relationship, education, workclass, native_country = (
    tf.feature_column.categorical_column_with_hash_bucket(key=column_name, hash_bucket_size=1000)
    for column_name in (
        'marital_status',
        'relationship',
        'education',
        'workclass',
        'native_country',
    )
)

#### Feature Columns for Continuous Values

In [16]:
# Numeric features are passed to the model as plain numeric columns.
# The generator is unpacked in the same order as the names on the left.
age, education_num, capital_loss, capital_gain, hours_per_week = (
    tf.feature_column.numeric_column(key=column_name)
    for column_name in (
        'age',
        'education_num',
        'capital_loss',
        'capital_gain',
        'hours_per_week',
    )
)

### Put All the Feature Columns in One List

In [17]:
# Every feature column handed to the estimator: categorical first, then numeric.
feat_cols = [
    # categorical
    gender,
    occupation,
    marital_status,
    relationship,
    education,
    workclass,
    native_country,
    # numeric
    age,
    education_num,
    capital_gain,
    capital_loss,
    hours_per_week,
]

In [19]:
# Training input function: shuffled batches of 100 rows. num_epochs=None puts
# no limit on passes over the data, so the length of training is set by the
# `steps` argument passed to model.train().
input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    batch_size=100,
    num_epochs=None,
    shuffle=True,
)

# NOTE: to use tf.estimator.DNNClassifier instead, the categorical columns
# would first have to be wrapped in dense columns (for example
# tf.feature_column.embedding_column); a linear model can take the plain
# categorical columns directly.
model = tf.estimator.LinearClassifier(feature_columns=feat_cols)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': 'worker', '_num_worker_replicas': 1, '_save_checkpoints_steps': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f0add4a1ac8>, '_log_step_count_steps': 100, '_session_config': None, '_save_summary_steps': 100, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_tf_random_seed': None, '_master': '', '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/tmp/tmpwu8klf6e', '_save_checkpoints_secs': 600, '_service': None, '_is_chief': True}


In [21]:
# Run 10,000 training steps, drawing batches from the training input function.
model.train(steps=10000, input_fn=input_func)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpwu8klf6e/model.ckpt.
INFO:tensorflow:step = 1, loss = 69.31472
INFO:tensorflow:global_step/sec: 102.759
INFO:tensorflow:step = 101, loss = 586.417 (0.974 sec)
INFO:tensorflow:global_step/sec: 102.566
INFO:tensorflow:step = 201, loss = 1681.87 (0.974 sec)
INFO:tensorflow:global_step/sec: 62.7487
INFO:tensorflow:step = 301, loss = 538.2452 (1.596 sec)
INFO:tensorflow:global_step/sec: 52.9694
INFO:tensorflow:step = 401, loss = 302.0945 (1.887 sec)
INFO:tensorflow:global_step/sec: 99.7789
INFO:tensorflow:step = 501, loss = 239.11879 (1.000 sec)
INFO:tensorflow:global_step/sec: 104.66
INFO:tensorflow:step = 601, loss = 57.022713 (0.955 sec)
INFO:tensorflow:global_step/sec: 103.416
INFO:tensorflow:step = 701, loss = 67.836525 (0.969 sec)
INFO:tensorflow:global_step/sec: 102.3
INFO:tensorflow:step = 801, loss = 199.82176 (0.976 sec)
INFO:tensorflow:global_step/sec: 96.3837
INFO:tensorflow:step = 

INFO:tensorflow:global_step/sec: 83.8673
INFO:tensorflow:step = 8301, loss = 155.25545 (1.194 sec)
INFO:tensorflow:global_step/sec: 62.0124
INFO:tensorflow:step = 8401, loss = 33.040638 (1.612 sec)
INFO:tensorflow:global_step/sec: 62.3836
INFO:tensorflow:step = 8501, loss = 33.393566 (1.605 sec)
INFO:tensorflow:global_step/sec: 62.4901
INFO:tensorflow:step = 8601, loss = 33.01105 (1.598 sec)
INFO:tensorflow:global_step/sec: 64.3008
INFO:tensorflow:step = 8701, loss = 43.37369 (1.555 sec)
INFO:tensorflow:global_step/sec: 64.1799
INFO:tensorflow:step = 8801, loss = 29.666206 (1.558 sec)
INFO:tensorflow:global_step/sec: 62.0425
INFO:tensorflow:step = 8901, loss = 28.989693 (1.612 sec)
INFO:tensorflow:global_step/sec: 69.8454
INFO:tensorflow:step = 9001, loss = 105.40893 (1.431 sec)
INFO:tensorflow:global_step/sec: 82.3345
INFO:tensorflow:step = 9101, loss = 29.89521 (1.214 sec)
INFO:tensorflow:global_step/sec: 84.0684
INFO:tensorflow:step = 9201, loss = 30.909937 (1.191 sec)
INFO:tensorfl

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x7f0a550d5a90>

### Prediction

In [23]:
# Prediction input function: a single unshuffled pass over the test rows,
# delivered as one batch the size of the whole test set.
pred_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    batch_size=len(X_test),
    num_epochs=1,
    shuffle=False,
)
# Collect everything predict() produces into a list so it can be reused below.
predictions = list(model.predict(input_fn=pred_func))

INFO:tensorflow:Restoring parameters from /tmp/tmpwu8klf6e/model.ckpt-10000


In [24]:
# Show only the first few prediction dicts. The full list has one entry per
# test row, so displaying all of it floods the notebook output.
predictions[:5]

[{'class_ids': array([0]),
  'classes': array([b'0'], dtype=object),
  'logistic': array([0.27377966], dtype=float32),
  'logits': array([-0.97552997], dtype=float32),
  'probabilities': array([0.72622037, 0.27377966], dtype=float32)},
 {'class_ids': array([0]),
  'classes': array([b'0'], dtype=object),
  'logistic': array([0.00011965], dtype=float32),
  'logits': array([-9.0307865], dtype=float32),
  'probabilities': array([9.9988031e-01, 1.1965401e-04], dtype=float32)},
 {'class_ids': array([0]),
  'classes': array([b'0'], dtype=object),
  'logistic': array([0.3782904], dtype=float32),
  'logits': array([-0.4968108], dtype=float32),
  'probabilities': array([0.6217096, 0.3782904], dtype=float32)},
 {'class_ids': array([0]),
  'classes': array([b'0'], dtype=object),
  'logistic': array([2.3862693e-05], dtype=float32),
  'logits': array([-10.64317], dtype=float32),
  'probabilities': array([9.9997616e-01, 2.3862693e-05], dtype=float32)},
 {'class_ids': array([1]),
  'classes': array([b

In [25]:
# Extract the predicted class id from each prediction dict.
preds = [prediction['class_ids'][0] for prediction in predictions]

In [26]:
from sklearn.metrics import classification_report

In [28]:
# Per-class precision, recall and F1 on the held-out test set.
# classification_report returns a plain string, so print() is needed to show
# it as a table.
print(classification_report(y_true=y_test, y_pred=preds))

             precision    recall  f1-score   support

          0       0.89      0.92      0.90      7436
          1       0.71      0.62      0.66      2333

avg / total       0.84      0.85      0.84      9769

