## pima-indians-diabetes

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
# Load the dataset; the path is relative to the notebook's working directory.
CSV_PATH = "pima-indians-diabetes.csv"
diabetes = pd.read_csv(CSV_PATH)

In [8]:
# Peek at the first rows to sanity-check the load.
diabetes.head()

Unnamed: 0,Number_pregnant,Glucose_concentration,Blood_pressure,Triceps,Insulin,BMI,Pedigree,Age,Class,Group
0,6,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,50,1,B
1,1,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,31,0,C
2,8,0.919598,0.52459,0.0,0.0,0.347243,0.253629,32,1,B
3,1,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,21,0,B
4,0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,33,1,C


In [9]:
# List the column names; the feature-column keys defined later must match these exactly.
diabetes.columns

Index(['Number_pregnant', 'Glucose_concentration', 'Blood_pressure', 'Triceps',
       'Insulin', 'BMI', 'Pedigree', 'Age', 'Class', 'Group'],
      dtype='object')

In [None]:
# Normalize

In [10]:
# Columns that get min-max scaled. 'Age' is left out on purpose: it is
# bucketized later instead of being fed to the model as a raw number.
cols_to_norms = [
    'Number_pregnant',
    'Glucose_concentration',
    'Blood_pressure',
    'Triceps',
    'Insulin',
    'BMI',
    'Pedigree',
]

In [11]:
# Min-max scale every selected column to [0, 1] with vectorized frame
# arithmetic (the Series of per-column statistics aligns on column names).
# NOTE(review): min/max are computed over the whole dataset, i.e. before the
# train/test split further down, so test rows influence the scaling.
norm_min = diabetes[cols_to_norms].min()
norm_range = diabetes[cols_to_norms].max() - norm_min
# A constant column has zero range; divide by 1 instead so it maps to 0.0
# rather than being filled with NaN (0/0).
diabetes[cols_to_norms] = (diabetes[cols_to_norms] - norm_min) / norm_range.replace(0, 1)

In [12]:
# Re-check the head: Number_pregnant should now lie in [0, 1] like the other scaled columns.
diabetes.head()

Unnamed: 0,Number_pregnant,Glucose_concentration,Blood_pressure,Triceps,Insulin,BMI,Pedigree,Age,Class,Group
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,50,1,B
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,31,0,C
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,32,1,B
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,21,0,B
4,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,33,1,C


### Feature Columns

In [15]:
import tensorflow as tf

**Continuous variables**

Number of times pregnant
Plasma glucose concentration at 2 hours in an oral glucose tolerance test
Diastolic blood pressure (mm Hg)
Triceps skin fold thickness (mm)
2-Hour serum insulin (mu U/ml)
Body mass index (weight in kg/(height in m)^2)
Diabetes pedigree function

In [16]:
# One numeric feature column per continuous input. Each key must match the
# DataFrame column name exactly.
num_preg          = tf.feature_column.numeric_column(key='Number_pregnant')
plasma_gluc       = tf.feature_column.numeric_column(key='Glucose_concentration')
dias_press        = tf.feature_column.numeric_column(key='Blood_pressure')
tricep            = tf.feature_column.numeric_column(key='Triceps')
insulin           = tf.feature_column.numeric_column(key='Insulin')
bmi               = tf.feature_column.numeric_column(key='BMI')
diabetes_pedigree = tf.feature_column.numeric_column(key='Pedigree')
# `age` only serves as the source for the bucketized column built below;
# it is not added to the feature list as a raw number.
age               = tf.feature_column.numeric_column(key='Age')


**Categorical features**

In [17]:
# 'Group' holds string labels drawn from a small, fixed vocabulary.
assigned_group = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Group',
    vocabulary_list=['A', 'B', 'C', 'D'],
)
# Alternative when the full vocabulary is not known up front:
# assigned_group = tf.feature_column.categorical_column_with_hash_bucket('Group', hash_bucket_size=10)

**Converting continuous variables to categorical**

In [18]:
# Turn the continuous age into a categorical feature with one bucket per
# decade between the listed boundaries (plus the open-ended ends).
age_buckets = tf.feature_column.bucketized_column(
    source_column=age,
    boundaries=[20, 30, 40, 50, 60, 70, 80],
)

**Putting all of them together**

In [19]:
# Feature list for the linear model: the seven scaled numeric columns, the
# raw categorical group, and the bucketized age.
feat_cols = [
    num_preg,
    plasma_gluc,
    dias_press,
    tricep,
    insulin,
    bmi,
    diabetes_pedigree,
    assigned_group,
    age_buckets,
]

**Train/test split**

In [20]:
# Features: every column except the target.
x_data = diabetes.drop(labels=['Class'], axis='columns')

In [21]:
# Target: the binary 'Class' column.
labels = diabetes.loc[:, 'Class']

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
# Hold out a third of the rows for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    labels,
    test_size=0.33,
    random_state=101,
)

In [24]:
# Training input pipeline: shuffled mini-batches of 10 rows. num_epochs is
# set generously; the `steps` argument passed to train() bounds how many
# batches are actually consumed.
input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    batch_size=10,
    num_epochs=1000,
    shuffle=True,
)

In [25]:
# Baseline: a linear (logistic-regression style) binary classifier.
model = tf.estimator.LinearClassifier(
    feature_columns=feat_cols,
    n_classes=2,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/jr/44cwffjj7nj68zjg2ngbdmjw0000gn/T/tmpyn6el86b', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


In [26]:
# Run 1000 training steps. train() returns the estimator itself, which is
# why its repr is echoed as the cell output.
model.train(
    input_fn=input_func,
    steps=1000,
)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/jr/44cwffjj7nj68zjg2ngbdmjw0000gn/T/tmpyn6el86b/model.ckpt.
INFO:tensorflow:loss = 6.93147, step = 1
INFO:tensorflow:global_step/sec: 199.007
INFO:tensorflow:loss = 5.96017, step = 101 (0.504 sec)
INFO:tensorflow:global_step/sec: 215.474
INFO:tensorflow:loss = 6.33133, step = 201 (0.464 sec)
INFO:tensorflow:global_step/sec: 233.789
INFO:tensorflow:loss = 6.01676, step = 301 (0.427 sec)
INFO:tensorflow:global_step/sec: 232.495
INFO:tensorflow:loss = 3.4962, step = 401 (0.430 sec)
INFO:tensorflow:global_step/sec: 215.338
INFO:tensorflow:loss = 6.57672, step = 501 (0.465 sec)
INFO:tensorflow:global_step/sec: 216.738
INFO:tensorflow:loss = 7.39358, step = 601 (0.461 sec)
INFO:tensorflow:global_step/sec: 216.908
INFO:tensorflow:loss = 8.57469, step = 701 (0.461 sec)
INFO:tensorflow:global_step/sec: 213.484
INFO:tensorflow:loss = 7.64384, step = 801 (0.469 sec)
INFO:tensorflow:global_step/s

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x124914da0>

In [27]:
# Evaluation input pipeline: a single, unshuffled pass over the held-out
# rows together with their labels.
eval_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    y=y_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False,
)

In [28]:
# Score the linear model on the held-out set; the metrics come back as a dict.
results = model.evaluate(input_fn=eval_input_func)

INFO:tensorflow:Starting evaluation at 2017-11-11-04:14:13
INFO:tensorflow:Restoring parameters from /var/folders/jr/44cwffjj7nj68zjg2ngbdmjw0000gn/T/tmpyn6el86b/model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2017-11-11-04:14:14
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.732283, accuracy_baseline = 0.65748, auc = 0.79183, auc_precision_recall = 0.636262, average_loss = 0.527603, global_step = 1000, label/mean = 0.34252, loss = 5.15427, prediction/mean = 0.356865


In [29]:
# Show the evaluation metrics returned by model.evaluate().
results

{'accuracy': 0.73228347,
 'accuracy_baseline': 0.65748036,
 'auc': 0.79183012,
 'auc_precision_recall': 0.63626182,
 'average_loss': 0.52760297,
 'global_step': 1000,
 'label/mean': 0.34251967,
 'loss': 5.1542749,
 'prediction/mean': 0.35686544}

In [30]:
# Prediction input pipeline: same held-out features as for evaluation, but
# without labels (no `y`), in their original order.
pred_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False,
)

In [31]:
# predict() is lazy: it hands back a generator, and nothing is computed
# until that generator is iterated.
predictions = model.predict(input_fn=pred_input_func)

In [33]:
# Displaying the name shows only the generator object, not any prediction values.
predictions

<generator object Estimator.predict at 0x1248cdfc0>

In [34]:
# A generator can be consumed only once, so `list(predictions)` returns []
# when this cell is re-run, and the original results were never kept.
# Request a fresh generator here so the cell is re-runnable, keep the
# materialised predictions under a name, and display only the first few
# entries instead of dumping one dict per test row.
predictions_list = list(model.predict(input_fn=pred_input_func))
predictions_list[:5]

INFO:tensorflow:Restoring parameters from /var/folders/jr/44cwffjj7nj68zjg2ngbdmjw0000gn/T/tmpyn6el86b/model.ckpt-1000


[{'class_ids': array([1]),
  'classes': array([b'1'], dtype=object),
  'logistic': array([ 0.51201332], dtype=float32),
  'logits': array([ 0.04806244], dtype=float32),
  'probabilities': array([ 0.48798671,  0.51201332], dtype=float32)},
 {'class_ids': array([1]),
  'classes': array([b'1'], dtype=object),
  'logistic': array([ 0.62062663], dtype=float32),
  'logits': array([ 0.49220884], dtype=float32),
  'probabilities': array([ 0.37937337,  0.62062663], dtype=float32)},
 {'class_ids': array([0]),
  'classes': array([b'0'], dtype=object),
  'logistic': array([ 0.37077838], dtype=float32),
  'logits': array([-0.52887905], dtype=float32),
  'probabilities': array([ 0.62922162,  0.37077835], dtype=float32)},
 {'class_ids': array([0]),
  'classes': array([b'0'], dtype=object),
  'logistic': array([ 0.3067551], dtype=float32),
  'logits': array([-0.8153336], dtype=float32),
  'probabilities': array([ 0.69324493,  0.30675513], dtype=float32)},
 {'class_ids': array([0]),
  'classes': array(

**Dense NN classifier**

In [35]:
# NOTE(review): at this point feat_cols still holds the raw categorical
# column `assigned_group`. Presumably DNNClassifier rejects that at train
# time (it expects dense columns) - the following cells wrap the column in
# an embedding and rebuild `dnn_model`, so this instance is never trained.
dnn_model = tf.estimator.DNNClassifier(
    hidden_units=[10, 10, 10],
    feature_columns=feat_cols,
    n_classes=2,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/jr/44cwffjj7nj68zjg2ngbdmjw0000gn/T/tmpm3a7c0y4', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


In [36]:
# Wrap the categorical 'Group' column in a 4-dimensional embedding (one
# dimension per vocabulary entry) to obtain a dense column for the DNN.
embedded_group_column = tf.feature_column.embedding_column(
    categorical_column=assigned_group,
    dimension=4,
)

# pass categorical column

In [37]:
# Same inputs as for the linear model, except that the categorical group is
# replaced by its embedding.
# NOTE(review): this rebinds `feat_cols`; re-running the LinearClassifier
# cell after this one would silently pick up this list instead.
feat_cols = [
    num_preg,
    plasma_gluc,
    dias_press,
    tricep,
    insulin,
    bmi,
    diabetes_pedigree,
    embedded_group_column,
    age_buckets,
]

In [None]:
# Great now, no error

In [38]:
# Training input pipeline for the DNN: same settings as for the linear
# model (shuffled mini-batches of 10 rows from the training split).
input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    batch_size=10,
    num_epochs=1000,
    shuffle=True,
)

In [39]:
# Three hidden layers of 10 units each, built on the current feature list.
dnn_model = tf.estimator.DNNClassifier(
    hidden_units=[10, 10, 10],
    feature_columns=feat_cols,
    n_classes=2,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/jr/44cwffjj7nj68zjg2ngbdmjw0000gn/T/tmp9ms57z7q', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


In [40]:
# Train the DNN for the same 1000 steps as the linear baseline so the two
# evaluations are comparable.
dnn_model.train(
    input_fn=input_func,
    steps=1000,
)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/jr/44cwffjj7nj68zjg2ngbdmjw0000gn/T/tmp9ms57z7q/model.ckpt.
INFO:tensorflow:loss = 7.14219, step = 1
INFO:tensorflow:global_step/sec: 239.92
INFO:tensorflow:loss = 7.9233, step = 101 (0.419 sec)
INFO:tensorflow:global_step/sec: 240.686
INFO:tensorflow:loss = 8.63735, step = 201 (0.415 sec)
INFO:tensorflow:global_step/sec: 270.659
INFO:tensorflow:loss = 3.71347, step = 301 (0.372 sec)
INFO:tensorflow:global_step/sec: 210.038
INFO:tensorflow:loss = 7.0403, step = 401 (0.474 sec)
INFO:tensorflow:global_step/sec: 265.115
INFO:tensorflow:loss = 5.50686, step = 501 (0.377 sec)
INFO:tensorflow:global_step/sec: 250.362
INFO:tensorflow:loss = 6.80462, step = 601 (0.402 sec)
INFO:tensorflow:global_step/sec: 241.732
INFO:tensorflow:loss = 4.75308, step = 701 (0.411 sec)
INFO:tensorflow:global_step/sec: 236.274
INFO:tensorflow:loss = 5.2057, step = 801 (0.424 sec)
INFO:tensorflow:global_step/sec:

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x1117bf278>

In [41]:
# Evaluation input pipeline for the DNN: one unshuffled pass over the
# held-out rows with labels (same settings as the earlier definition).
eval_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    y=y_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False,
)

In [42]:
# Score the DNN on the held-out set; the returned metrics dict is the cell output.
dnn_model.evaluate(input_fn=eval_input_func)

INFO:tensorflow:Starting evaluation at 2017-11-11-04:19:23
INFO:tensorflow:Restoring parameters from /var/folders/jr/44cwffjj7nj68zjg2ngbdmjw0000gn/T/tmp9ms57z7q/model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2017-11-11-04:19:24
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.704724, accuracy_baseline = 0.65748, auc = 0.818535, auc_precision_recall = 0.678635, average_loss = 0.513281, global_step = 1000, label/mean = 0.34252, loss = 5.01436, prediction/mean = 0.394485


{'accuracy': 0.70472443,
 'accuracy_baseline': 0.65748036,
 'auc': 0.81853533,
 'auc_precision_recall': 0.67863476,
 'average_loss': 0.51328057,
 'global_step': 1000,
 'label/mean': 0.34251967,
 'loss': 5.0143561,
 'prediction/mean': 0.39448488}