In [114]:
import numpy as np # fast standard math capabilities
import pandas as pd # easy data handling
from sklearn.model_selection import train_test_split # scikit-learn for ML
import tensorflow as tf 
import matplotlib.pyplot as plt # Useful for plotting
import seaborn as sns # Advanced plots

import edward as ed 
from edward.models import Normal, Bernoulli, Empirical
from edward.inferences import MetropolisHastings

In [115]:
# Read in the csv data using pandas
bank_choice = pd.read_csv("/Users/hauptjoh/Downloads/bank_choice.csv")

In [116]:
choice_train, choice_test = train_test_split(bank_choice, test_size = 0.3, random_state = 123)

In [117]:
y_train = choice_train[["choice"]].values.flatten()
X_train = choice_train.drop(["choice", "id"], axis = 1).values

y_test = choice_test[["choice"]].values.flatten()
X_test = choice_test.drop(["choice", "id"], axis = 1).values

In [118]:
N = y_train.shape[0] # Number of observations
D = X_train.shape[1] # Number of variables

## Benchmark

In [119]:
# Logistic regression with scikit-learn
from sklearn import linear_model
logit = linear_model.LogisticRegression(penalty="l2", C=1)
logit.fit(X=X_train, y=y_train)
# Print out the estimated coefficients
print(logit.coef_)

[[ 1.59500819  3.10176761  2.01029561 -0.05008957 -0.47475756 -0.59287675
   1.27482891  2.43007257 -0.09189518 -1.33641466  0.8876274   1.51297273
   0.58670077  1.90118752]]


## Probit model

In [120]:
# If things go wrong, it's useful to reset the graph and start fresh
# without having to restart the kernel
tf.reset_default_graph()

In [121]:
# Set the random number generator seed for replicable results
ed.set_seed(42) # 

In [122]:
# Tensorflow placeholder for matrix of float numbers with dimensions [N, D]
X = tf.placeholder(tf.float32, [N, D])

In [123]:
w = Normal(loc=tf.zeros(D), scale=1.0 * tf.ones(D))
b = Normal(loc=tf.zeros([]), scale=1.0 * tf.ones([]))

A Probit model works by assuming that the values are normally distributed (imagine utility) and that the decision depends on the sign of the value. For example, the utility of a product to the customer depends on the attributes of the customer, product and situation expressed in X. If the utility is >0, then we observe a purchase. If it is <0, then the customer decides not to buy (or buy something else).

In [124]:
# TODO: This looks intuitive but doesn't work. I think the tf.greater() breaks the code
# Doing inference on z directly w.r.t y_train gives approx. same AUC as logit
z = Normal(loc = ed.dot(X,w) +b, scale=1.)
y = tf.greater(z, 0)

In [113]:
# Use a different activation function here, see Bishop 4.3 for details
y = Bernoulli(probs = 0.5 * (1 + tf.erf(ed.dot(X,w) / tf.sqrt(2.))))

In [102]:
proposal_w = Normal(loc=w, scale = 0.05)
proposal_b = Normal(loc=b, scale = 0.05)

In [103]:
T = 10000 # Number of iterations
qw = Empirical(tf.Variable(tf.zeros([T,D])))
qb = Empirical(tf.Variable(tf.zeros([T,])))

In [104]:
inference = MetropolisHastings({w:qw, b:qb}, 
                               {w:proposal_w, b:proposal_b},
                               data={y:y_train, X:X_train})

  not np.issubdtype(value.dtype, np.float) and \
  not np.issubdtype(value.dtype, np.int) and \


In [105]:
inference.run(n_print=500)

10000/10000 [100%] ██████████████████████████████ Elapsed: 19s | Acceptance Rate: 0.005


## Model criticism

The coefficients of the logit and probit model are not the same [TODO: Should they be? No, need to be transformed by a factor of 1.6?].

In [106]:
b_est = np.median(qb.sample(10000).eval())
print(b_est)
print(logit.intercept_)

0.47404978
[-0.04339837]


In [107]:
w_est = np.median(qw.sample(10000).eval(),axis=0)
print(w_est)
print(logit.coef_/1.6)

[ 0.3088462   0.6198552   0.38394704 -0.01282695 -0.10555348 -0.11564795
  0.23597543  0.4708846  -0.02196553 -0.2759533   0.1495438   0.24991529
  0.1110389   0.33218917]
[[ 0.99688012  1.93860476  1.25643476 -0.03130598 -0.29672348 -0.37054797
   0.79676807  1.51879536 -0.05743449 -0.83525916  0.55476713  0.94560795
   0.36668798  1.1882422 ]]


We calculate the z-scores

In [108]:
Z = ed.dot(tf.cast(X_test, tf.float32), w_est) + b_est

And put them in the Inverse Normal. For efficiency, we put them in the erf (error-function), which has no relation to the loss function.

In [109]:
pred = 0.5 * (1 + tf.erf(Z/ tf.sqrt(2.))).eval()

In [110]:
pred.shape

(4440,)

Check if we get predictions that are at least similar to logit

In [111]:
np.corrcoef(pred, logit.predict_proba(X_test)[:,1])

array([[1.        , 0.98620829],
       [0.98620829, 1.        ]])

Logit and Probit achieve a very similar AUC on the test data. This is expected, since the models are not very different.

In [112]:
from sklearn.metrics import confusion_matrix, roc_auc_score
print(roc_auc_score(y_test, pred))
print(roc_auc_score(y_test, logit.predict_proba(X_test)[:,1]))

0.791640070602644
0.7924568382159967
