# Jonathan Halverson
# Thursday, December 14, 2017
# Simple check with Tensorflow and class weights

This notebook was created because I was having trouble getting tensorflow to do logistic regression on a pair of hand-written digits. There is less preprocessing when using the breast cancer dataset so that was used here. The big takeaway is that when the number of features is large, the coefficients differ markedly between Sklearn, the self-written code and Tensorflow while the accuracy is the same for all. This is somewhat surprising since the optimization space is guaranteed to be convex since the loss function is log loss.

In [1]:
import numpy as np
np.set_printoptions(threshold=np.nan)

In [2]:
from sklearn.datasets import load_breast_cancer, load_digits
from sklearn.preprocessing import StandardScaler

In [3]:
X = load_breast_cancer().data
y = load_breast_cancer().target
#X = load_digits().data
#y = load_digits().target
#X = X[(y == 0) | (y == 1)]
#y = y[(y == 0) | (y == 1)]

In [4]:
X.shape

(569, 30)

In [5]:
np.bincount(y)

array([212, 357])

There is a class imbalance so we will weight the cost function:

In [6]:
w = y.size / (2.0 * np.bincount(y))

Note that the tensorflow code below did not converge until the data were standardized.

In [7]:
X_std = StandardScaler().fit_transform(X)
from sklearn.decomposition import PCA
X_std = PCA(n_components=5).fit_transform(X_std)

Applying PCA makes the tensorflow model much easier to optimize.

### Sklearn

In [8]:
from sklearn.linear_model import LogisticRegression

wts = {0:w[0], 1:w[1]}
#lr = LogisticRegression(C=1e12, class_weight=wts)
#lr = LogisticRegression(C=1e12, class_weight='balanced')
lr = LogisticRegression(C=1e12, class_weight=None, tol=1e-7, verbose=10)

print lr.fit(X_std, y).score(X_std, y)

[LibLinear]0.977152899824


In [9]:
wts

{0: 1.3419811320754718, 1: 0.79691876750700286}

In [10]:
ep = 1e-7
y_pred_prob = lr.fit(X_std, y).predict_proba(X_std)
loss = -(np.log(y_pred_prob[y == 0][:,0] + ep).sum() + np.log(y_pred_prob[y == 1][:,1] + ep).sum()) / y_pred_prob.shape[0]
loss

[LibLinear]

0.073283324890973389

In [11]:
from sklearn.metrics import log_loss

log_loss(y, y_pred_prob)

0.07328354664840192

In [12]:
np.append(lr.intercept_, lr.coef_[0])

array([ 0.4137167 , -2.89107656,  1.59169636,  0.4984996 ,  0.78415435,
        1.28067847])

### Self-written version gradient descent

In [13]:
from scipy.special import expit
from sklearn.metrics import accuracy_score

In [14]:
learning_rate = 1.0
epochs = 1000000

In [15]:
theta = np.random.randn(X_std.shape[1] + 1)
X_bias = np.c_[np.ones((X_std.shape[0], 1)), X_std]

In [16]:
for epoch in xrange(epochs + 1):
     y_prob = expit(np.dot(X_bias, theta))
     errors = y_prob - y
     gradients = (1.0 / X_bias.shape[0]) * np.dot(X_bias.T, errors)
     theta = theta - learning_rate * gradients
     if not (epoch % 100000): print epoch, log_loss(y, y_prob)
print(theta)
print "accuracy =", accuracy_score(y, y_prob > 0.5)

0 1.13948203498
100000 0.0732835466484
200000 0.0732835466484
300000 0.0732835466484
400000 0.0732835466484
500000 0.0732835466484
600000 0.0732835466484
700000 0.0732835466484
800000 0.0732835466484
900000 0.0732835466484
1000000 0.0732835466484
[ 0.41371574 -2.89107853  1.59169789  0.49849996  0.78415443  1.28068096]
accuracy = 0.977152899824


### Tensorflow

In [17]:
import tensorflow as tf

In [18]:
tf.reset_default_graph()

In [19]:
my_weights = y.copy().astype(np.float64)
#my_weights[y == 0] = w[0]
#my_weights[y == 1] = w[1]
my_weights = np.ones(y.size, dtype=np.float64)
my_weights = tf.constant(my_weights.reshape(-1, 1))

In [20]:
X = tf.constant(X_std, dtype=tf.float64)
y = tf.constant(y.reshape(-1, 1), dtype=tf.int32)

In [21]:
k_init = tf.truncated_normal_initializer(mean=0.0, stddev=1.0, seed=42, dtype=tf.float64)
prob_positive = tf.layers.dense(X, units=1, activation=tf.sigmoid, kernel_initializer=k_init, name='single_neuron')
loss = tf.losses.log_loss(labels=y, predictions=prob_positive, weights=my_weights, epsilon=1e-7)

In [22]:
# one could code LR manually using gradient descent and compare the code below to that for
# each step (see if loss is the same)
#optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
#optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.975, use_nesterov=True)
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
training_op = optimizer.minimize(loss)

In [23]:
# The accuracy metric creates a local variables. It also stores a running average. Try
# evaluating accuracy throughout the minimization and then accuracy2 at the very end (they
# will differ).
#accuracy, accuracy_op = tf.metrics.accuracy(labels=y, predictions=tf.round(prob_positive))
#accuracy2, accuracy_op2 = tf.metrics.accuracy(labels=y, predictions=tf.round(prob_positive))

# all three definitions give the same result
acc = tf.reduce_sum(tf.cast(tf.equal(y, tf.cast(prob_positive + 0.5, tf.int32)), tf.int32))
acc2 = tf.reduce_sum(tf.cast(tf.equal(y, tf.cast(tf.round(prob_positive), tf.int32)), tf.int32))
acc3 = tf.reduce_sum(tf.cast(tf.equal(y, tf.cast(prob_positive > 0.5, tf.int32)), tf.int32))

tot = tf.cast(tf.size(y), tf.float64)
acc = tf.cast(acc, tf.float64) / tot
acc2 = tf.cast(acc2, tf.float64) / tot
acc3 = tf.cast(acc3, tf.float64) / tot

In [24]:
init = tf.global_variables_initializer()
#lcl = tf.local_variables_initializer() # accuracy metric creates local variables

In [25]:
n_epochs = 50000

In [26]:
with tf.Session() as sess:
     init.run()
     #lcl.run()
     sess.run(training_op)
     for epoch in xrange(n_epochs + 1): 
          sess.run(training_op)
          if not (epoch % 10000): print epoch, "Loss:", loss.eval(), "Acc:", acc.eval(), acc2.eval(), acc3.eval()
     print tf.get_default_graph().get_tensor_by_name('single_neuron/kernel:0').eval()
     print tf.get_default_graph().get_tensor_by_name('single_neuron/bias:0').eval()

0 Loss: 2.43958 Acc: 0.203866432337 0.203866432337 0.203866432337
10000 Loss: 0.0740872 Acc: 0.978910369069 0.978910369069 0.978910369069
20000 Loss: 0.0732833 Acc: 0.977152899824 0.977152899824 0.977152899824
30000 Loss: 0.0732833 Acc: 0.977152899824 0.977152899824 0.977152899824
40000 Loss: 0.0732833 Acc: 0.977152899824 0.977152899824 0.977152899824
50000 Loss: 0.0732834 Acc: 0.977152899824 0.977152899824 0.977152899824
[[-2.8911071 ]
 [ 1.59170036]
 [ 0.4985112 ]
 [ 0.78414355]
 [ 1.28070066]]
[ 0.41371956]
