# Jonathan Halverson
# Thursday, December 7, 2017
# Logistic regression in Tensorflow, Sklearn and Spark

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import load_breast_cancer

In [3]:
# Load scikit-learn's bundled breast-cancer dataset; later cells read
# bc.data, bc.target and bc.feature_names from this object.
bc = load_breast_cancer()

In [4]:
# Wrap the raw feature matrix in a labelled DataFrame and keep the class
# labels separately for the scikit-learn fit further down.
feature_names = bc.feature_names.tolist()
df = pd.DataFrame(data=bc.data, columns=feature_names)
y = bc.target

# Transposed preview: one row per feature, one column per sample.
df.head().T

Unnamed: 0,0,1,2,3,4
mean radius,17.99,20.57,19.69,11.42,20.29
mean texture,10.38,17.77,21.25,20.38,14.34
mean perimeter,122.8,132.9,130.0,77.58,135.1
mean area,1001.0,1326.0,1203.0,386.1,1297.0
mean smoothness,0.1184,0.08474,0.1096,0.1425,0.1003
mean compactness,0.2776,0.07864,0.1599,0.2839,0.1328
mean concavity,0.3001,0.0869,0.1974,0.2414,0.198
mean concave points,0.1471,0.07017,0.1279,0.1052,0.1043
mean symmetry,0.2419,0.1812,0.2069,0.2597,0.1809
mean fractal dimension,0.07871,0.05667,0.05999,0.09744,0.05883


The class counts below show that the two classes are not balanced:

In [5]:
# Number of samples per class label (position 0 -> label 0, position 1 -> label 1).
np.bincount(bc.target)

array([212, 357])

Let's standardize the features and then apply PCA:

In [6]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [7]:
# Standardize every feature before PCA so that features measured on large
# scales do not dominate the principal components.
std_sc = StandardScaler()
std_sc.fit(bc.data)
X_std = std_sc.transform(bc.data)

In [8]:
# Project the standardized features onto the first five principal components.
# random_state is pinned because, depending on the scikit-learn version,
# svd_solver='auto' can pick the randomized SVD solver for an input of this
# shape; without a seed the components could then differ between runs.
pca = PCA(n_components=5, random_state=42)
X_std_pca = pca.fit_transform(X_std)

In [9]:
# Sanity check: per-component mean and standard deviation of the PCA scores.
np.mean(X_std_pca, axis=0), np.std(X_std_pca, axis=0)

(array([  1.26436647e-16,  -1.52192260e-16,   9.59981947e-17,
          4.05846027e-17,   1.25948851e-16]),
 array([ 3.64439401,  2.38565601,  1.67867477,  1.40735229,  1.28402903]))

### Sklearn

In [10]:
from sklearn.linear_model import LogisticRegression

In [11]:
# C is the inverse regularization strength, so a very large value makes the
# L2 penalty negligible and the fit effectively unregularized.
lr = LogisticRegression(C=1e6)
lr.fit(X_std_pca, y)

LogisticRegression(C=1000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [12]:
# Fitted weights, one per PCA component.
lr.coef_

array([[-2.89058131,  1.5914026 ,  0.49835312,  0.78400646,  1.28055342]])

In [13]:
# Fitted bias term.
lr.intercept_

array([ 0.41356369])

In [14]:
# Accuracy on the same data the model was fitted on (no held-out split is used).
lr.score(X_std_pca, y)

0.97715289982425313

In [15]:
# Number of iterations the solver ran during the fit.
lr.n_iter_

array([8], dtype=int32)

### Tensorflow

In [16]:
import tensorflow as tf

In [17]:
# Clear the default graph so the ops defined below are created in a fresh graph.
tf.reset_default_graph()

In [18]:
# Labels as an (n_samples, 1) column so they line up with the output of the
# single-unit layer defined below.
y_true = bc.target.reshape(-1, 1)
X = tf.constant(X_std_pca, dtype=tf.float64, name="X")
# Bound to y_tensor rather than y: the NumPy label array `y` is what the
# scikit-learn fit above consumes, and rebinding it to a tensor would break
# that cell on re-execution. The graph node keeps its original name "y".
# The loss and metric cells read y_true, not this tensor.
y_tensor = tf.constant(y_true, dtype=tf.int64, name="y")

In [19]:
# Logistic regression expressed as one dense unit with a sigmoid activation;
# the kernel initializer is seeded so the starting weights are repeatable.
k_init = tf.truncated_normal_initializer(mean=0.0, stddev=1.0, seed=42)
prob_positive = tf.layers.dense(
    inputs=X,
    units=1,
    activation=tf.sigmoid,
    kernel_initializer=k_init,
    name='single_neuron'
)

In [20]:
# Log loss between the 0/1 labels and the predicted probabilities.
loss = tf.losses.log_loss(labels=y_true, predictions=prob_positive)

We use aggressive choices for the learning rate and momentum because the log-loss objective of logistic regression is convex, so there are no poor local minima for the optimizer to get trapped in.

In [21]:
# Gradient descent with Nesterov momentum; running training_op performs one
# update of the trainable variables to reduce `loss`.
optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.95, use_nesterov=True)
training_op = optimizer.minimize(loss)

In [22]:
with tf.name_scope('my_metrics'):
    # Predict the positive class when the probability exceeds the threshold.
    thres = 0.5
    y_pred = tf.greater(tf.squeeze(prob_positive), thres)

    # 2x2 confusion matrix, cast to float so the ratio below is a true division.
    cm = tf.cast(
        tf.confusion_matrix(labels=tf.squeeze(y_true), predictions=y_pred, num_classes=2),
        tf.float64)

    # Accuracy = correctly classified samples (the diagonal) / all samples.
    acc = tf.trace(cm) / tf.reduce_sum(cm)

In [23]:
# Op that initializes all variables in the graph; it is run at the start of the session.
init = tf.global_variables_initializer()

In [24]:
# Number of optimizer steps; the training loop runs n_iterations + 1 steps
# so that the last reported iteration equals this value.
n_iterations = 10000

In [25]:
with tf.Session() as sess:
    init.run()

    # Every step uses the full data set, since X and the labels are graph constants.
    for iteration in range(n_iterations + 1):
        sess.run(training_op)
        # Report the loss (evaluated after the update) every 1000 steps.
        # print() is called with a single pre-formatted string so the cell is
        # valid on both Python 2 and Python 3.
        if iteration % 1000 == 0:
            print("%d %s" % (iteration, loss.eval()))

    print("accuracy = %s" % (acc.eval(),))

    # Fetch the learned weights and bias by variable name for comparison with
    # the coefficients reported by the other libraries.
    graph = tf.get_default_graph()
    print(graph.get_tensor_by_name('single_neuron/kernel:0').eval())
    print(graph.get_tensor_by_name('single_neuron/bias:0').eval())

0 2.34543
1000 0.0732991
2000 0.0732844
3000 0.0732834
4000 0.0732834
5000 0.0732833
6000 0.0732833
7000 0.0732833
8000 0.0732833
9000 0.0732834
10000 0.0732834
accuracy = 0.977152899824
[[-2.89109576]
 [ 1.59170616]
 [ 0.49850542]
 [ 0.78415945]
 [ 1.28068187]]
[ 0.41372069]


In [26]:
# Names of the trainable variables in the default graph.
[v.name for v in tf.trainable_variables()]

[u'single_neuron/kernel:0', u'single_neuron/bias:0']

### Spark

In [27]:
from pyspark.sql import SparkSession

# Local-mode session on two threads; getOrCreate() returns an already running
# session instead of starting a second one.
builder = SparkSession.builder.master("local[2]").appName("three_library_test")
spark = builder.getOrCreate()

In [28]:
# Label in the first column, followed by the PCA scores.
cmb = np.c_[bc.target, X_std_pca]
# One name per PCA column, derived from the array width so the names stay in
# step with the number of components chosen for the PCA.
column_names = ['target'] + ['feature_' + str(i) for i in range(X_std_pca.shape[1])]
# NOTE: this rebinds `df`, which earlier held the pandas frame of raw features.
df = spark.createDataFrame(pd.DataFrame(cmb, columns=column_names))
df.show(10)

+------+------------------+-------------------+-------------------+--------------------+--------------------+
|target|         feature_0|          feature_1|          feature_2|           feature_3|           feature_4|
+------+------------------+-------------------+-------------------+--------------------+--------------------+
|   0.0|  9.19283682563216| 1.9485831590271292| -1.123166076073326|  3.6337348703627397| -1.1951374451040437|
|   0.0|2.3878017960782025|-3.7681717918492885|-0.5292927378150812|  1.1182616283301683|  0.6217902864584048|
|   0.0| 5.733896279906637| -1.075173798333321|-0.5517476287303581|   0.912082591445986| -0.1770845747487285|
|   0.0| 7.122953197853179| 10.275589129731346| -3.232789566421863| 0.15254741039104347| -2.9608802772237577|
|   0.0| 3.935302073893639| -1.948071601747983|   1.38976671820812|  2.9406377469789184|   0.546757894181993|
|   0.0|2.3802471505604657|  3.949928886346472| -2.934876820805967|  0.9410364197547301| -1.0560384180365752|
|   0.0| 2

In [29]:
# Row count of the Spark DataFrame; should equal the number of samples.
df.count()

569

In [30]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors


def _to_labeled_row(row):
    """Convert one (target, feature_0, ...) row into a Row with a label and a dense feature vector."""
    return Row(label=row.target, features=Vectors.dense(row[1:]))


# Spark ML estimators expect the features packed into a single vector column.
h = df.rdd.map(_to_labeled_row).toDF()

In [31]:
# Imported under an alias so that it does not replace scikit-learn's
# LogisticRegression, which was imported earlier under the same name.
from pyspark.ml.classification import LogisticRegression as SparkLogisticRegression

# The estimator is bound to spark_lr so the fitted scikit-learn model `lr`
# stays available; re-running the scikit-learn cells would otherwise fail.
spark_lr = SparkLogisticRegression(featuresCol='features', labelCol='label', maxIter=100, threshold=0.5)
lr_clf = spark_lr.fit(h)

In [32]:
# Fitted weights and bias of the Spark model, for comparison with the other libraries.
print("Coefficients: {}".format(lr_clf.coefficients))
print("Intercept: {}".format(lr_clf.intercept))

Coefficients: [-2.8910792118,1.59169814328,0.49849966829,0.78415459731,1.28067998332]
Intercept: 0.413715036847


In [33]:
# Score the training rows with the fitted Spark model.
# NOTE: this rebinds y_pred, which earlier held the TensorFlow prediction tensor.
y_pred = lr_clf.transform(h)

In [34]:
# Preview the first ten scored rows.
y_pred.show(10)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[9.19283682563216...|  0.0|[22.3030248265287...|[0.99999999979397...|       0.0|
|[2.38780179607820...|  0.0|[11.0780490260307...|[0.99998455251689...|       0.0|
|[5.73389627990663...|  0.0|[17.6614063620266...|[0.99999997863274...|       0.0|
|[7.12295319785317...|  0.0|[9.10753461852113...|[0.99988918467046...|       0.0|
|[3.93530207389363...|  0.0|[10.3653621854615...|[0.99996849592609...|       0.0|
|[2.38024715056046...|  0.0|[2.25823789503008...|[0.90535875416154...|       0.0|
|[2.23888330789306...|  0.0|[10.9894960489813...|[0.99998312222561...|       0.0|
|[2.14329850051198...|  0.0|[4.42013905715466...|[0.98811050212910...|       0.0|
|[3.17492429305038...|  0.0|[7.34290059884128...|[0.99935324794542...|       0.0|
|[6.351746790852

In [35]:
# Keep only the two columns needed for the confusion counts and turn the
# floating-point prediction into an integer class id.
selected = y_pred.select('prediction', 'label')
prediction_as_int = selected['prediction'].cast('integer')
predictions_and_labels = selected.withColumn('prediction', prediction_as_int)
predictions_and_labels.show(5)

+----------+-----+
|prediction|label|
+----------+-----+
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
+----------+-----+
only showing top 5 rows



In [36]:
# True positives: predicted 1 and labelled 1.
tp = predictions_and_labels.filter('prediction == 1 and label == 1').count()
tp

352

In [37]:
# False positives: predicted 1 but labelled 0.
fp = predictions_and_labels.filter('prediction == 1 and label == 0').count()
fp

8

In [38]:
# True negatives: predicted 0 and labelled 0.
tn = predictions_and_labels.filter('prediction == 0 and label == 0').count()
tn

204

In [39]:
# False negatives: predicted 0 but labelled 1.
fn = predictions_and_labels.filter('prediction == 0 and label == 1').count()
fn

5

In [40]:
# Accuracy = correctly classified rows / all rows. float() forces true
# division on Python 2, where int / int would truncate.
n_correct = tp + tn
n_total = tp + tn + fp + fn
accuracy = float(n_correct) / n_total
accuracy

0.9771528998242531