# Jonathan Halverson
# Thursday, December 7, 2017
# Logistic regression in Tensorflow, Sklearn and Spark

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import load_breast_cancer

In [3]:
# Load the breast-cancer dataset bundled with scikit-learn; the cells below read
# its .data, .target and .feature_names attributes.
bc = load_breast_cancer()

In [4]:
# Class labels as a NumPy vector, and the raw features as a labelled DataFrame.
y = bc.target
feature_names = bc.feature_names.tolist()
df = pd.DataFrame(data=bc.data, columns=feature_names)

# Transposed preview: one row per feature, one column per sample.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
mean radius,17.99,20.57,19.69,11.42,20.29
mean texture,10.38,17.77,21.25,20.38,14.34
mean perimeter,122.8,132.9,130.0,77.58,135.1
mean area,1001.0,1326.0,1203.0,386.1,1297.0
mean smoothness,0.1184,0.08474,0.1096,0.1425,0.1003
mean compactness,0.2776,0.07864,0.1599,0.2839,0.1328
mean concavity,0.3001,0.0869,0.1974,0.2414,0.198
mean concave points,0.1471,0.07017,0.1279,0.1052,0.1043
mean symmetry,0.2419,0.1812,0.2069,0.2597,0.1809
mean fractal dimension,0.07871,0.05667,0.05999,0.09744,0.05883


We see that the classes are not balanced:

In [5]:
# Count how many samples carry each integer class label (position i = count of label i).
np.bincount(bc.target)

array([212, 357])

Let's standardize the features and then apply PCA:

In [6]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [7]:
# Learn the per-feature scaling on the raw data, then apply it to the same data.
std_sc = StandardScaler()
std_sc.fit(bc.data)
X_std = std_sc.transform(bc.data)

In [8]:
# Project the standardized features onto 5 principal components.
# fit_transform fits the PCA on X_std and returns the projected samples in one call.
pca = PCA(n_components=5)
X_std_pca = pca.fit_transform(X_std)

In [9]:
# Sanity check: per-component mean and standard deviation of the projected data.
X_std_pca.mean(axis=0), X_std_pca.std(axis=0)

(array([  1.93557336e-16,   7.80473128e-18,  -8.58520441e-18,
          1.20973335e-16,   5.65843018e-18]),
 array([ 3.64439401,  2.38565601,  1.67867477,  1.40735229,  1.28402903]))

### Sklearn

In [10]:
from sklearn.linear_model import LogisticRegression

In [11]:
# A very large C makes the penalty term small, so the fit is close to unregularized.
# fit() returns the estimator itself, so the construction and fit can be chained.
lr = LogisticRegression(C=1e6).fit(X_std_pca, y)
lr

LogisticRegression(C=1000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [12]:
# Fitted weights of the sklearn model, one per input feature (PCA component).
lr.coef_

array([[-2.8905802 ,  1.5914023 ,  0.4983541 ,  0.78400633,  1.28055491]])

In [13]:
# Fitted bias (intercept) term of the sklearn model.
lr.intercept_

array([ 0.41356574])

In [14]:
# Accuracy on the same data the model was fitted on (training accuracy, not a held-out score).
lr.score(X_std_pca, y)

0.97715289982425313

In [15]:
# Number of iterations the solver reported for the fit above.
lr.n_iter_

array([8], dtype=int32)

### Tensorflow

In [16]:
import tensorflow as tf

In [17]:
# Start from an empty default graph so re-running the cells below does not
# pile up duplicate ops/variables from a previous run.
tf.reset_default_graph()

In [18]:
# Labels reshaped to a column vector so they line up with the (n_samples, 1)
# output of the single-unit dense layer.
y_true = bc.target.reshape(-1, 1)

# Features and labels embedded in the graph as constants (full-batch training).
X = tf.constant(X_std_pca, dtype=tf.float64, name="X")

# Bound to `y_tf` rather than `y`: `y` already holds the NumPy label vector that
# the sklearn cell fits on, and rebinding it to a tensor would break a re-run of
# that cell. The graph node keeps its original name "y".
y_tf = tf.constant(y_true, dtype=tf.int64, name="y")

In [19]:
# Seeded initializer so the starting weights are the same on every run.
k_init = tf.truncated_normal_initializer(mean=0.0, stddev=1.0, seed=42)

# Logistic regression expressed as one dense unit with a sigmoid activation:
# the output is the predicted probability of the positive class.
prob_positive = tf.layers.dense(
    inputs=X,
    units=1,
    activation=tf.sigmoid,
    kernel_initializer=k_init,
    name='single_neuron',
)

In [20]:
# Log loss between the labels and the predicted positive-class probabilities.
# The labels are passed as the NumPy column vector y_true, not as a graph tensor.
loss = tf.losses.log_loss(labels=y_true, predictions=prob_positive)

In [21]:
with tf.name_scope('my_metrics'):
    # Classify as positive when the predicted probability exceeds the threshold.
    thres = 0.5
    y_pred = tf.greater(tf.squeeze(prob_positive), thres)

    # 2x2 confusion matrix, cast to float so the ratio below is a true division.
    cm = tf.cast(
        tf.confusion_matrix(labels=tf.squeeze(y_true), predictions=y_pred, num_classes=2),
        tf.float64,
    )

    # Accuracy = correctly classified (diagonal) / all samples.
    acc = tf.trace(cm) / tf.reduce_sum(cm)

In [22]:
# Plain (non-Nesterov) momentum gradient descent on the log loss.
optimizer = tf.train.MomentumOptimizer(
    learning_rate=0.001,
    momentum=0.9,
    use_nesterov=False,
)
training_op = optimizer.minimize(loss)

In [23]:
# Op that initializes every global variable in the graph; it is run once at the
# start of the training session.
init = tf.global_variables_initializer()

In [24]:
# Number of full-batch gradient steps to take.
n_steps = 3000

with tf.Session() as sess:
    init.run()

    # Metrics before any training, with the randomly initialized weights.
    # Single-argument print(...) with %-formatting prints the same text under
    # Python 2 and Python 3 (the original print statements are Python 2 only).
    print("loss (initial) = %s" % (loss.eval(),))
    print("accuracy = %s" % (acc.eval(),))

    for i in range(n_steps):
        sess.run(training_op)

    print("loss (final) = %s" % (loss.eval(),))
    print("accuracy = %s" % (acc.eval(),))

    # Learned weights and bias of the single dense unit, looked up by tensor name.
    graph = tf.get_default_graph()
    print(graph.get_tensor_by_name('single_neuron/kernel:0').eval())
    print(graph.get_tensor_by_name('single_neuron/bias:0').eval())

loss (initial) = 2.44557
accuracy = 0.205623901582
loss (final) = 0.0839184
accuracy = 0.971880492091
[[-1.82378454]
 [ 0.89285922]
 [ 0.39767794]
 [ 0.57184002]
 [ 0.56208278]]
[ 0.51768224]


In [25]:
# Names of the variables the optimizer updates (the dense unit's kernel and bias).
trainable = tf.trainable_variables()
[var.name for var in trainable]

[u'single_neuron/kernel:0', u'single_neuron/bias:0']

### Spark

In [26]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

# Local Spark session with 2 worker threads; getOrCreate() reuses an existing
# session if one is already running.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("three_library_test")
    .getOrCreate()
)

In [27]:
# Put the label in the first column, followed by the PCA features.
cmb = np.c_[bc.target, X_std_pca]

# Derive the feature-column count from the data instead of hard-coding it, so
# the names stay in sync if the number of PCA components is changed.
n_features = X_std_pca.shape[1]
column_names = ['target'] + ['feature_' + str(i) for i in range(n_features)]

# Go through pandas to build the Spark DataFrame with named columns.
df = spark.createDataFrame(pd.DataFrame(cmb, columns=column_names))
df.show(10)

+------+------------------+-------------------+-------------------+--------------------+--------------------+
|target|         feature_0|          feature_1|          feature_2|           feature_3|           feature_4|
+------+------------------+-------------------+-------------------+--------------------+--------------------+
|   0.0| 9.192836826142157| 1.9485830731991052| -1.123166127007871|   3.633731790855738|    -1.1951092175522|
|   0.0|2.3878017959452285| -3.768171741418689|-0.5292926635886008|  1.1182632599189581|  0.6217750120548866|
|   0.0| 5.733896279604369|-1.0751737991464656|-0.5517476143146954|  0.9120829949070246|-0.17708650621326252|
|   0.0| 7.122953197409996| 10.275589111038476| -3.232789595452075| 0.15254649714667254|  -2.960881719770226|
|   0.0| 3.935302073636664|-1.9480715689515167| 1.3897667047127258|  2.9406391768165223|  0.5467469932053874|
|   0.0| 2.380247150199617| 3.9499288909675863|-2.9348767566782508|  0.9410368694200515| -1.0560432525590573|
|   0.0| 2

In [28]:
# Row count of the Spark DataFrame (should match the number of samples).
df.count()

569

In [29]:
# Repack each row as (label, features): the first column is the label and the
# remaining columns are gathered into a single dense vector.
labeled_rows = df.rdd.map(
    lambda row: Row(label=row.target, features=Vectors.dense(row[1:]))
)
h = labeled_rows.toDF()

In [30]:
# Imported under an alias so it does not shadow sklearn's LogisticRegression,
# which an earlier cell of this notebook imports and uses.
from pyspark.ml.classification import LogisticRegression as SparkLogisticRegression

# Bound to `spark_lr` (not `lr`) so the fitted sklearn model in `lr` is not
# overwritten and the sklearn cells stay re-runnable.
spark_lr = SparkLogisticRegression(featuresCol='features', labelCol='label', maxIter=100, threshold=0.5)

# Fitted Spark model; the cells below read its coefficients and call transform().
lr_clf = spark_lr.fit(h)

In [31]:
# Show the fitted Spark parameters for comparison with the other two libraries.
print("Coefficients: %s" % (lr_clf.coefficients,))
print("Intercept: %s" % (lr_clf.intercept,))

Coefficients: [-2.89107809512,1.5916978433,0.49850064638,0.784154467855,1.28068147605]
Intercept: 0.413717077


In [32]:
# Score the training data with the fitted Spark model; the result carries the
# input columns plus rawPrediction, probability and prediction.
# NOTE(review): this rebinds `y_pred`, which earlier held the TensorFlow
# prediction tensor from the metrics cell.
y_pred = lr_clf.transform(h)

In [33]:
# Preview the first 10 scored rows.
y_pred.show(10)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[9.19283682614215...|  0.0|[22.3029828862700...|[0.99999999979396...|       0.0|
|[2.38780179594522...|  0.0|[11.0780610879649...|[0.99998455270321...|       0.0|
|[5.73389627960436...|  0.0|[17.6614006689654...|[0.99999997863262...|       0.0|
|[7.12295319740999...|  0.0|[9.10753791484295...|[0.99988918503570...|       0.0|
|[3.93530207363666...|  0.0|[10.3653661649202...|[0.99996849605145...|       0.0|
|[2.38024715019961...|  0.0|[2.25824474910382...|[0.90535934144629...|       0.0|
|[2.23888330825816...|  0.0|[10.9894878878073...|[0.99998312208787...|       0.0|
|[2.14329850030276...|  0.0|[4.42014637837920...|[0.98811058813955...|       0.0|
|[3.17492429281157...|  0.0|[7.34289176546549...|[0.99935324223608...|       0.0|
|[6.351746789271

In [34]:
# Keep only the predicted and true labels, casting the prediction to an integer
# while selecting it.
prediction_as_int = y_pred['prediction'].cast('integer').alias('prediction')
predictions_and_labels = y_pred.select(prediction_as_int, 'label')
predictions_and_labels.show(5)

+----------+-----+
|prediction|label|
+----------+-----+
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
+----------+-----+
only showing top 5 rows



In [35]:
# True positives: predicted 1 and actually 1.
tp = predictions_and_labels.where(
    (predictions_and_labels['prediction'] == 1) & (predictions_and_labels['label'] == 1)
).count()
tp

352

In [36]:
# False positives: predicted 1 but actually 0.
fp = predictions_and_labels.where(
    (predictions_and_labels['prediction'] == 1) & (predictions_and_labels['label'] == 0)
).count()
fp

8

In [37]:
# True negatives: predicted 0 and actually 0.
tn = predictions_and_labels.where(
    (predictions_and_labels['prediction'] == 0) & (predictions_and_labels['label'] == 0)
).count()
tn

204

In [38]:
# False negatives: predicted 0 but actually 1.
fn = predictions_and_labels.where(
    (predictions_and_labels['prediction'] == 0) & (predictions_and_labels['label'] == 1)
).count()
fn

5

In [39]:
# Accuracy = correct predictions / all predictions. The float() conversion keeps
# this a true division under Python 2 as well.
n_correct = tp + tn
n_total = tp + tn + fp + fn
accuracy = float(n_correct) / n_total
accuracy

0.9771528998242531