# Jonathan Halverson
# Thursday, May 12, 2016
# Blood donations competition in Spark

We rework our logistic regression model from scikit-learn in Spark. We begin by loading various MLlib modules:

In [1]:
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

The train and test data are read in, filtered to drop the header row, and then each record is mapped to a list of integers:

In [2]:
import csv


def load_blood_csv(path):
    """Read a blood-donation CSV file into an RDD of integer lists.

    Each partition of the text file is parsed with ``csv.reader``, the header
    row (the one whose second field contains 'Months') is dropped, and every
    remaining field is converted to ``int``.

    Assumes ``sc`` is the SparkContext provided by the notebook session.
    """
    parsed = sc.textFile(path).mapPartitions(lambda part: csv.reader(part))
    records = parsed.filter(lambda fields: 'Months' not in fields[1])
    # Build a real list instead of calling map(): on Python 3 map() returns a
    # lazy iterator, which cannot be indexed by the later cells.
    return records.map(lambda fields: [int(value) for value in fields])


train_rdd = load_blood_csv("/Users/jhalverson/data_science/project_blood_donations/train_blood.csv")
test_rdd = load_blood_csv("/Users/jhalverson/data_science/project_blood_donations/test_blood.csv")

In [13]:
from pyspark.mllib.stat import Statistics
# Pearson correlation matrix between all columns of the raw training records;
# the resulting array is displayed as the cell output.
Statistics.corr(train_rdd, method='pearson')

array([[ 1.        , -0.023217  ,  0.06314818,  0.06314818,  0.09836977,
         0.04192463],
       [-0.023217  ,  1.        , -0.15973135, -0.15973135,  0.18689945,
        -0.26123371],
       [ 0.06314818, -0.15973135,  1.        ,  1.        ,  0.62211596,
         0.22061534],
       [ 0.06314818, -0.15973135,  1.        ,  1.        ,  0.62211596,
         0.22061534],
       [ 0.09836977,  0.18689945,  0.62211596,  0.62211596,  1.        ,
        -0.01981889],
       [ 0.04192463, -0.26123371,  0.22061534,  0.22061534, -0.01981889,
         1.        ]])

We remove the volunteer id and the blood volume column -- which is perfectly correlated with the number of donations -- and add a new feature: the average number of months between donations (months since first donation divided by number of donations):

In [3]:
def _to_features(row):
    """Build the feature tuple for one raw record.

    Keeps columns 1, 2 and 4 of the record and appends column 4 divided by
    column 2 as a float (months since first donation per donation, going by
    the schema described in this notebook).
    """
    since_last, n_donations, since_first = row[1], row[2], row[4]
    return (since_last, n_donations, since_first,
            float(since_first) / n_donations)


def _last_column(row):
    """Return the final field of a raw record."""
    return row[-1]


train_features = train_rdd.map(_to_features)
train_labels = train_rdd.map(_last_column)
test_features = test_rdd.map(_to_features)
# NOTE(review): this takes the last column of the test rows as a "label";
# confirm the test file actually carries a label column.
test_labels = test_rdd.map(_last_column)

Below we print out the first two records of the training set after the above transformations:

In [4]:
# Fetch the first two feature tuples to the driver for inspection.
train_features.take(2)

[(2, 50, 98, 1.96), (0, 13, 28, 2.1538461538461537)]

In [5]:
# Fetch the first two labels to the driver for inspection.
train_labels.take(2)

[1, 1]

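The schema of the raw data (after the volunteer id) is "Months since Last Donation", "Number of Donations", "Total Volume Donated (c.c.)", "Months since First Donation", "Made Donation in March 2007". The feature tuples shown above hold "Months since Last Donation", "Number of Donations", "Months since First Donation" and the derived months-per-donation ratio. Because the features are of equal importance, we standardize each one such that it has mean zero and variance unity: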

In [6]:
# Standardize each feature column to zero mean and unit variance.
# A separate scaler is fit for each data set because the code fails when the
# scaler is fit to the training data then applied to the test data.
# NOTE(review): as a result the test features are centered and scaled with
# test-set statistics rather than the training statistics the model is fit
# on; ideally stdsc1 would be reused for the test set once that failure is
# understood.
stdsc1 = StandardScaler(withMean=True, withStd=True).fit(train_features)
train_features_std = stdsc1.transform(train_features)
stdsc2 = StandardScaler(withMean=True, withStd=True).fit(test_features)
test_features_std = stdsc2.transform(test_features)

Let's check to see if the standardizer worked correctly:

In [7]:
from pyspark.mllib.stat import Statistics

# Column-wise summary statistics of the standardized features: the means
# should be ~0 and the variances ~1 if the scalers worked.
# The print(... % (value,)) form produces the same text on Python 2 and 3.
train_features_std_stats = Statistics.colStats(train_features_std)
print('train means: %s' % (train_features_std_stats.mean(),))
print('train variances: %s' % (train_features_std_stats.variance(),))
test_features_std_stats = Statistics.colStats(test_features_std)
print('test means: %s' % (test_features_std_stats.mean(),))
# Label corrected: this line reports the variances, not the means.
print('test variances: %s' % (test_features_std_stats.variance(),))

train means: [  2.22044605e-16   2.49800181e-16  -1.38777878e-17   2.22044605e-16]
train variances: [ 1.  1.  1.  1.]
test means: [  3.40005801e-16  -2.35922393e-16  -3.46944695e-16  -5.11743425e-17]
test means: [ 1.  1.  1.  1.]


Below we create an RDD of LabeledPoints to input into the train method of our model:

In [8]:
import numpy as np

# zip pairs each label with its standardized feature vector: (label, vector).
labeled_pairs = train_labels.zip(train_features_std)
# Pass the vector itself (pair[1]). The slice pair[1:] is a one-element tuple
# containing the vector, not the feature values, so it should not be used as
# the LabeledPoint features.
trainData = labeled_pairs.map(lambda pair: LabeledPoint(pair[0], pair[1])).cache()
# Display the first five labeled points.
trainData.take(5)

[LabeledPoint(1.0, [-0.909947819006,7.76530265869,2.63952945735,-0.911570589146]),
 LabeledPoint(1.0, [-1.15458256521,1.31932111074,-0.249728794865,-0.885241078432]),
 LabeledPoint(1.0, [-1.03226519211,1.84196826328,0.0391970303561,-0.880669982822]),
 LabeledPoint(1.0, [-0.909947819006,2.53883113333,0.451948209243,-0.872180805261]),
 LabeledPoint(0.0, [-1.03226519211,3.23569400338,1.77275198168,-0.742013415983])]

Next, we instantiate a logistic regression model and carry out the training. It appears that Spark's Python API does not support cross-validation or grid search.

In [9]:
# Fit a logistic regression with L-BFGS on the labeled training points, using
# a regularization parameter of 0.75.
# NOTE(review): no intercept argument is passed, which is consistent with the
# intercept of 0.0 printed below; presumably train() defaults to
# intercept=False -- confirm against the installed Spark version and decide
# whether an intercept should be fit.
model = LogisticRegressionWithLBFGS.train(trainData, regParam=0.75)
# Clear the decision threshold so that predict() returns the raw score rather
# than a thresholded 0/1 class (the submission below writes it as a
# probability).
model.clearThreshold()

Let's examine the weights and intercept:

In [10]:
# Show the fitted weight vector followed by the intercept on a single line.
print('%s %s' % (model.weights, model.intercept))

[-0.0955108011026,0.0868455744526,-0.0125492435503,-0.07990835791] 0.0


Next, we apply the model to the test data and write out the submission file:

In [11]:
# Pair each volunteer id (first column of the raw test rows) with the model's
# prediction for that row's standardized feature vector.
volunteer_ids = test_rdd.map(lambda row: row[0])
predictions = test_features_std.map(lambda vec: model.predict(vec))
testData = volunteer_ids.zip(predictions)

In [12]:
# Write the submission file: a header row, then one "id,probability" line per
# test volunteer. The with-block closes the file even if collect() or a write
# raises, which the original open()/close() pair did not guarantee.
with open('halverson_logistic_regression_may13_2016.dat', 'w') as f:
    f.write(',Made Donation in March 2007\n')
    for volunteer_id, prob in testData.collect():
        f.write('%d,%.3f\n' % (volunteer_id, prob))