# Jonathan Halverson
# Thursday, May 12, 2016
# Blood donations competition in Spark

We rework our logistic regression model from scikit-learn in Spark. We begin by loading various MLlib modules:

In [42]:
import csv
import numpy as np
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

The train and test data are read in and then transformed:

In [24]:
def _parse_rows(lines):
    """Parse one partition of CSV text lines into lists of ints.

    The header row (the only row whose second field contains 'Months') and
    empty rows are dropped. A real list is built for each row rather than
    calling map(), which is lazy on Python 3 and cannot be pickled by Spark.
    """
    for fields in csv.reader(lines):
        if not fields:
            # A blank line parses to []; indexing fields[1] would raise.
            continue
        if 'Months' in fields[1]:
            continue
        yield [int(value) for value in fields]


def _load_blood_csv(path):
    """Return an RDD of integer rows read from the CSV file at *path*."""
    return sc.textFile(path).mapPartitions(_parse_rows)


train_rdd = _load_blood_csv("/Users/jhalverson/data_science/project_blood_donations/train_blood.csv")
test_rdd = _load_blood_csv("/Users/jhalverson/data_science/project_blood_donations/test_blood.csv")

Below we print out the first two records of the training set after the above transformations:

In [25]:
# Inspect the first two parsed rows: the header should be gone and every field an int.
train_rdd.take(2)

[[619, 2, 50, 12500, 98, 1], [664, 0, 13, 3250, 28, 1]]

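So that each feature carries equal weight regardless of its units, we standardize every column to zero mean and unit variance. (Note that the cells below build their features from `train_rdd`, so `train_std` is not actually used when training the model.)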

In [43]:
# Configure a scaler that both centers each column (withMean) and scales it by
# its standard deviation (withStd).
stdsc = StandardScaler(withMean=True, withStd=True)
# NOTE(review): train_rdd rows still hold all six columns (id in column 0,
# label in column 5), so the id and label are included in the fit as well --
# confirm that is intended.
model_sc = stdsc.fit(train_rdd)
# NOTE(review): train_std is not referenced by any later cell in this file;
# trainData is built from train_rdd, so the model is trained on unscaled
# features -- confirm whether the standardized values were meant to be used.
train_std = model_sc.transform(train_rdd)

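Below we create an RDD of LabeledPoints to pass to the train method of our model. The label is the last column, and the features are columns 1, 2 and 4 plus the ratio of column 2 to column 4: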

In [27]:
def _as_labeled_point(row):
    """Build the training example for one parsed row.

    The label is column 5. The features are columns 1, 2 and 4, followed by
    the ratio of column 2 to column 4.
    """
    ratio = float(row[2]) / row[4]
    features = np.asarray([row[1], row[2], row[4], ratio])
    return LabeledPoint(row[5], features)


trainData = train_rdd.map(_as_labeled_point)
trainData.take(5)

[LabeledPoint(1.0, [2.0,50.0,98.0,0.510204081633]),
 LabeledPoint(1.0, [0.0,13.0,28.0,0.464285714286]),
 LabeledPoint(1.0, [1.0,16.0,35.0,0.457142857143]),
 LabeledPoint(1.0, [2.0,20.0,45.0,0.444444444444]),
 LabeledPoint(0.0, [1.0,24.0,77.0,0.311688311688])]

In [28]:
# Fit a logistic regression model with L-BFGS on the labeled points;
# regParam=10 is the regularization parameter passed to the trainer.
model = LogisticRegressionWithLBFGS.train(trainData, regParam=10)
# Clear the decision threshold so predict() outputs a raw score rather than a
# thresholded class (the predictions shown in later cells are floats near 0.47).
model.clearThreshold()

In [29]:
# Spot-check the model on a single hand-written feature vector.
# NOTE(review): the training features use (column 2 / column 4) as the fourth
# entry, which for this vector would be 20 / 2 = 10.0 rather than 0.5 --
# confirm this input is what was intended.
model.predict([2, 20, 2, 0.5])

0.47183995397390593

In [30]:
def _score_row(row):
    """Pair a test row's id (column 0) with the model's prediction for it.

    The feature vector uses the same layout as the training features:
    columns 1, 2 and 4, then the ratio of column 2 to column 4.
    """
    ratio = float(row[2]) / row[4]
    return (row[0], model.predict([row[1], row[2], row[4], ratio]))


testData = test_rdd.map(_score_row)
testData.take(3)

[(659, 0.46747633699967134),
 (276, 0.454705458709579),
 (263, 0.4860667899742851)]

In [31]:
def writeLines(records):
    """Yield one 'id,probability' text line for each record.

    Each record is indexable, with the integer id at position 0 and the
    probability at position 1. Lines are formatted as '%d,%.2f' and end with a
    newline.

    This is a generator so that every record is emitted, not just the first,
    and so that it can be passed to RDD.mapPartitions, which expects an
    iterable of output items per partition.
    """
    for record in records:
        yield '%d,%.2f\n' % (record[0], record[1])

In [40]:
def _format_prediction(pair):
    """Render an (id, probability) pair as 'id,prob' with three decimals."""
    return '%d,%.3f' % (pair[0], pair[1])


output = testData.map(_format_prediction)

In [39]:
# Write the submission file: a header row followed by one 'id,probability'
# line per test record. The with-block guarantees the file is closed even if
# collect() or a write raises part-way through.
with open('halverson_logistic_regression_june3.dat', 'w') as f:
    f.write(',Made Donation in March 2007\n')
    # collect() pulls every (id, probability) pair back to the driver.
    for volunteer_id, prob in testData.collect():
        f.write('%d,%.3f\n' % (volunteer_id, prob))