# Logistic Regression with Spark ML

Dr Jose M. Albornoz, January 2019

In [9]:
//import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression}

import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression}


# 1.- Load training data

In [10]:
val training = spark.read.format("libsvm").load("/home/jmalbornoz/Downloads/spark-2.4.0-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt")

2019-01-09 16:16:03 WARN  LibSVMFileFormat:66 - 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


training: org.apache.spark.sql.DataFrame = [label: double, features: vector]


In [22]:
training.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



# 2.- Invoke logistic regression object. elasticNetParam corresponds to α and regParam corresponds to λ 

In [11]:
val lr = new LogisticRegression()
  .setMaxIter(10)
  .setRegParam(0.3)
  .setElasticNetParam(0.8)

lr: org.apache.spark.ml.classification.LogisticRegression = logreg_84e198909871


# 3.- Fit the model

In [12]:
val lrModel = lr.fit(training)

lrModel: org.apache.spark.ml.classification.LogisticRegressionModel = LogisticRegressionModel: uid = logreg_84e198909871, numClasses = 2, numFeatures = 692


# 4.- Print coefficient and intercept for logistic regression

In [13]:
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

Coefficients: (692,[244,263,272,300,301,328,350,351,378,379,405,406,407,428,433,434,455,456,461,462,483,484,489,490,496,511,512,517,539,540,568],[-7.353983524188197E-5,-9.102738505589466E-5,-1.9467430546904298E-4,-2.0300642473486668E-4,-3.1476183314863995E-5,-6.842977602660743E-5,1.5883626898239883E-5,1.4023497091372047E-5,3.5432047524968605E-4,1.1443272898171087E-4,1.0016712383666666E-4,6.014109303795481E-4,2.840248179122762E-4,-1.1541084736508837E-4,3.85996886312906E-4,6.35019557424107E-4,-1.1506412384575676E-4,-1.5271865864986808E-4,2.804933808994214E-4,6.070117471191634E-4,-2.008459663247437E-4,-1.421075579290126E-4,2.739010341160883E-4,2.7730456244968115E-4,-9.838027027269332E-5,-3.808522443517704E-4,-2.5315198008555033E-4,2.7747714770754307E-4,-2.443619763919199E-4,-0.0015394744687597765,-2.3073328411331293E-4]) Intercept: 0.22456315961250325


# 5.- We can also use the multinomial family for binary classification

In [14]:
val mlr = new LogisticRegression()
  .setMaxIter(10)
  .setRegParam(0.3)
  .setElasticNetParam(0.8)
  .setFamily("multinomial")

mlr: org.apache.spark.ml.classification.LogisticRegression = logreg_fedc48108156


In [15]:
val mlrModel = mlr.fit(training)

mlrModel: org.apache.spark.ml.classification.LogisticRegressionModel = LogisticRegressionModel: uid = logreg_fedc48108156, numClasses = 2, numFeatures = 692


# 6.- Print the coefficients and intercepts for logistic regression with multinomial family

In [16]:
println(s"Multinomial coefficients: ${mlrModel.coefficientMatrix}")
println(s"Multinomial intercepts: ${mlrModel.interceptVector}")

Multinomial coefficients: 2 x 692 CSCMatrix
(0,244) 4.290365458958277E-5
(1,244) -4.290365458958294E-5
(0,263) 6.488313287833108E-5
(1,263) -6.488313287833092E-5
(0,272) 1.2140666790834663E-4
(1,272) -1.2140666790834657E-4
(0,300) 1.3231861518665612E-4
(1,300) -1.3231861518665607E-4
(0,350) -6.775444746760509E-7
(1,350) 6.775444746761932E-7
(0,351) -4.899237909429297E-7
(1,351) 4.899237909430322E-7
(0,378) -3.5812102770679596E-5
(1,378) 3.581210277067968E-5
(0,379) -2.3539704331222065E-5
(1,379) 2.353970433122204E-5
(0,405) -1.90295199030314E-5
(1,405) 1.90295199030314E-5
(0,406) -5.626696935778909E-4
(1,406) 5.626696935778912E-4
(0,407) -5.121519619099504E-5
(1,407) 5.1215196190995074E-5
(0,428) 8.080614545413342E-5
(1,428) -8.080614545413331E-5
(0,433) -4.256734915330487E-5
(1,433) 4.256734915330495E-5
(0,434) -7.080191510151425E-4
(1,434) 7.080191510151435E-4
(0,455) 8.094482475733589E-5
(1,455) -8.094482475733582E-5
(0,456) 1.0433687128309833E-4
(1,456) -1.0433687128309814E-4
(0,46

# 7.-  Extract the summary from the returned LogisticRegressionModel instance trained in the earlier example

In [17]:
val trainingSummary = lrModel.summary

trainingSummary: org.apache.spark.ml.classification.LogisticRegressionTrainingSummary = org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummaryImpl@396171a8


# 8.- Obtain the objective per iteration.

In [18]:
val objectiveHistory = trainingSummary.objectiveHistory
println("objectiveHistory:")
objectiveHistory.foreach(loss => println(loss))

objectiveHistory:
0.6833149135741672
0.6662875751473734
0.6217068546034618
0.6127265245887887
0.6060347986802873
0.6031750687571562
0.5969621534836274
0.5940743031983118
0.5906089243339022
0.5894724576491042
0.5882187775729587


objectiveHistory: Array[Double] = Array(0.6833149135741672, 0.6662875751473734, 0.6217068546034618, 0.6127265245887887, 0.6060347986802873, 0.6031750687571562, 0.5969621534836274, 0.5940743031983118, 0.5906089243339022, 0.5894724576491042, 0.5882187775729587)


# 9.- Obtain the metrics useful to judge performance on test data.

In [19]:
// We cast the summary to a BinaryLogisticRegressionSummary since the problem is a binary classification problem.
val binarySummary = trainingSummary.asInstanceOf[BinaryLogisticRegressionSummary]

binarySummary: org.apache.spark.ml.classification.BinaryLogisticRegressionSummary = org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummaryImpl@396171a8


# 10.- Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.

In [20]:
val roc = binarySummary.roc
roc.show()
println(s"areaUnderROC: ${binarySummary.areaUnderROC}")

+---+--------------------+
|FPR|                 TPR|
+---+--------------------+
|0.0|                 0.0|
|0.0|0.017543859649122806|
|0.0| 0.03508771929824561|
|0.0| 0.05263157894736842|
|0.0| 0.07017543859649122|
|0.0| 0.08771929824561403|
|0.0| 0.10526315789473684|
|0.0| 0.12280701754385964|
|0.0| 0.14035087719298245|
|0.0| 0.15789473684210525|
|0.0| 0.17543859649122806|
|0.0| 0.19298245614035087|
|0.0| 0.21052631578947367|
|0.0| 0.22807017543859648|
|0.0| 0.24561403508771928|
|0.0|  0.2631578947368421|
|0.0|  0.2807017543859649|
|0.0|  0.2982456140350877|
|0.0|  0.3157894736842105|
|0.0|  0.3333333333333333|
+---+--------------------+
only showing top 20 rows

areaUnderROC: 1.0


roc: org.apache.spark.sql.DataFrame = [FPR: double, TPR: double]


# 11.- Set the model threshold to maximize F-Measure

In [23]:
val fMeasure = binarySummary.fMeasureByThreshold
val maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0)
val bestThreshold = fMeasure.where($"F-Measure" === maxFMeasure)
  .select("threshold").head().getDouble(0)

lrModel.setThreshold(bestThreshold)

fMeasure: org.apache.spark.sql.DataFrame = [threshold: double, F-Measure: double]
maxFMeasure: Double = 1.0
bestThreshold: Double = 0.5585022394278357
res8: lrModel.type = LogisticRegressionModel: uid = logreg_84e198909871, numClasses = 2, numFeatures = 692
