# Logistic Regression in PySpark

In [1]:
from pyspark.ml.classification import LogisticRegression

In [13]:
# Build the Spark session, load the LIBSVM sample data and fit a binomial
# logistic-regression model with elastic-net regularisation.
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SparkSession

# Location of the sample data set (LIBSVM text format).
DATA_PATH = "/home/juan/2023/Spark-Tutorials/data/sample_libsvm_data.txt"
# Feature-vector size of this data set. Passing it to the reader avoids the
# extra pass over the input that Spark otherwise needs to infer it (and the
# accompanying "'numFeatures' option not specified" warning).
NUM_FEATURES = 692

spark = SparkSession.builder.appName("LogisticRegression").getOrCreate()

# `training` has two columns: `label` (double) and `features` (vector).
training = (
    spark.read.format("libsvm")
    .option("numFeatures", str(NUM_FEATURES))
    .load(DATA_PATH)
)

# maxIter caps the optimiser iterations; regParam is the overall
# regularisation strength; elasticNetParam mixes L1 (1.0) and L2 (0.0).
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model on the full training set.
lrModel = lr.fit(training)



23/09/17 19:09:34 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


In [16]:
# Display the parameters learned by `lrModel`: the per-feature coefficient
# vector first, then the scalar intercept.
print(f"Coefficients: {lrModel.coefficients!s}")
print(f"Intercept: {lrModel.intercept!s}")

Coefficients: (692,[272,300,323,350,351,378,379,405,406,407,428,433,434,435,455,456,461,462,483,484,489,490,496,511,512,517,539,540,568],[-7.520689871384e-05,-8.11577314684689e-05,3.8146927718465075e-05,0.0003776490540424333,0.00034051483661944016,0.0005514455157343123,0.0004085386116096918,0.00041974673327494546,0.0008119171358670042,0.000502770837266876,-2.3929260406599642e-05,0.0005745048020902312,0.0009037546426803624,7.818229700243747e-05,-2.178755195291058e-05,-3.4021658217894325e-05,0.0004966517360637645,0.0008190557828370383,-8.017982139522497e-05,-2.7431694037834025e-05,0.00048108322262389945,0.00048408017626778825,-8.92647292000764e-06,-0.0003414881233042727,-8.95059257412124e-05,0.00048645469116892205,-8.478698005185958e-05,-0.00042347832158317646,-7.296535777631108e-05])
Intercept: -0.5991460286401453


In [20]:
# Print the training schema: the column names, their types and nullability
# for the `training` DataFrame loaded above.
training.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [21]:
# Number of rows in the training set; as the last expression in the cell,
# the value is shown as the cell's output.
training.count()

100

In [25]:
# The multinomial family can also be used for binary classification.
# Hyper-parameters are the same as for the binomial `lr` estimator.
mlr = LogisticRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
    family='multinomial',
)

# Train the multinomial model on the same training data.
mlrModel = mlr.fit(training)

In [26]:
# Display what the multinomial model learned: a coefficient matrix with one
# row per class, and an intercept vector with one entry per class.
print(f"Multinomial coefficients: {mlrModel.coefficientMatrix!s}")
print(f"Multinomial intercepts: {mlrModel.interceptVector!s}")

Multinomial coefficients: 2 X 692 CSRMatrix
(0,272) 0.0001
(0,300) 0.0001
(0,350) -0.0002
(0,351) -0.0001
(0,378) -0.0003
(0,379) -0.0002
(0,405) -0.0002
(0,406) -0.0004
(0,407) -0.0002
(0,433) -0.0003
(0,434) -0.0005
(0,435) -0.0001
(0,456) 0.0
(0,461) -0.0002
(0,462) -0.0004
(0,483) 0.0001
..
..
Multinomial intercepts: [0.2750587585718048,-0.2750587585718048]


In [27]:
# Pull the training summary off the LogisticRegressionModel (`lrModel`)
# fitted in the earlier cell.
trainingSummary = lrModel.summary

# Objective value recorded at each iteration, printed one per line.
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objectiveValue in objectiveHistory:
    print(objectiveValue)

# Receiver-operating characteristic as a DataFrame, then the area under it.
trainingSummary.roc.show()
print(f"areaUnderROC: {trainingSummary.areaUnderROC!s}")

# Find the threshold that maximises F-Measure ...
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = (
    fMeasure.groupBy()
    .max('F-Measure')
    .select('max(F-Measure)')
    .head()
)
bestThreshold = (
    fMeasure
    .where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)'])
    .select('threshold')
    .head()['threshold']
)
# ... and apply it. Note that this is set on the `lr` estimator, not on the
# already-fitted `lrModel`. Being the last expression in the cell, the
# returned estimator is echoed as the cell's output.
lr.setThreshold(bestThreshold)

objectiveHistory:
0.6833149135741672
0.6661906127558117
0.6207433672479603
0.6131541253123871
0.605914968995239
0.5923656241678249
0.5898233082838021
0.5868012627420286
0.5844432058719141
0.5830790068041741
0.5807015754032355
+---+--------------------+
|FPR|                 TPR|
+---+--------------------+
|0.0|                 0.0|
|0.0|0.017543859649122806|
|0.0| 0.03508771929824561|
|0.0| 0.05263157894736842|
|0.0| 0.07017543859649122|
|0.0| 0.08771929824561403|
|0.0| 0.10526315789473684|
|0.0| 0.12280701754385964|
|0.0| 0.14035087719298245|
|0.0| 0.15789473684210525|
|0.0| 0.17543859649122806|
|0.0| 0.19298245614035087|
|0.0| 0.21052631578947367|
|0.0| 0.22807017543859648|
|0.0| 0.24561403508771928|
|0.0|  0.2631578947368421|
|0.0|  0.2807017543859649|
|0.0|  0.2982456140350877|
|0.0|  0.3157894736842105|
|0.0|  0.3333333333333333|
+---+--------------------+
only showing top 20 rows

areaUnderROC: 1.0


LogisticRegression_b7799696899b