In [2]:
# findspark.init() is deliberately called before the pyspark import below.
# NOTE(review): per the findspark docs it locates the Spark installation and puts
# pyspark on sys.path — confirm it is still required in this environment.
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [3]:
# Create (or reuse) the Spark session for this notebook: app name 'logreg',
# master URL 'local[4]'.
spark = (
    SparkSession.builder
    .appName('logreg')
    .master('local[4]')
    .getOrCreate()
)

In [4]:
from pyspark.ml.classification import LogisticRegression

In [6]:
# Read the sample dataset with the 'libsvm' reader; the path is relative to the
# notebook's working directory.
libsvm_reader = spark.read.format('libsvm')
data = libsvm_reader.load('../data/sample_libsvm_data.txt')

In [7]:
# Preview the first 20 rows with long cell values truncated (the defaults of show()).
data.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [8]:
# Print the column names and types of the loaded DataFrame.
data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



### Train Logistic Regression Model

In [9]:
# Configure the (still unfitted) logistic-regression estimator; despite the name,
# `lr_model` only becomes a trained model after .fit() is called on it.
lr_model = LogisticRegression(
    labelCol='label',
    featuresCol='features',
)

In [10]:
# Fit on the complete dataset (no hold-out here).
fitted_model = lr_model.fit(dataset=data)

In [13]:
# Summary object attached to the fitted model; its `.predictions` DataFrame is
# inspected in the following cells.
# NOTE(review): the model was fit on all of `data`, so these look like in-sample
# (training) results rather than a held-out evaluation — confirm.
summary = fitted_model.summary

In [16]:
# Inspect which columns the fitted model's summary exposes on its predictions DataFrame.
summary_predictions = summary.predictions
summary_predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [19]:
# Compare true labels with predicted labels side by side.
label_vs_prediction = summary.predictions.select('label', 'prediction')
label_vs_prediction.show()

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
+-----+----------+
only showing top 20 rows



### Evaluate the Model on a Held-Out Test Split with an Evaluator

In [24]:
# 70/30 train/test split. The seed is fixed so the split, and every count and
# metric derived from it below, is reproducible across Restart & Run All.
train, test = data.randomSplit([0.7, 0.3], seed=42)

In [25]:
# Summary statistics for the training split (row count and label balance).
train_stats = train.describe()
train_stats.show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|                72|
|   mean|0.5833333333333334|
| stddev|0.4964663868173119|
|    min|               0.0|
|    max|               1.0|
+-------+------------------+



In [26]:
# Summary statistics for the test split (row count and label balance).
test_stats = test.describe()
test_stats.show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|                28|
|   mean|0.5357142857142857|
| stddev|0.5078745001833701|
|    min|               0.0|
|    max|               1.0|
+-------+------------------+



In [27]:
# Train a fresh estimator on the training split only, then evaluate it on the
# test split. Column names are spelled out to match the estimator configured earlier.
model = LogisticRegression(featuresCol='features', labelCol='label')
fit_model = model.fit(train)
# NOTE: `predictions` holds the object returned by evaluate(); the scored rows
# themselves are accessed below as `predictions.predictions`.
predictions = fit_model.evaluate(test)

In [38]:
# NOTE(review): this rebinds the name `round` in the notebook namespace, hiding
# Python's built-in round(). No visible cell uses it — confirm whether it is
# still needed, or import it under an alias.
from pyspark.sql.functions import round

In [41]:
# Show the true label, the class-probability vector and the predicted label
# for the test-split rows.
scored_columns = ['label', 'probability', 'prediction']
predictions.predictions.select(*scored_columns).show()

+-----+--------------------+----------+
|label|         probability|prediction|
+-----+--------------------+----------+
|  0.0|[0.99999999312459...|       0.0|
|  0.0|[0.99999982787116...|       0.0|
|  0.0|[0.99999999959502...|       0.0|
|  0.0|[0.99999999994139...|       0.0|
|  0.0|[0.99999999999978...|       0.0|
|  0.0|[0.99999999999761...|       0.0|
|  0.0|[0.99999999829805...|       0.0|
|  0.0|[0.99999999999053...|       0.0|
|  0.0|[0.99999999999324...|       0.0|
|  0.0|[0.99999653987468...|       0.0|
|  0.0|[0.99999684571316...|       0.0|
|  0.0|[0.99997595534439...|       0.0|
|  0.0|[0.99869636932692...|       0.0|
|  1.0|[8.13551543177790...|       1.0|
|  1.0|[4.36670708353779...|       1.0|
|  1.0|[1.56649949669066...|       1.0|
|  1.0|[1.84482117924702...|       1.0|
|  1.0|[5.77684223395539...|       1.0|
|  1.0|[1.71054662024181...|       1.0|
|  1.0|[1.81116464840821...|       1.0|
+-----+--------------------+----------+
only showing top 20 rows



In [37]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [46]:
# Spell out the evaluator's default settings so the scored column, label column
# and metric are visible to the reader.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)

In [49]:
# Score the test-split predictions produced by fit_model.evaluate(test).
test_predictions = predictions.predictions
roc_result = evaluator.evaluate(test_predictions)

In [52]:
# Bare expression so the notebook displays the metric computed by the evaluator.
# NOTE(review): a perfect 1.0 on a test split this small should be read with
# caution — confirm on a larger hold-out.
roc_result

1.0