In [28]:
## Import Libraries
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression

## Set seed
# Single source of truth for randomness in this notebook; passed to the
# train/test split so that the split uses a fixed seed.
seed: int = 42

In [29]:
## Create Spark Session
# getOrCreate() hands back the already-active session when there is one,
# otherwise it starts a new session carrying the app name below.
session_builder = SparkSession.builder.appName('logRegExample')
spark = session_builder.getOrCreate()

In [30]:
## Load Data
# The input is a LIBSVM-formatted text file; the reader produces a `label`
# column and a sparse-vector `features` column.
#
# 'numFeatures' is given explicitly so Spark does not have to make an extra
# pass over the input just to determine the vector size (without it the
# LibSVM reader logs a WARN and scans the data first). 692 is the vector
# size shown for this dataset in the `features` column.
df = (
    spark.read.format('libsvm')
    .option('numFeatures', 692)
    .load('gs://spark-training-data/datasets/sample_libsvm_data.txt')
)

# Peek at the first rows only; never dump the whole frame.
df.show(5)

21/11/23 20:37:22 WARN org.apache.spark.ml.source.libsvm.LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
+-----+--------------------+
only showing top 5 rows



In [31]:
## Split into train, test
# Weights are [train, test]: roughly 70% of rows go to training and 30% are
# held out. The split is seeded with the notebook-wide `seed`.
split_weights = [0.7, 0.3]
train_data, test_data = df.randomSplit(split_weights, seed=seed)

In [32]:
## Create model instance and fit
# Estimator configuration: which columns hold the inputs/targets and which
# column the fitted model should write its predicted class into.
logReg = LogisticRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)

# Fit on the training split only; the test split is kept for evaluation.
logRegModel = logReg.fit(train_data)

In [33]:
## Print Predictions Schema
# `summary` belongs to the model fitted on `train_data`; its `predictions`
# frame is inspected here for column layout and a few sample rows.
log_summary = logRegModel.summary
summary_predictions = log_summary.predictions
summary_predictions.printSchema()
summary_predictions.show(5)

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[95,96,97,12...|[21.2605832354368...|[0.99999999941568...|       0.0|
|  0.0|(692,[98,99,100,1...|[24.6172806585456...|[0.99999999997963...|       0.0|
|  0.0|(692,[121,122,123...|[21.3531473607449...|[0.99999999946734...|       0.0|
|  0.0|(692,[122,123,124...|[20.9193913009773...|[0.99999999917809...|       0.0|
|  0.0|(692,[122,123,148...|[20.8799716616791...|[0.99999999914504...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [34]:
## Evaluate the Model using test data
# Note: despite its name, `prediction_and_labels` holds the summary object
# returned by `evaluate`, not a DataFrame. The scored rows are reached
# through its `.predictions` attribute.
prediction_and_labels = logRegModel.evaluate(test_data)
test_predictions = prediction_and_labels.predictions
test_predictions.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[100,101,102...|[12.2223514699748...|[0.99999508075940...|       0.0|
|  0.0|(692,[123,124,125...|[29.7047074388391...|[0.99999999999987...|       0.0|
|  0.0|(692,[123,124,125...|[20.9176718835119...|[0.99999999917667...|       0.0|
|  0.0|(692,[124,125,126...|[45.0073919136100...|           [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|[24.3368966091045...|[0.99999999997304...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [35]:
## Binary Classification Eval Example
# Evaluator that scores the `rawPrediction` column against `label`
# using the area-under-ROC metric.
bi_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)

In [36]:
# Score the held-out test predictions with the evaluator configured above.
# NOTE(review): the recorded result is a perfect 1.0 — presumably because this
# sample dataset is easy to separate; confirm before reading it as a general result.
scored_test_rows = prediction_and_labels.predictions
bi_eval_final = bi_eval.evaluate(scored_test_rows)
bi_eval_final

1.0