In [0]:
display(dbutils.fs.ls("dbfs:/FileStore/"))


path,name,size,modificationTime
dbfs:/FileStore/tables/,tables/,0,0


In [0]:
# Load preprocessed data
data = spark.read.parquet("dbfs:/FileStore/tables/preprocessed_data.parquet")


Split the data into training and test sets , 80% for train , 20% for test

In [0]:

train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)


Training a Logistic Regression Model

In [0]:
final_df = spark.read.parquet("dbfs:/FileStore/tables/preprocessed_data.parquet")


In [0]:
final_df.columns


Out[5]: ['features', 'label_column_name']

In [0]:
from pyspark.ml.feature import StringIndexer


In [0]:
indexer = StringIndexer(inputCol="label_column_name", outputCol="label")
final_df = indexer.fit(final_df).transform(final_df)


churn by mistake had an alias named label_column_name , we are proceeding with this alias

Model Training with Logistic Regression

In [0]:
from pyspark.ml.classification import LogisticRegression


lr = LogisticRegression(featuresCol="features", labelCol="label")


lr_model = lr.fit(final_df)


predictions = lr_model.transform(final_df)


predictions.select("label", "prediction", "features").show(5)


+-----+----------+--------------------+
|label|prediction|            features|
+-----+----------+--------------------+
|  0.0|       0.0|(45,[0,1,3,4,7,8,...|
|  0.0|       0.0|(45,[0,1,2,4,6,8,...|
|  1.0|       0.0|(45,[0,1,2,4,6,8,...|
|  0.0|       0.0|(45,[0,1,2,4,6,8,...|
|  1.0|       1.0|(45,[0,1,3,4,6,8,...|
+-----+----------+--------------------+
only showing top 5 rows



In [0]:
predictions.write.mode("overwrite").parquet("dbfs:/FileStore/tables/predictions.parquet")


In [0]:
# Save to DBFS (Databricks File System)
lr_model.write().overwrite().save("dbfs:/models/logistic_model")


In [0]:
data.show()

+--------------------+-----------------+
|            features|label_column_name|
+--------------------+-----------------+
|(45,[0,1,3,4,7,8,...|               No|
|(45,[0,1,2,4,6,8,...|               No|
|(45,[0,1,2,4,6,8,...|              Yes|
|(45,[0,1,2,4,6,8,...|               No|
|(45,[0,1,3,4,6,8,...|              Yes|
|(45,[0,1,3,4,6,8,...|              Yes|
|(45,[0,1,2,4,6,9,...|               No|
|(45,[0,1,3,4,6,8,...|               No|
|(45,[0,1,3,4,7,8,...|              Yes|
|(45,[0,1,2,4,6,9,...|               No|
|(45,[0,1,2,4,7,9,...|               No|
|(45,[0,1,2,4,6,8,...|               No|
|(45,[0,1,2,4,7,8,...|               No|
|(45,[0,1,2,4,6,8,...|              Yes|
|(45,[0,1,2,4,6,8,...|               No|
|(45,[0,1,3,4,7,9,...|               No|
|(45,[0,1,3,4,6,8,...|               No|
|(45,[0,1,2,4,6,9,...|               No|
|(45,[0,1,3,4,7,9,...|              Yes|
|(45,[0,1,3,4,6,8,...|               No|
+--------------------+-----------------+
only showing top

In [0]:
train_data.count()

Out[12]: 5690

In [0]:
final_df.columns

Out[13]: ['features', 'label_column_name', 'label']

In [0]:
lr_model.summary

Out[14]: <pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary at 0x71bb69198700>

In [0]:
final_df.columns

Out[15]: ['features', 'label_column_name', 'label']

In [0]:
test_data.count()

Out[16]: 1342

In [0]:
print("Coefficients:", lr_model.coefficients)
print("Intercept:", lr_model.intercept)


Coefficients: [-0.7552237788109915,-0.70958985062709,-0.012994232934743118,0.012994232934633781,-0.09512477465538137,0.09512477465507672,0.035325932080528984,-0.03532593208058972,0.08217066757766664,-0.08217066757774594,0.1296974145811207,-0.12969741458172607,-0.17066019755356585,0.22113077191983782,-0.12969741458172607,0.8997731001130755,-0.7555509069423401,-0.30297631667290587,0.21937981822022304,-0.017166644085817282,-0.30297631667290587,0.10728766471255731,0.11026126067071619,-0.30297631667290587,0.04189564768823214,0.18176998959144655,-0.30297631667290587,0.19050352501656026,0.01804099942915254,-0.30297631667290587,-0.1759660531259269,0.3953231158044507,-0.30297631667290587,-0.18282213314346274,0.400112788814289,-0.30297631667290587,0.7399660026117043,-0.9202263613373681,-0.09302211431029873,0.15895179657389705,-0.158951796574069,0.24681387013728634,0.0030103860548889826,-0.12278153432560665,-0.20412108017223404]
Intercept: -2.0553083691675447


In [0]:
training_summary = lr_model.summary
print("Accuracy:", training_summary.accuracy)
print("Area under ROC:", training_summary.areaUnderROC)
training_summary.roc.show()  # Shows ROC curve
training_summary.pr.show()   # Shows precision-recall curve


Accuracy: 0.8006257110352674
Area under ROC: 0.8405069118072368
+--------------------+--------------------+
|                 FPR|                 TPR|
+--------------------+--------------------+
|                 0.0|                 0.0|
|                 0.0|0.003210272873194...|
|1.936858415649816E-4|0.005885500267522739|
|1.936858415649816E-4| 0.00909577314071696|
|1.936858415649816E-4|0.012306046013911182|
|1.936858415649816E-4|0.015516318887105404|
|3.873716831299632E-4| 0.01819154628143392|
|3.873716831299632E-4|0.021401819154628143|
|5.810575246949448E-4|0.024077046548956663|
|0.001162115049389...|0.025682182985553772|
|0.001549486732519...|0.027822364901016586|
|0.001936858415649816|  0.0299625468164794|
|0.002130544257214...| 0.03263777421080792|
|0.002324230098779779| 0.03531300160513644|
|0.002517915940344...|0.037988228999464954|
|0.002517915940344...| 0.04119850187265917|
|0.002517915940344...|  0.0444087747458534|
|0.002905287623474724| 0.04654895666131621|
|0.002905287