In [0]:
file_path = "/Volumes/hamzeh_databricks_workspace/default/hamzeh-volume/ChicagoParkingTickets.txt"

df = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv(file_path)


In [0]:
from pyspark.sql.functions import col

selected_cols = [
    "Per_capita_income",
    "Percent_unemployed",
    "Percent_households_below_poverty",
    "Hardship_Index",
    "PaymentIsOutstanding"
]

df_ml = df.select(*selected_cols).dropna()
df_ml.show(5)


+-----------------+------------------+--------------------------------+--------------+--------------------+
|Per_capita_income|Percent_unemployed|Percent_households_below_poverty|Hardship_Index|PaymentIsOutstanding|
+-----------------+------------------+--------------------------------+--------------+--------------------+
|          15957.0|              22.6|                            28.6|          73.0|                   1|
|          18881.0|              24.0|                            27.8|          60.0|                   0|
|          43198.0|               6.6|                            14.7|          10.0|                   0|
|          15089.0|              13.1|                            20.5|          71.0|                   0|
|          19713.0|              20.8|                            16.9|          48.0|                   0|
+-----------------+------------------+--------------------------------+--------------+--------------------+
only showing top 5 rows


In [0]:
df_ml = df_ml.withColumn(
    "label",
    col("PaymentIsOutstanding").cast("double")
)


In [0]:
from pyspark.ml.feature import VectorAssembler

feature_cols = [
    "Per_capita_income",
    "Percent_unemployed",
    "Percent_households_below_poverty",
    "Hardship_Index"
]

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features"
)

df_final = assembler.transform(df_ml).select("features", "label")
df_final.show(5)


+--------------------+-----+
|            features|label|
+--------------------+-----+
|[15957.0,22.6,28....|  1.0|
|[18881.0,24.0,27....|  0.0|
|[43198.0,6.6,14.7...|  0.0|
|[15089.0,13.1,20....|  0.0|
|[19713.0,20.8,16....|  0.0|
+--------------------+-----+
only showing top 5 rows


In [0]:
train_df, test_df = df_final.randomSplit([0.8, 0.2], seed=42)


In [0]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="label")

model = lr.fit(train_df)


In [0]:
predictions = model.transform(test_df)
predictions.select("features", "label", "prediction", "probability").show(5)


+--------------------+-----+----------+--------------------+
|            features|label|prediction|         probability|
+--------------------+-----+----------+--------------------+
|[8201.0,34.6,56.5...|  0.0|       1.0|[0.49106088307408...|
|[8201.0,34.6,56.5...|  0.0|       1.0|[0.49106088307408...|
|[8201.0,34.6,56.5...|  0.0|       1.0|[0.49106088307408...|
|[8201.0,34.6,56.5...|  0.0|       1.0|[0.49106088307408...|
|[8201.0,34.6,56.5...|  0.0|       1.0|[0.49106088307408...|
+--------------------+-----+----------+--------------------+
only showing top 5 rows


In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC"
)

auc = evaluator.evaluate(predictions)
print("AUC:", auc)


AUC: 0.5989228126831527


In [0]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol="features", labelCol="label")
rf_model = rf.fit(train_df)

rf_predictions = rf_model.transform(test_df)

auc_rf = evaluator.evaluate(rf_predictions)
print("Random Forest AUC:", auc_rf)


Random Forest AUC: 0.5914078003539687
