In [0]:
# ===============================================
# CHICAGO PARKING TICKETS - ML CLASSIFICATION
# Predict PaymentIsOutstanding
# ===============================================

# 1️⃣ Imports
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# ===============================================
# 2️⃣ Load Dataset
# ===============================================

file_path = "/Volumes/hamzeh_databricks_workspace/default/hamzeh-volume/ChicagoParkingTickets.txt"

df = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv(file_path)

# Remove duplicates
df = df.dropDuplicates()

# ===============================================
# 3️⃣ Select Relevant Columns
# ===============================================

selected_cols = [
    "Community_Name",
    "Sector",
    "Side",
    "Hardship_Index",
    "Per_capita_income",
    "Percent_unemployed",
    "PaymentIsOutstanding"
]

df = df.select(*selected_cols)

# Drop rows with null target
df = df.dropna(subset=["PaymentIsOutstanding"])

# Convert target to numeric (if needed)
df = df.withColumn("PaymentIsOutstanding",
                   col("PaymentIsOutstanding").cast("int"))

# ===============================================
# 4️⃣ Feature Engineering
# ===============================================

# Encode categorical columns
indexer1 = StringIndexer(
    inputCol="Community_Name",
    outputCol="Community_index",
    handleInvalid="keep"
)

indexer2 = StringIndexer(
    inputCol="Sector",
    outputCol="Sector_index",
    handleInvalid="keep"
)

indexer3 = StringIndexer(
    inputCol="Side",
    outputCol="Side_index",
    handleInvalid="keep"
)

# Combine all features into vector
assembler = VectorAssembler(
    inputCols=[
        "Community_index",
        "Sector_index",
        "Side_index",
        "Hardship_Index",
        "Per_capita_income",
        "Percent_unemployed"
    ],
    outputCol="features"
)

# ===============================================
# 5️⃣ Define Model
# ===============================================

lr = LogisticRegression(
    featuresCol="features",
    labelCol="PaymentIsOutstanding"
)

# ===============================================
# 6️⃣ Build Pipeline
# ===============================================

pipeline = Pipeline(stages=[
    indexer1,
    indexer2,
    indexer3,
    assembler,
    lr
])

# ===============================================
# 7️⃣ Train/Test Split
# ===============================================

train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# ===============================================
# 8️⃣ Train Model
# ===============================================

model = pipeline.fit(train_df)

# ===============================================
# 9️⃣ Predict
# ===============================================

predictions = model.transform(test_df)

predictions.select(
    "features",
    "PaymentIsOutstanding",
    "prediction",
    "probability"
).show(5)

# ===============================================
# 🔟 Evaluate Model
# ===============================================

evaluator = BinaryClassificationEvaluator(
    labelCol="PaymentIsOutstanding",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC"
)

auc = evaluator.evaluate(predictions)

print("AUC Score:", auc)


+--------------------+--------------------+----------+--------------------+
|            features|PaymentIsOutstanding|prediction|         probability|
+--------------------+--------------------+----------+--------------------+
|[22.0,1.0,3.0,53....|                   0|       0.0|[0.72474732443805...|
|[22.0,1.0,3.0,53....|                   0|       0.0|[0.72474732443805...|
|[22.0,1.0,3.0,53....|                   0|       0.0|[0.72474732443805...|
|[22.0,1.0,3.0,53....|                   0|       0.0|[0.72474732443805...|
|[22.0,1.0,3.0,53....|                   0|       0.0|[0.72474732443805...|
+--------------------+--------------------+----------+--------------------+
only showing top 5 rows
AUC Score: 0.5974475108716853
