# Import libraries

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml import Pipeline
from pyspark.sql.functions import expr

# Read historical customer churn data from silver catalog

In [None]:
# Load the historical customer table and drop the columns that are not
# used for training ("year" and the string form of the sentiment score).
training_data = (
    spark.read
    .table("silver.historical.customers")
    .drop("year", "sentimentScore_str")
)

# Process categorical and numeric attributes.

In [None]:
# Categorical attributes: each one is string-indexed into "<name>_idx" and
# then one-hot encoded into "<name>_vec".
categorical_cols = [
    "gender",
    "partner",
    "dependents",
    "phoneservice",
    "multiplelines",
    "internetservice",
    "onlinesecurity",
    "onlinebackup",
    "deviceprotection",
    "techsupport",
    "streamingtv",
    "streamingmovies",
    "contract",
    "paperlessbilling",
    "paymentmethod",
]

# handleInvalid="skip" is set on every indexer; per the StringIndexer docs
# this filters out rows whose value cannot be indexed instead of raising.
indexers = [
    StringIndexer(
        inputCol=cat_col,
        outputCol=f"{cat_col}_idx",
        handleInvalid="skip",
    )
    for cat_col in categorical_cols
]
encoders = [
    OneHotEncoder(inputCol=f"{cat_col}_idx", outputCol=f"{cat_col}_vec")
    for cat_col in categorical_cols
]

# Numeric attributes are passed to the assembler as-is.
numeric_cols = ["tenure", "monthlycharges", "totalcharges", "sentimentScore"]

# Concatenate the one-hot vectors (in categorical_cols order) followed by the
# numeric columns into a single "features_raw" vector.
assembler = VectorAssembler(
    inputCols=[encoder.getOutputCol() for encoder in encoders] + numeric_cols,
    outputCol="features_raw",
)

# Standardize the assembled vector into "features". Note that the scaler is
# applied to the whole "features_raw" vector, i.e. to the one-hot columns as
# well as the numeric ones, using StandardScaler's default settings.
scaler = StandardScaler(inputCol="features_raw", outputCol="features")

# Set up Logistic Regression model within a Pipeline in Spark ML

In [None]:
# Logistic regression over the scaled "features" vector, with "churn" as label.
lr = LogisticRegression(labelCol="churn", featuresCol="features")

# Stage order: every indexer, then every encoder, then assembler, scaler and
# finally the classifier.
pipeline = Pipeline(stages=[*indexers, *encoders, assembler, scaler, lr])

# Split training data into train and test and train the model

In [None]:
# 80/20 split with a fixed seed; the first part is used for fitting and the
# second part is held out for scoring and evaluation.
train, test = training_data.randomSplit(weights=[0.8, 0.2], seed=42)

# Fit every pipeline stage (feature preparation + classifier) on the train part.
model = pipeline.fit(dataset=train)

# Predict customer churn using trained model

In [None]:
# Score the held-out rows with the fitted pipeline.
predictions = model.transform(test)

# Keep the customer identifier next to the model output and the true label.
result = predictions.select(
    ["customerid", "probability", "prediction", "churn"]
)

# Print the rows without truncating the column contents.
result.show(truncate=False)

# Evaluate the model on the test set (area under the ROC curve)

In [None]:
# Build (score, label) pairs for the RDD-based metrics class: the score is
# element 1 of the "probability" vector (presumably the probability of
# churn == 1 -- confirm against the label encoding) and the label is "churn"
# cast to float.
scoreAndLabels = predictions.select("probability", "churn") \
    .rdd.map(lambda row: (float(row.probability[1]), float(row.churn)))

metrics = BinaryClassificationMetrics(scoreAndLabels)

# The value reported here is the area under the ROC curve, expressed as a
# percentage and rounded to one decimal. It is NOT classification accuracy,
# so it is labelled as AUC rather than "Accuracy".
print("Area under ROC: ", round(metrics.areaUnderROC * 100, 1), "%")

# Save the model

In [None]:
# Destination for the fitted pipeline model.
path = "/Workspace/model/customer_churn/ml_model"

# Persist the model, replacing anything already stored at that location.
(
    model.write()
    .overwrite()
    .save(path)
)
print(f"Churn Prediction model saved at: {path}")