In [0]:
# Import packages

from pyspark.ml.evaluation import BinaryClassificationEvaluator
import mlflow
import pandas as pd


In [0]:
# Evaluation of the 30-days and 180-days models on an independent dataset

# Join churn7 actuals to their predictions and tag every row with the model
# horizon (model_type) and population (model_split) derived from the run id.
#
# The join is written as INNER: the WHERE filter on B.processed_date already
# rejected the NULL rows a LEFT JOIN would produce for actuals without a
# prediction, so the result set is unchanged and the intent is now explicit.
# String literals use single quotes so the query does not depend on
# spark.sql.ansi.doubleQuotedIdentifiers being disabled.
#
# NOTE(review): any model_run_id not listed in the CASE expressions falls
# through to the ELSE branches and is scored as '180_days' / 'payer'.
# Confirm that only the expected run ids exist in pp_churn_predictions.
df = spark.sql("""
    select A.processed_date,
           A.judi,
           A.payer_type_cd,
           case when A.churn_ind then 1 else 0 end as label,
           case when B.model_run_id in ('f5583d4499394f16aa8b35757ad46e81', '0ca2ba8790884e228db8930d2d148169')
                then '30_days' else '180_days' end as model_type,
           case when B.model_run_id in ('f5583d4499394f16aa8b35757ad46e81', '1c7aebb2b7174b58b2b78789bbd071ab')
                then 'nonpayer' else 'payer' end as model_split,
           B.churn_probability as probability
    from teams.data_science.pp_churn_actuals A
    inner join teams.data_science.pp_churn_predictions B
        on A.processed_date = B.processed_date
       and A.judi = B.judi
       and A.label_name = B.label_name
    where A.label_name = 'churn7'
      and B.processed_date <= '2025-11-30'
""")

# Area under the precision-recall curve, using the churn probability as score.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="probability",
    labelCol="label",
    metricName="areaUnderPR",
)

# One row per (model_type, model_split) segment present in the data.
df_ref = df.select("model_type", "model_split").distinct().toPandas()

# Evaluate each segment separately, in the same row order as df_ref so the
# scores can be attached positionally below.
# NOTE(review): df is not cached, so each evaluate() call presumably re-runs
# the join; consider df.cache() if the cluster supports it.
pr_auc_lst = [
    evaluator.evaluate(
        df.filter(
            (df.model_type == segment.model_type)
            & (df.model_split == segment.model_split)
        )
    )
    for segment in df_ref.itertuples(index=False)
]

df_ref["pr_auc"] = pr_auc_lst
df_ref = df_ref.sort_values(by=["model_split", "model_type"])

display(df_ref)


In [0]:


# Performance visualization

# To be replaced by "PROD_PAYER" / "PROD_NONPAYER" or similar (so we don't need to hardcode the model_id)
df_model = pd.DataFrame({"model_run_id":["1c7aebb2b7174b58b2b78789bbd071ab","f60fa7549d7b4b4cbc19364a18578a8b"]})

# Fetch each run once from MLflow and read its display name and the PR-AUC
# logged with the run (used as the reference value).
model_name_lst = []
auc_pr_lst = []

for run_id in df_model["model_run_id"]:
    run = mlflow.get_run(run_id)
    # Fall back to the run id when the run has no "mlflow.runName" tag: a
    # None name would make the per-group filter below (model_name == None)
    # match no rows at all.
    model_name_lst.append(run.data.tags.get("mlflow.runName") or run_id)
    auc_pr_lst.append(run.data.metrics.get("area_under_pr"))

df_model["model_name"] = model_name_lst
df_model["auc_pr_ref"] = auc_pr_lst

# Expose the lookup table to Spark SQL so predictions can be restricted to
# (and labelled with) the selected runs.
spark.createDataFrame(df_model).createOrReplaceTempView("model_ref")

# Join churn7 actuals to the predictions of the selected runs.
# Both joins are written as INNER: the join to model_ref on B.model_run_id and
# the WHERE filter on B.processed_date already rejected the NULL rows a LEFT
# JOIN would produce, so the result set is unchanged.
# String literals use single quotes so the query does not depend on
# spark.sql.ansi.doubleQuotedIdentifiers being disabled.
df_date = spark.sql("""
    select A.processed_date,
           case when A.churn_ind then 1 else 0 end as label,
           C.model_name,
           C.auc_pr_ref,
           B.churn_probability as probability
    from teams.data_science.pp_churn_actuals A
    inner join teams.data_science.pp_churn_predictions B
        on A.processed_date = B.processed_date
       and A.judi = B.judi
       and A.label_name = B.label_name
    inner join model_ref C
        on B.model_run_id = C.model_run_id
    where A.label_name = 'churn7'
      and B.processed_date <= '2025-11-30'
""")

# Area under the precision-recall curve, using the churn probability as score.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="probability",
    labelCol="label",
    metricName="areaUnderPR",
)

# One row per (processed_date, model) pair present in the data.
df_ref = df_date.select("processed_date", "model_name", "auc_pr_ref").distinct().toPandas()

# Evaluate each (date, model) group separately, in the same row order as
# df_ref so the scores can be attached positionally below.
# NOTE(review): df_date is not cached, so each evaluate() call presumably
# re-runs the join; consider df_date.cache() if the cluster supports it.
pr_auc_lst = [
    evaluator.evaluate(
        df_date.filter(
            (df_date.processed_date == group.processed_date)
            & (df_date.model_name == group.model_name)
        )
    )
    for group in df_ref.itertuples(index=False)
]

df_ref["pr_auc"] = pr_auc_lst
df_ref = df_ref.sort_values(by=["processed_date", "model_name"])

display(df_ref)


Databricks visualization. Run in Databricks to view.