In [0]:
%pip install mlflow xgboost
# NOTE(review): the packages above are installed without version pins, so a re-run may resolve different versions — consider pinning.

%load_ext autoreload
%autoreload 2
# Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload; run %autoreload 0

# NOTE(review): the Python restart below runs after the autoreload setup above — confirm autoreload is still active afterwards.
%restart_python

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import DataFrame, functions as F, types as T, Window

import builtins
from datetime import datetime
from typing import Optional, Dict, Union, List, Tuple, Any
import math
import random


import pandas as pd
import numpy as np
import sklearn

from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
import mlflow

from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics


from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors, DenseVector, SparseVector, VectorUDT
from pyspark.ml import Pipeline, PipelineModel


from pyspark.ml.tuning import CrossValidatorModel, TrainValidationSplitModel, ParamGridBuilder, CrossValidator, TrainValidationSplit
from pyspark.storagelevel import StorageLevel

import matplotlib.pyplot as plt

from pyspark.sql.functions import round
import mlflow.spark
from mlflow.artifacts import download_artifacts


In [0]:
from config import *
from sampling import *
from tracking import *
from tuning import * 

In [0]:
# Name of the churn column that is copied into the `label` training target.
LABEL_COL = "churn7"
# Upper bound of the `date` window used when loading the feature table.
DATE_FILTER = "2025-10-26"
# Number of days subtracted from DATE_FILTER to obtain the lower bound of the window.
DATE_INTERVAL = 30

# Payer split: None --> no split, "0" --> non-payer, "1,2" --> payer
# NOTE(review): payer_split is actually assigned in a later cell (currently "0,1", a value
# not covered by the legend above) — confirm the intended encoding.
#payer_split = "1,2"


#payer_split = None
#payer_split = "0,1"

In [0]:
# Show the column listing of the feature table before loading it.
features_table_description = spark.sql(f""" describe table {FEATURES_TABLE_NAME}""")
features_table_description.display()

In [0]:
# Load the feature rows whose `date` falls in the window
# [DATE_FILTER - DATE_INTERVAL days, DATE_FILTER] and whose label is known.
# The label must be referenced as a column (backticks): quoting it with single quotes
# turns it into a string literal, which is never null, so the filter would keep every row.
df = spark.sql(f"""select * from {FEATURES_TABLE_NAME}
                                where `{LABEL_COL}` is not null
                                and date between date_sub('{DATE_FILTER}',{DATE_INTERVAL}) AND '{DATE_FILTER}' """)

In [0]:
# Number of non-null payer_type_cd values per payer type.
payer_type_counts = df.groupby(col('payer_type_cd')).agg(count('payer_type_cd'))
payer_type_counts.display()

In [0]:
from pyspark.sql.types import StringType, NumericType

# Recode every churn label as an integer flag: 1 where the label is True, 0 otherwise
# (a null label therefore also becomes 0).
#.withColumn("payer_type_cd", when((col('payer_type_cd') == 'P')| (col('payer_type_cd')=='S'), 1).otherwise(0))
labels = ['churn3','churn5','churn7','churn14']
for label in labels:
    df = df.withColumn(label, when(col(label)==True,1).otherwise(0))

# Columns that must never be fed to the model. The churn labels are listed here because,
# after the recoding above, they are numeric and would otherwise be picked up as features.
drop_cols = ['judi','date','ts_last_updated','processed_date','churn3','churn5','churn7','churn14']

# Remaining columns, bucketed by Spark data type.
string_features = []
numerical_features = []

for field in df.schema.fields:
    # Compare the column *name*: `field` is a StructField and never equals a plain string,
    # so testing `field not in drop_cols` would exclude nothing.
    if field.name in drop_cols:
        continue
    if isinstance(field.dataType, StringType):
        string_features.append(field.name)
    elif isinstance(field.dataType, NumericType):
        numerical_features.append(field.name)

In [0]:
# Make EXPERIMENT_NAME the active MLflow experiment for the runs started below.
mlflow.set_experiment(EXPERIMENT_NAME)

In [0]:
# Payer segment selector for this run; it is only recorded in the run tags and used by the
# guard below. NOTE(review): no row filtering by payer_split happens in this cell — confirm
# whether that is intended.
payer_split = "0,1"

# Without a payer split, payer_type_cd is dropped from the numeric model inputs.
# Filtering (instead of list.remove) keeps the cell re-runnable: it does not raise when the
# column has already been removed or is not a numeric feature.
if payer_split is None:
    numerical_features = [name for name in numerical_features if name != 'payer_type_cd']


# Expose the selected churn column under the name `label`, which the classifier reads.
churn_features = df.withColumn('label', col(LABEL_COL))

In [0]:
# Splitting by payer and non-payer?



### Stratified Sampling
# Split the labelled data into train / validation / test (20% test, 20% validation).
strat_train, strat_val, strat_test = stratified_sampling(churn_features, P_TEST=0.2, P_VAL=0.2)

# Class rebalancing is applied to the TRAINING split only. Resampling the full
# `churn_features` frame would put validation and test rows into the training sets
# and invalidate any later evaluation on strat_val / strat_test.

## Upsampling
strat_train_up, train_up_info = upsample_minority(strat_train)

## Undersampling
strat_train_under, train_under_info = undersample_majority(strat_train)



In [0]:
def get_safe_works_repartition(df, max_workers=32):
    """Repartition ``df`` to a worker count the cluster can actually schedule.

    The number of task slots is estimated as ``spark.executor.cores`` (default 1)
    times the number of executors, then capped at ``max_workers``.

    Args:
        df: Spark DataFrame to repartition.
        max_workers: Upper bound on the returned worker count (default 32).

    Returns:
        A ``(repartitioned_df, safe_workers)`` tuple; ``safe_workers`` is always >= 1.
    """
    conf = spark.sparkContext.getConf()
    cores_per_exec = int(conf.get("spark.executor.cores", "1"))
    # executors = all JVMs except the driver
    num_exec = spark._jsc.sc().getExecutorMemoryStatus().size() - 1

    # `max` / `min` are shadowed by `from pyspark.sql.functions import *`, so the Python
    # built-ins are reached through the `builtins` module. `__builtins__` is a CPython
    # implementation detail and is a plain dict outside of `__main__`.
    slots = builtins.max(1, cores_per_exec * builtins.max(1, num_exec))
    safe_workers = builtins.max(1, builtins.min(slots, max_workers))

    # Match the partition count to the worker count.
    df = df.repartition(safe_workers)

    return df, safe_workers

In [0]:
# Fitting fails when num_workers exceeds the available slots, so every training set is
# repartitioned to the safe worker count; that count is kept from the first call.
strat_train, safe_workers = get_safe_works_repartition(strat_train)
strat_train_up = get_safe_works_repartition(strat_train_up)[0]
strat_train_under = get_safe_works_repartition(strat_train_under)[0]

In [0]:
# For XGBoost we don't need to standardize any features.

# One StringIndexer per string column; each writes to "<column>_index".
indexed_cols = [f"{name}_index" for name in string_features]
indexers = [
    StringIndexer(inputCol=name, outputCol=indexed_name, handleInvalid="keep")
    for name, indexed_name in zip(string_features, indexed_cols)
]

# Model inputs: the numeric columns followed by the indexed string columns.
inputs = numerical_features + indexed_cols

vec_assembler = VectorAssembler(
    inputCols=inputs,
    outputCol='features',
    handleInvalid='keep',
)

# Evaluation metric passed to XGBoost.
# Alternative previously used: ["auc", "aucpr", "logloss"]
eval_metrics = ["aucpr"]

# To force a single worker while debugging, set safe_workers=1 before this point.
xgb = SparkXGBClassifier(
    features_col="features",
    label_col="label",
    num_workers=safe_workers,
    eval_metric=eval_metrics,
)

# Full pipeline: string indexing -> vector assembly -> XGBoost classifier.
pipeline = Pipeline(stages=[*indexers, vec_assembler, xgb])

In [0]:
# Inspect the (column name, type) pairs of the labelled feature frame.
churn_features.dtypes

In [0]:
# Reference only: the wider hyper-parameter search space that was considered.
# Kept as comments (not as a string literal) so the cell does not echo a block of
# dead code as its output. The search that actually runs is defined in the next cell.
#
# spec = {
#     # "n_estimators": ("int_uniform", 50, 1000),
#     "max_depth":  ("int_uniform", 8, 8), # Originally "max_depth":  ("int_uniform", 4, 8),
#     #"gamma": ("uniform", 0.0, 0.2),
#     #"learning_rate": ("uniform", 0.01,0.5),
#     # "subsample": ("uniform", 0.7, 0.9),
#     #"colsample_bytree": ("uniform", 0.7, 0.9),
#     # "min_child_weight": ("int_uniform", 1, 5),
#     #"reg_alpha": ("uniform", 0.0, 0.1),
#     #"reg_lambda": ("int_uniform", 1, 10),
#     #"colsample_bylevel": ("uniform", 0, 0.6),
# }
#
# # build random xgb param map
# xgb_param_maps = build_random_param_maps(xgb, spec, n_samples=40, seed=7)
#
#
# cv_xgb = CrossValidator(
#     estimator=pipeline,
#     estimatorParamMaps=xgb_param_maps,
#     numFolds=5,
#     seed=7,
#     # parallelism=150
# )

In [0]:
# Random-search space: parameter name -> (distribution kind, low, high).
spec = {
    "max_depth":  ("int_uniform", 8, 8), # Originally "max_depth":  ("int_uniform", 4, 8),
}

# Number of random candidates to draw when the space really has something to sample.
N_RANDOM_SAMPLES = 40

# When every range is a single point (low == high), all candidates are identical, so
# one candidate is enough: 40 copies would be fitted 40 x numFolds times for the same model.
# Assumes every spec entry is a (kind, low, high) triple, as above, and that
# build_random_param_maps does not de-duplicate candidates itself — TODO confirm.
# `builtins.all` is used because `from pyspark.sql.functions import *` shadows built-ins.
spec_is_fixed = builtins.all(low == high for _kind, low, high in spec.values())
n_samples = 1 if spec_is_fixed else N_RANDOM_SAMPLES

# build random xgb param map
xgb_param_maps = build_random_param_maps(xgb, spec, n_samples=n_samples, seed=7)


# 5-fold cross-validation over the candidates; the evaluator is attached before fitting.
cv_xgb = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=xgb_param_maps,
    numFolds=5,
    seed=7,
    # parallelism=150
)

In [0]:
import logging

# Set the MLflow logging level to INFO
# (applies to the "mlflow" logger and, through the logging hierarchy, its child loggers).
logger = logging.getLogger("mlflow")
logger.setLevel(logging.INFO)

In [0]:
# Run metadata describing this experiment configuration.
# TODO: decide whether a date tag should be added as well.
extra_tags = dict(
    label=LABEL_COL,
    payer_split=payer_split,
    date_filter=DATE_FILTER,
    date_interval=DATE_INTERVAL,
    sampling='upsample',
    safe_workers=safe_workers,
)

In [0]:
#####

# TODO: modify and test the tracking functions so that everything is logged to the MLflow experiment rather than to the notebook.





###

In [0]:
# Train on upsampled data

# Candidates are compared by area under the precision-recall curve on the `label` column.
cv_xgb.setEvaluator(BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderPR"))

# Keep the fitted CrossValidatorModel: discarding the return value of fit() would leave
# the best model and the per-candidate metrics unreachable from later cells.
cv_model_up = cv_xgb.fit(strat_train_up)
cv_model_up