In [0]:
%pip install mlflow xgboost
# NOTE(review): the packages above are installed without version pins — consider pinning
# them so that re-runs of this notebook resolve the same versions.

%load_ext autoreload
%autoreload 2
# Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run %autoreload 0

# NOTE(review): the restart below runs after autoreload was enabled; restarting Python
# presumably discards that setup — confirm whether autoreload has to be re-enabled afterwards.
%restart_python

In [0]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import DataFrame, functions as F, types as T, Window

import builtins
from datetime import datetime
from typing import Optional, Dict, Union, List, Tuple, Any
import math
import random


import pandas as pd
import numpy as np
import sklearn

from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
import mlflow

from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics


from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors, DenseVector, SparseVector, VectorUDT
from pyspark.ml import Pipeline, PipelineModel


from pyspark.ml.tuning import CrossValidatorModel, TrainValidationSplitModel, ParamGridBuilder, CrossValidator, TrainValidationSplit
from pyspark.storagelevel import StorageLevel

import matplotlib.pyplot as plt

from pyspark.sql.functions import round
import mlflow.spark
from mlflow.artifacts import download_artifacts


In [0]:
from config import *
from sampling import *
from tracking import *
from tuning import * 
#from Deprecated.deprecated_tracking import *

In [0]:
# Name of the table that every query in this notebook reads the churn features from.
# NOTE(review): a later comment in this notebook says this name is already loaded from
# `config`; this assignment takes precedence over whatever the star-import provided.
FEATURES_TABLE_NAME = 'teams.data_science.pp_churn_features'

In [0]:
# Show the column names and types of the features table.
table_description = spark.sql(f""" describe table {FEATURES_TABLE_NAME}""")
table_description.display()

In [0]:
# Principal categories 
## Cluster models
### SHAP



##### 

In [0]:
# String into a number - 

In [0]:
# Can do this programmatically later

# Identifier, bookkeeping and target columns that are never used as model inputs.
drop_cols = ['judi','date','ts_last_updated','processed_date','churn3','churn5','churn7','churn14']

# Read the column names straight from the features table. No `df` has been created at
# this point of the notebook, so the previous `df.columns` lookup failed on a fresh kernel.
all_features = [name for name in spark.table(FEATURES_TABLE_NAME).columns if name not in drop_cols]

# Categorical columns stored as strings.
string_cols = ['market','attribution_source_cd','country_cd','payer_type_cd']

# Every remaining feature is treated as numerical.
numerical_cols = [name for name in all_features if name not in string_cols]

In [0]:
# Inspect the list of columns treated as numerical features.
numerical_cols

In [0]:
# Analysis of the proportion of churn for the different targets:
# for each target, churned rows divided by rows where the target is known.

churn_rate_query = f"""select count(case when churn3 = 1 then 1 end) / count(case when churn3 is not null then 1 end) as churn3_rt,
       count(case when churn5 = 1 then 1 end) / count(case when churn5 is not null then 1 end) as churn5_rt,
       count(case when churn7 = 1 then 1 end) / count(case when churn7 is not null then 1 end) as churn7_rt
        from {FEATURES_TABLE_NAME}"""
churn_rates = spark.sql(churn_rate_query)
display(churn_rates)


In [0]:
# Same churn-rate ratios as the overall query, computed separately for each day of the week.
q = f"""select dayofweek,
        count(case when churn3 = 1 then 1 end) / count(case when churn3 is not null then 1 end) as churn3_rt,
       count(case when churn5 = 1 then 1 end) / count(case when churn5 is not null then 1 end) as churn5_rt,
       count(case when churn7 = 1 then 1 end) / count(case when churn7 is not null then 1 end) as churn7_rt
        from {FEATURES_TABLE_NAME} group by dayofweek """

churn_by_dayofweek = spark.sql(q)
churn_by_dayofweek.display()

Databricks data profile. Run in Databricks to view.

In [0]:
### TODO:

### Explore some top models trained - see if reducing num. vars improves performance
### payer/non-payer churn day 7 - trained during Nov 6 ish
### payer/non-payer churn day 7 -  also extend day back to n days - include month; percentage of month; 

### Later add to dataset 

In [0]:
#TODO:


### Explore feature importances to see if dayofweek is important... or day of month?  
### See if there are event drops or something on certain days of month - from game team (1/31)--> percentage; 1/28 --> percentage; to track progress of elapsed time in the month


### Monitor auc-PR also daily feature importances?

### look at feature importances - to see if day of week is significant 
### track the feature importance for different targets and re-training 
### log all the figures/plots/metrics/etc. 

### scoring metric: auc-PR


### some threshold for retraining - if the change is significant - retrain

### need some info from game team about any changes in features/holidays/seasonal whatever/events etc. 


### Defining schema for features table, cluster results, predictions/outputs - semi done 

### 

In [0]:
# Analysis of the proportion of churn for the different targets:
# for each target, churned rows divided by rows where the target is known.
# NOTE(review): this is the same query that an earlier cell of this notebook already runs.

display(spark.sql(f"""select count(case when churn3 = 1 then 1 end) / count(case when churn3 is not null then 1 end) as churn3_rt,
       count(case when churn5 = 1 then 1 end) / count(case when churn5 is not null then 1 end) as churn5_rt,
       count(case when churn7 = 1 then 1 end) / count(case when churn7 is not null then 1 end) as churn7_rt
        from {FEATURES_TABLE_NAME}"""))

In [0]:
# Target column; the load cell below copies it into a `label` column.
LABEL_COL = "churn7"
#FEATURES_TABLE_NAME = "teams.data_science.pp_churn_features_v3_small"


# Date window used by the load query:
# `date` between date_sub(DATE_FILTER, DATE_INTERVAL) and DATE_FILTER.
DATE_FILTER = "2025-10-26"
DATE_INTERVAL = 30

# Payer split: None --> no split, "0" --> non-payer, "1,2" --> payer
# NOTE(review): the SQL predicate that used this value is commented out in the load query,
# so it currently only decides whether `payer_type_cd` is removed from the feature list.
payer_split = "1,2"




# Cluster vars: None --> no extra cluster vars
# NOTE(review): no `cluster_vars` variable is assigned in this cell.


# These are loaded in config already
#EXPERIMENT_NAME = "/Users/krista@jamcity.com/PP-Churn-Model"
#FEATURES_TABLE_NAME = "teams.data_science.pp_churn_features"

In [0]:
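#string_features = []
# NOTE: the triple-quoted string below is disabled code, not documentation. Nothing inside it
# runs, so this cell assigns neither `other_features` nor `cluster_vars`.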
""" other_features = ['unique_levels_played', 'market_idx','dayofweek','rounds_played', 'avg_attempts', 'total_attempts', 'avg_moves', 'win_rate', 'assist_success_rate', 'unassist_success_rate', 'assist_rate', 'total_boosters_used', 'total_boosters_spent', 'used_boosters_rate', 'spend_boosters_rate', 'avg_difficulty_score', 'rate_hard_levels', 'rate_superhard_levels', 'min_room_id_int', 'max_room_id_int', 'daily_win_rate_ref', 'daily_avg_boosters_used_ref', 'daily_avg_boosters_spent_ref', 'attribution_source_cd_idx', 'country_cd_idx', 'payer_type_cd_idx', 'iap_lifetime_amt', 'days_since_install', 'days_since_last_purchase', 'ad_revenue_amt', 'iap_revenue_amt', 'session_qty', 'total_session_length_qty', 'avg_session_length', 'sessions_per_round', 'avg_population_wr_on_levels_played_today', 'avg_population_assisted_rate_today', 'avg_population_attempts_today', 'wr_diff_vs_population', 'attempts_diff_vs_population', 'assist_rate_diff_vs_population', 'active_days_l7d', 'total_rounds_l7d', 'avg_rounds_l7d', 'avg_win_rate_l7d', 'avg_attempts_l7d', 'boosters_used_l7d', 'avg_used_boosters_rate_l7d', 'active_days_l14d', 'avg_rounds_l14d', 'avg_win_rate_l14d', 'std_rounds_l14d', 'std_win_rate_l14d', 'active_days_l30d', 'avg_rounds_l30d', 'rounds_trend_weekly', 'win_rate_trend_weekly', 'boosters_usage_trend_weekly', 'rounds_ratio_7d_vs_14_7d', 'frequency_ratio_7d_vs_14d', 'levels_progressed_l7d', 'levels_progressed_l14d', 'levels_progressed_l30d', 'days_on_current_max_level', 'level_diversity_ratio',]

if cluster_vars is not None:
    other_features = other_features + cluster_vars
"""

In [0]:
# Confirm which features table the following cells will read from.
FEATURES_TABLE_NAME

In [0]:
# Confirm the payer split configured for this run.
payer_split

In [0]:


# Get data from table

# Load the rows inside the date window that have a known target, and copy the target
# into the `label` column used by the model.
#
# - LABEL_COL is interpolated as a column identifier. Wrapping it in single quotes turned
#   it into a string literal, which is never null, so the null-label filter kept every row.
# - The payer predicate (`payer_type_cd_idx in (<payer_split>)`) was commented out in both
#   branches of the previous if/else, so both ran the same query; it stays disabled here
#   and a single query is used whether or not payer_split is set.
# - The previous `other_features.remove("payer_type_cd")` is dropped: no cell in this
#   notebook assigns `other_features`, so it could only raise a NameError. For non-payer
#   runs the column is removed from `numerical_features` once that list has been built.
churn_features = spark.sql(f"""select * from {FEATURES_TABLE_NAME}
                                where {LABEL_COL} is not null
                                and date between date_sub('{DATE_FILTER}',{DATE_INTERVAL}) AND '{DATE_FILTER}' """)\
    .withColumn("label",col(LABEL_COL))


In [0]:
from pyspark.sql.types import StringType, NumericType

#df = spark.sql(f""" select * from {FEATURES_TABLE_NAME}""")


string_features = []
numerical_features = []

drop_cols = ['judi','date','ts_last_updated','processed_date','churn3','churn5','churn7','churn14']

# Columns that must never become model inputs: ids, dates, every churn target and the
# derived label itself (otherwise the model is trained on its own answer).
non_feature_cols = set(drop_cols) | {"label", LABEL_COL}

#df = df.withColumn('label', col(LABEL_COL))

# Recode payer_type_cd into a 0/1 flag ('P' -> 1, anything else -> 0). Only do this while
# the column is still a string, so re-running the cell does not re-apply the comparison
# against 'P' to the already recoded integers.
if isinstance(churn_features.schema["payer_type_cd"].dataType, StringType):
    churn_features = churn_features.withColumn("payer_type_cd", when(col('payer_type_cd') == 'P', 1).otherwise(0))


# Sort the remaining columns by Spark type. The exclusion compares the field *name*:
# comparing the StructField object itself to the strings in drop_cols never matched,
# so no column was ever excluded.
for field in churn_features.schema.fields:
    if field.name in non_feature_cols:
        continue
    if isinstance(field.dataType, StringType):
        string_features.append(field.name)
    elif isinstance(field.dataType, NumericType):
        numerical_features.append(field.name)

In [0]:
# `churn_features` was already loaded, labelled and recoded by the cells above. This cell
# used to re-run the raw query, which replaced it with a frame that has no `label` column
# and a string `payer_type_cd` again, so the reload is removed and the prepared frame is kept.

# Non-payer runs: payer_type_cd is not used as a model input.
# The membership check keeps the cell safe to re-run (list.remove raises if the name is gone).
if payer_split == '0' and "payer_type_cd" in numerical_features:
    numerical_features.remove("payer_type_cd")

In [0]:
# Get stratified train, validation, test set
# Split the labelled feature frame; `df` is not defined at this point of the notebook.
strat_train, strat_val, strat_test = stratified_sampling(churn_features, P_TEST=0.2, P_VAL=0.2)

In [0]:
# Undersample majority class
# Resample the training split only (`df` is not defined at this point of the notebook),
# so the validation and test splits keep their original class balance.
strat_train_under, train_under_info = undersample_majority(strat_train)

In [0]:
# Upsample minority class
# Resample the training split only: upsampling the full `churn_features` frame would put
# copies of validation/test rows into the training data.
strat_train_up, train_up_info = upsample_minority(strat_train)

Build Pipeline for classification

In [0]:
# Select the MLflow experiment for this notebook's runs.
# EXPERIMENT_NAME is not assigned in this notebook; per the configuration cell's comment
# it is expected to come from the `config` star-import.
mlflow.set_experiment(EXPERIMENT_NAME)

In [0]:
#TODO: would love to have a function that automatically sorts the columns by type
#drop_for_features = {"judi","date","churn3"} 
#feature_cols = [c for c in df.columns if c not in drop_for_features and c not in drop_cols]

In [0]:
def get_safe_works_repartition(df, max_workers=32):
    """Repartition ``df`` to one partition per usable worker slot.

    The slot count is ``spark.executor.cores`` (default 1) multiplied by the number of
    executors, where executors are all JVMs reported by the cluster minus the driver.
    The result is clamped to the range ``[1, max_workers]``.

    Args:
        df: DataFrame to repartition.
        max_workers: Upper bound on the returned worker count. Defaults to 32, the cap
            that was previously hard-coded.

    Returns:
        A ``(repartitioned_df, safe_workers)`` tuple.
    """
    # The explicit `builtins` module is used because this file star-imports
    # pyspark.sql.functions, which shadows `max`/`min`. `__builtins__` is not a safe
    # substitute: it is only a module inside `__main__` and a plain dict elsewhere.
    conf = spark.sparkContext.getConf()
    cores_per_exec = int(conf.get("spark.executor.cores", "1"))
    # executors = all JVMs except the driver
    num_exec = spark._jsc.sc().getExecutorMemoryStatus().size() - 1
    slots = builtins.max(1, cores_per_exec * builtins.max(1, num_exec))

    safe_workers = builtins.max(1, builtins.min(slots, max_workers))
    df = df.repartition(safe_workers)  # match partitions to workers

    return df, safe_workers

In [0]:
# if num_workers > available slots, fitting fails
# determine number of workers and repartition the training data
# All three training variants are repartitioned the same way; only the worker count from
# the first call is kept (the other two are discarded into `_`).
strat_train, safe_workers = get_safe_works_repartition(strat_train)
strat_train_up, _ = get_safe_works_repartition(strat_train_up)
strat_train_under, _ = get_safe_works_repartition(strat_train_under)

In [0]:
# Show the worker count computed by the repartition step.
print(safe_workers)

# Build Pipeline

In [0]:
# For XGBoost we don't need to standardize any features

# Columns that must never reach the model: ids, dates, every churn target and the label.
# Filtering here keeps the pipeline free of target leakage even if the feature lists
# built earlier still contain such columns.
excluded_cols = set(drop_cols) | {"label", LABEL_COL}
categorical_inputs = [c for c in string_features if c not in excluded_cols]
numeric_inputs = [c for c in numerical_features if c not in excluded_cols]

# One StringIndexer per categorical column; unseen categories get their own index
# instead of failing the transform (handleInvalid="keep").
indexers = [StringIndexer(inputCol=x,
                          outputCol=x+"_index",
                          handleInvalid="keep") for x in categorical_inputs]
indexed_cols = [ x+"_index" for x in categorical_inputs]

# `other_features` is not assigned anywhere in this notebook (its definition survives only
# as disabled code), so the numeric inputs come from the schema-derived `numerical_features`.
inputs = numeric_inputs + indexed_cols

vec_assembler = VectorAssembler(inputCols=inputs, outputCol='features', handleInvalid='keep')


# Now add the xgb model to the pipeline
#eval_metrics = ["auc", "aucpr", "logloss"]
eval_metrics = ["aucpr"]

xgb = SparkXGBClassifier(
  features_col = "features",
  label_col = "label",
  num_workers = safe_workers,
  eval_metric = eval_metrics,
)

# Set the pipeline stages for the entire process
pipeline = Pipeline().setStages(indexers+[vec_assembler]+ [xgb])

The pipeline model can be fitted here with MLflow tracking.

In [0]:
# Param specs for random grid builder
# Each entry maps an XGBoost parameter name to (distribution name, low, high).
# NOTE(review): with max_depth fixed to 8..8 and every other entry commented out, the
# search space appears to hold a single configuration while n_samples=40 and numFolds=5
# are still requested — confirm whether build_random_param_maps de-duplicates, otherwise
# the same configuration may be refit many times.
spec = {
    # "n_estimators": ("int_uniform", 50, 1000),
    "max_depth":  ("int_uniform", 8, 8), # Originally "max_depth":  ("int_uniform", 4, 8),
    #"gamma": ("uniform", 0.0, 0.2),
    #"learning_rate": ("uniform", 0.01,0.5),
    # "subsample": ("uniform", 0.7, 0.9),
    #"colsample_bytree": ("uniform", 0.7, 0.9),
    # "min_child_weight": ("int_uniform", 1, 5),
    #"reg_alpha": ("uniform", 0.0, 0.1),
    #"reg_lambda": ("int_uniform", 1, 10),
    #"colsample_bylevel": ("uniform", 0, 0.6),
}

# build random xgb param map
xgb_param_maps = build_random_param_maps(xgb, spec, n_samples=40, seed=7)


# Cross-validate the whole pipeline over the sampled param maps.
# No evaluator is passed here; it is set on cv_xgb right before fitting.
cv_xgb = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=xgb_param_maps,
    numFolds=5,
    seed=7,
    # parallelism=150
)


In [0]:
import logging

# Make the "mlflow" logger emit INFO-level messages and above.
logger = logging.getLogger("mlflow")
logger.setLevel("INFO")


In [0]:
# Score every candidate by area under the precision-recall curve.
cv_xgb.setEvaluator(BinaryClassificationEvaluator(metricName="areaUnderPR"))

# Keep the fitted CrossValidatorModel: the result of fit() was previously discarded, so the
# selected model could not be inspected or scored on the held-out splits without refitting.
cv_model = cv_xgb.fit(strat_train_up)

In [0]:
import pandas as pd

# Display results
# MLflow runs whose `search_results.csv` artifact is compared, with a display name for each.
experiment_lst = [{"run_id":"a679118511bd46c3b13b76bb22e24972","name":"XGB_30_churn7"},
                  {"run_id":"d68b7ac6064c4ce7989a5716fe4b2322","name":"XGB_30_churn3"},
                  {"run_id":"3fdd4de3b3bb42b9a765a50d1e2254e3","name":"XGB_30_churn5"},
                  {"run_id":"0ae51ea70f3b48cab60c0dc9ea8049fc","name":"XGB_30_churn7_non_payer"},
                  {"run_id":"954ff7ad42d04d0eaa29eee31d0de69b","name":"XGB_30_churn7_payer"},
                  {"run_id":"a7c6fab2d77846a8918e49368f9c7e1c","name":"XGB_30_churn7_non_payer_cluster"},
                  {"run_id":"d92501999472499d85e1e794716fb5b5","name":"XGB_30_churn7_payer_cluster"}]

# Download each run's search results and tag the rows with the run's display name.
df_lst = []
for entry in experiment_lst:
    csv_path = download_artifacts(artifact_uri=f"runs:/{entry['run_id']}/search_results.csv")
    df_lst.append(pd.read_csv(csv_path).assign(model=entry["name"]))

# Stack all runs and show them grouped by parameter set, then by model.
df = pd.concat(df_lst, axis=0)

display(df.sort_values(["params","model"]))
