In [0]:
%pip install mlflow xgboost
# NOTE(review): the packages above are installed without version pins, so a later
# re-run may resolve different releases — consider pinning (e.g. `pkg==x.y.z`).


%load_ext autoreload
%autoreload 2
# Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload; run %autoreload 0

# NOTE(review): %restart_python restarts the Python process; presumably the autoreload
# settings loaded above do not survive it — confirm, and if so move them to a cell
# that runs after the restart.
%restart_python

In [0]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import DataFrame, functions as F, types as T, Window

import builtins
from datetime import datetime
from typing import Optional, Dict, Union, List, Tuple, Any
import math
import random


import pandas as pd
import numpy as np
import sklearn

from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
import mlflow

from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics


from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors, DenseVector, SparseVector, VectorUDT
from pyspark.ml import Pipeline, PipelineModel


from pyspark.ml.tuning import CrossValidatorModel, TrainValidationSplitModel, ParamGridBuilder, CrossValidator, TrainValidationSplit
from pyspark.storagelevel import StorageLevel

import matplotlib.pyplot as plt

from pyspark.sql.functions import round
import mlflow.spark 

In [0]:
from config import *
from sampling import *
from tracking import *
from tuning import * 
#from Deprecated.deprecated_tracking import *

In [0]:
# Source column that is copied into the "label" column when the feature table is loaded.
LABEL_COL = "churn3"
#FEATURES_TABLE_NAME = "teams.data_science.pp_churn_features_v3_small"


#DATE_FILTER = "2025-10-17"
#DATE_INTERVAL = 90



# These are loaded in config already (kept here, commented out, for reference only)
#EXPERIMENT_NAME = "/Users/krista@jamcity.com/PP-Churn-Model"
#FEATURES_TABLE_NAME = "teams.data_science.pp_churn_features"

In [0]:
# Read every column of the configured feature table and duplicate the target
# column (LABEL_COL) under the name "label".
churn_features = (
    spark.sql(f"select * from {FEATURES_TABLE_NAME}")
    .withColumn("label", col(LABEL_COL))
)

In [0]:
# Split the labelled features into stratified train / validation / test sets,
# requesting P_TEST=0.2 and P_VAL=0.2 from the helper.
strat_train, strat_val, strat_test = stratified_sampling(
    churn_features,
    P_TEST=0.2,
    P_VAL=0.2,
)

In [0]:
# Undersample the majority class of the TRAINING split only.
# Resampling the full `churn_features` table (as this cell used to) would place rows
# that also belong to `strat_val` / `strat_test` into the training data, leaking
# held-out examples into the fit and inflating the reported metrics.
# NOTE(review): assumes `strat_train` keeps the columns undersample_majority expects — confirm.
strat_train_under, train_under_info = undersample_majority(strat_train)

In [0]:
# Upsample the minority class of the TRAINING split only.
# Resampling the full `churn_features` table (as this cell used to) would place rows
# that also belong to `strat_val` / `strat_test` into the training data, leaking
# held-out examples into the fit and inflating the reported metrics.
# NOTE(review): assumes `strat_train` keeps the columns upsample_minority expects — confirm.
strat_train_up, train_up_info = upsample_minority(strat_train)

Build Pipeline for classification

In [0]:
# Select the MLflow experiment for this notebook; EXPERIMENT_NAME comes from the config module.
mlflow.set_experiment(EXPERIMENT_NAME)

In [0]:
#TODO: would love to have a function that automatically sorts the columns by type
#drop_for_features = {"judi","date","churn3"} 
#feature_cols = [c for c in df.columns if c not in drop_for_features and c not in drop_cols]

In [0]:
# Columns that need a StringIndexer before assembly (currently none).
string_features = []

# Columns passed to the VectorAssembler as-is, one name per line so that
# additions and removals show up cleanly in diffs. Order is significant: it
# determines the position of each column in the assembled feature vector.
other_features = [
    "unique_levels_played",
    "market_idx",
    "dayofweek",
    "rounds_played",
    "avg_attempts",
    "total_attempts",
    "avg_moves",
    "win_rate",
    "assist_success_rate",
    "unassist_success_rate",
    "assist_rate",
    "total_boosters_used",
    "total_boosters_spent",
    "used_boosters_rate",
    "spend_boosters_rate",
    "avg_difficulty_score",
    "rate_hard_levels",
    "rate_superhard_levels",
    "min_room_id_int",
    "max_room_id_int",
    "daily_win_rate_ref",
    "daily_avg_boosters_used_ref",
    "daily_avg_boosters_spent_ref",
    "attribution_source_cd_idx",
    "country_cd_idx",
    "payer_type_cd_idx",
    "iap_lifetime_amt",
    "days_since_install",
    "days_since_last_purchase",
    "ad_revenue_amt",
    "iap_revenue_amt",
    "session_qty",
    "total_session_length_qty",
    "avg_session_length",
    "sessions_per_round",
    "avg_population_wr_on_levels_played_today",
    "avg_population_assisted_rate_today",
    "avg_population_attempts_today",
    "wr_diff_vs_population",
    "attempts_diff_vs_population",
    "assist_rate_diff_vs_population",
    "active_days_l7d",
    "total_rounds_l7d",
    "avg_rounds_l7d",
    "avg_win_rate_l7d",
    "avg_attempts_l7d",
    "boosters_used_l7d",
    "boosters_spent_l7d",
    "avg_used_boosters_rate_l7d",
    "active_days_l14d",
    "avg_rounds_l14d",
    "avg_win_rate_l14d",
    "std_rounds_l14d",
    "std_win_rate_l14d",
    "active_days_l30d",
    "avg_rounds_l30d",
    "rounds_trend_weekly",
    "win_rate_trend_weekly",
    "boosters_usage_trend_weekly",
    "rounds_ratio_7d_vs_14_7d",
    "frequency_ratio_7d_vs_14d",
    "levels_progressed_l7d",
    "levels_progressed_l14d",
    "levels_progressed_l30d",
    "days_on_current_max_level",
    "level_diversity_ratio",
]


In [0]:
def get_safe_works_repartition(df, max_workers=32):
    """Repartition ``df`` to match the number of usable worker slots.

    The slot count is ``spark.executor.cores`` (default "1") multiplied by the
    number of executors, floored at 1 and capped at ``max_workers``.

    The real built-in ``max``/``min`` are reached through the ``builtins``
    module because ``from pyspark.sql.functions import *`` shadows those names
    in this notebook. ``__builtins__`` is not used: it is the module only in
    ``__main__`` and a plain dict elsewhere, so ``__builtins__.max`` breaks as
    soon as this function is moved into an imported module.

    Args:
        df: DataFrame-like object exposing ``repartition(n)``.
        max_workers: Upper bound on the returned worker count (default 32,
            the cap that was previously hard-coded).

    Returns:
        Tuple ``(repartitioned_df, safe_workers)``.
    """
    conf = spark.sparkContext.getConf()
    cores_per_exec = int(conf.get("spark.executor.cores", "1"))
    # executors = all JVMs except the driver
    num_exec = spark._jsc.sc().getExecutorMemoryStatus().size() - 1
    # Both factors are floored at 1 so a driver-only (local) session still yields one slot.
    slots = builtins.max(1, cores_per_exec * builtins.max(1, num_exec))

    safe_workers = builtins.max(1, builtins.min(slots, max_workers))
    # Match the partition count to the worker count.
    return df.repartition(safe_workers), safe_workers

In [0]:
# if num_workers > available slots, fitting fails
# determine number of workers and repartition the training data
# All three training variants go through the same helper; the worker count is
# computed from the cluster configuration, so only the first call's value is kept.
strat_train, safe_workers = get_safe_works_repartition(strat_train)
strat_train_up, _ = get_safe_works_repartition(strat_train_up)
strat_train_under, _ = get_safe_works_repartition(strat_train_under)

In [0]:
# Show the worker count that will be passed to SparkXGBClassifier(num_workers=...).
print(safe_workers)

# Build Pipeline

In [0]:
# For XGBoost we don't need to standardize any features, so the pipeline is only:
#   StringIndexer per categorical column -> VectorAssembler -> SparkXGBClassifier

# Every categorical column is indexed into "<name>_index"; unseen/invalid values are kept.
indexed_cols = [f"{name}_index" for name in string_features]
indexers = [
    StringIndexer(inputCol=name, outputCol=indexed_name, handleInvalid="keep")
    for name, indexed_name in zip(string_features, indexed_cols)
]

# Assembler input order: pass-through columns first, then the indexed categoricals.
inputs = other_features + indexed_cols

vec_assembler = VectorAssembler(
    inputCols=inputs,
    outputCol="features",
    handleInvalid="keep",
)


# Now add the xgb model to the pipeline
#eval_metrics = ["auc", "aucpr", "logloss"]
eval_metrics = ["logloss"]

xgb = SparkXGBClassifier(
    features_col="features",
    label_col="label",
    num_workers=safe_workers,
    eval_metric=eval_metrics,
)

# Set the pipeline stages for the entire process
pipeline = Pipeline(stages=[*indexers, vec_assembler, xgb])

Fit the pipeline model here with MLflow tracking.

In [0]:
# Param specs for random grid builder: hyper-parameter name -> spec tuple
# (presumably (distribution, low, high) — confirm against build_random_param_maps).
#
# "colsample_bytree" used to be listed twice in this literal. A dict literal keeps
# only the LAST value of a repeated key, so the earlier ("uniform", 0.7, 0.9) range was
# silently discarded. The duplicate is collapsed into a single entry that keeps the
# value and key position that were actually in effect, so sampled maps are unchanged.
# NOTE(review): XGBoost documents colsample_by* as valid in (0, 1]; a lower bound of 0
# can sample values very close to 0 — confirm this range is intended.
spec = {
    "n_estimators": ("int_uniform", 50, 1000),
    "max_depth": ("int_uniform", 4, 50),
    "gamma": ("uniform", 0.0, 0.2),
    "learning_rate": ("uniform", 0.01, 0.5),
    "subsample": ("uniform", 0.7, 0.9),
    "colsample_bytree": ("uniform", 0, 0.6),
    "min_child_weight": ("int_uniform", 1, 5),
    "reg_alpha": ("uniform", 0.0, 0.1),
    "reg_lambda": ("int_uniform", 1, 10),
    "colsample_bylevel": ("uniform", 0, 0.6),
}

# Sample 40 random parameter maps for the xgb stage from `spec`, using seed 7.
xgb_param_maps = build_random_param_maps(
    xgb,
    spec,
    n_samples=40,
    seed=7,
)

# 2-fold cross-validation of the full pipeline over the sampled parameter maps.
# NOTE(review): no evaluator is passed here — presumably run_spark_ml_training
# attaches one before fitting; confirm.
cv_xgb = CrossValidator(estimator=pipeline, estimatorParamMaps=xgb_param_maps, numFolds=2, seed=7)


In [0]:
import logging

# Set the MLflow logging level to INFO
# (`logger` is the standard-library logger registered under the name "mlflow").
logger = logging.getLogger("mlflow")
logger.setLevel(logging.INFO)


In [0]:
# Run the cross-validated XGBoost search on the upsampled training data.
# NOTE(review): the "dataset" tag is hard-coded to "small" while the table itself comes
# from config's FEATURES_TABLE_NAME — confirm the tag matches the table in use.
run_info_cv_upsampled = run_spark_ml_training(
    estimator=cv_xgb,
    train_df=strat_train_up,
    test_df=strat_test,
    val_df=strat_val,  # prefer tuning on validation
    run_name="spark-ml-search-xgb-up-fit-new-path",
    extra_tags={
        "up_sampled": True,
        "under_sampled": False,
        "dataset": "small",
        "CV": True,
        "num_workers": safe_workers,
    },
)

In [0]:
import mlflow
from pyspark.sql.functions import struct, col

# NOTE(review): hard-coded run id — presumably the best model logged by an earlier
# search run; confirm it is the run you intend to score with before reusing this cell.
model_uri = 'runs:/ffc34363940c4961860521630263d572/models/best_model'

# Load model as a Spark UDF. Override result_type if the model does not return double values.
loaded_model = mlflow.pyfunc.spark_udf(spark, model_uri=model_uri)


In [0]:

# Predict on a Spark DataFrame.
#df.withColumn('predictions', loaded_model(struct(*map(col, df.columns))))

In [0]:
# Display the value returned by run_spark_ml_training for the upsampled run.
run_info_cv_upsampled

Cross Validation

In [0]:
#import mlflow
#from pyspark.sql.functions import struct, col

#model_uri = 'runs:/63d87850c1f046ef94b5075393b0a9bf/estimator_spark_model'

# Load model as a Spark UDF. Override result_type if the model does not return double values.
#loaded_model = mlflow.pyfunc.spark_udf(spark, model_uri=model_uri)



In [0]:
#loaded_model.__dict__

In [0]:
# Predict on a Spark DataFrame.
#df.withColumn('predictions', loaded_model(struct(*map(col, df.columns))))

In [0]:
#print(run_info_cv_upsampled)

In [0]:
# Example: training a cross-validation pipeline with run_spark_cv_with_logging_spark_only().
# The grid search below is wrapped in a string literal, so it does not execute;
# it takes a long time to run.

"""
paramGrid = (ParamGridBuilder()
             .addGrid(xgb.reg_alpha,[1e-5, 1e-2, 0.1])
             .addGrid(xgb.reg_lambda,[1e-5, 1e-2, 0.1])
             .addGrid(xgb.gamma, [i/10.0 for i in range(0,2)])
             .addGrid(xgb.n_estimators,[10,500,20])
             #.addGrid(xgb.learning_rate,[0.01,0.1])
             .addGrid(xgb.max_depth, range(4,50))
             #.addGrid(xgb.min_child_weight, [3.0, 4.0])
             #.addGrid(xgb.colsample_bytree, [i/10.0 for i in range(3,6)])
             #.addGrid(xgb.colsample_bylevel, [i/10.0 for i in range(3,6)])
             .build())


#TODO: Figure out how the evaluator is handled in the run_spark_cv_with_logging_spark_only()
cv = CrossValidator(estimator=pipeline, 
                    estimatorParamMaps=paramGrid, 
                    numFolds=3)
"""

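# NOTE(review): disabled example call. The original was missing its closing
# parenthesis; it is added below so the call is valid when uncommented.
#run_info = run_spark_cv_with_logging_spark_only(
#    estimator = cv,
#    train_df = strat_train_under,
#    test_df = strat_test,
#    val_df = strat_val,     # prefer tuning on validation
#    run_name = "spark-ml-search-xgb-under-cv",
#    extra_tags = {'under_sampled':True,"cv":True}
#)


# NOTE(review): `bestModel` is an attribute of the fitted CrossValidatorModel, not of the
# CrossValidator estimator `cv`; presumably this should read it from the fit result — confirm.
#best_model =  cv.bestModel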