In [0]:
# %pip install mlflow xgboost


# %load_ext autoreload
# %autoreload 2
# # Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# # To disable autoreload; run %autoreload 0

# %restart_python

In [0]:

# NOTE: the star import below shadows Python builtins such as max, min and
# round; use the `builtins` module when the plain Python versions are needed.
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import DataFrame, functions as F, types as T, Window

import builtins
from datetime import datetime
from typing import Optional, Dict, Union, List, Tuple, Any
import math
import random

import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt

from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
import mlflow

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.feature import Imputer
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors, DenseVector, SparseVector, VectorUDT
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.tuning import CrossValidatorModel, TrainValidationSplitModel
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.storagelevel import StorageLevel

from pyspark.sql.functions import round

In [0]:
from config import *
from sampling import *
from tracking import *

In [0]:
# Label columns. LABEL_COL is the one turned into the training `label`;
# the other two are declared alongside it (presumably the 7- and 14-day
# churn horizons — TODO confirm).
LABEL_COL, LABEL_COL7, LABEL_COL14 = "churn3", "churn7", "churn14"

# NOTE(review): units/usage of these two are not visible in this notebook
# section — DATE_INTERVAL is presumably a number of days.
DATE_FILTER, DATE_INTERVAL = "2025-10-17", 90

# Feature table read by this notebook (v2 of the churn feature table).
FEATURES_TABLE_NAME = "teams.data_science.pp_churn_features_v2"

# Values already provided by `config`, kept here for reference only:
#   EXPERIMENT_NAME = "/Users/krista@jamcity.com/PP-Churn-Model"
#   FEATURES_TABLE_NAME = "teams.data_science.pp_churn_features"

In [0]:
# Load every column of the feature table.
df = spark.sql(f"select * from {FEATURES_TABLE_NAME}")

# Number of days trimmed from the start / end of the available date range
# (presumably so look-back features and the churn label are fully observed
# for every kept row — TODO confirm).
past_need, past_need_max = 30, 3

# Earliest and latest dates present in the table.
dates = df.agg(
    F.min("date").alias("min_date"),
    F.max("date").alias("max_date"),
).first()

valid_start = F.date_add(F.lit(dates["min_date"]), past_need)
valid_end = F.date_sub(F.lit(dates["max_date"]), past_need_max)

# Keep rows with valid_start <= date < valid_end, recode the label column to
# 0/1 and expose it under the generic name `label` used by the estimators.
# NOTE(review): only an explicit False becomes 0; a NULL label falls through
# to `otherwise(1)` and is counted as churn — confirm that is intended.
df = (
    df.filter((F.col("date") >= valid_start) & (F.col("date") < valid_end))
    .withColumn(LABEL_COL, F.when(F.col(LABEL_COL) == False, 0).otherwise(1))
    .withColumn("label", F.col(LABEL_COL))
)

In [0]:
# # Muestreo estratificado manteniendo proporción de churn3
# sample_size = 10000
# total_count = df.count()
# sample_fraction = sample_size / total_count

# df_sample = df.sampleBy("churn3", fractions={0: sample_fraction, 1: sample_fraction}, seed=42)

# # Verifica las proporciones
# df_sample.groupBy("churn3").count().show()

# df = df_sample

In [0]:
# Split the filtered feature frame into stratified train / validation / test sets.
# P_TEST and P_VAL are presumably the fractions held out for test and
# validation (0.2 each) — see `sampling.stratified_sampling` to confirm.
strat_train, strat_val, strat_test = stratified_sampling(df, P_TEST=0.2, P_VAL=0.2)

In [0]:
# Undersample the majority class of the TRAINING split only.
# Resampling must start from `strat_train`, not the full `df`: `df` still
# contains the rows of `strat_val` / `strat_test`, so resampling it would put
# held-out rows into the training set and inflate every downstream metric.
# Returns the resampled frame plus an info object describing the resampling.
strat_train_under, train_under_info = undersample_majority(strat_train)

In [0]:
# Upsample the minority class of the TRAINING split only.
# Resampling must start from `strat_train`, not the full `df`: `df` still
# contains the rows of `strat_val` / `strat_test`, so upsampling it would copy
# held-out rows into the training set and inflate every downstream metric.
# Returns the resampled frame plus an info object describing the resampling.
strat_train_up, train_up_info = upsample_minority(strat_train)

Build Pipeline for classification

In [0]:
# PA 2025-10-24: temporary, remove later. -->
# Overrides the EXPERIMENT_NAME loaded from `config` with a personal
# workspace path so runs from this notebook land in a separate experiment.
EXPERIMENT_NAME = "/Users/gpereyra@jamcity.com/pp-churn/churn-models/MLFlow-tracking"
# PA 2025-10-24: temporary, remove later. <--
# Select the experiment that subsequent MLflow runs are recorded under.
mlflow.set_experiment(EXPERIMENT_NAME)

In [0]:
#TODO: would love to have a function that automatically sorts the columns by type
#drop_for_features = {"judi","date","churn3"} 
#feature_cols = [c for c in df.columns if c not in drop_for_features and c not in drop_cols]


In [0]:
# Categorical string columns fed to the model; currently none.
# Add back market_idx (or whatever the indexed market column is) here.
string_features = []

# Columns passed to the Imputer / VectorAssembler as model inputs.
# Order matters: it defines the position of each value in the feature vector.
other_features = [
    "unique_levels_played",
    "rounds_played",
    "avg_attempts",
    "total_attempts",
    "avg_moves",
    "win_rate",
    "assist_success_rate",
    "unassist_success_rate",
    "assist_rate",
    "total_boosters_used",
    "total_boosters_spent",
    "used_boosters_rate",
    "spend_boosters_rate",
    "avg_difficulty_score",
    "rate_hard_levels",
    "rate_superhard_levels",
    "min_room_id_int",
    "max_room_id_int",
    "daily_win_rate_ref",
    "daily_avg_boosters_used_ref",
    "daily_avg_boosters_spent_ref",
    "attribution_source_cd_idx",
    "country_cd_idx",
    "payer_type_cd_idx",
    "iap_lifetime_amt",
    "days_since_install",
    "days_since_last_purchase",
    "ad_revenue_amt",
    "iap_revenue_amt",
    "session_qty",
    "total_session_length_qty",
    "avg_session_length",
    "sessions_per_round",
    "avg_population_wr_on_levels_played_today",
    "avg_population_assisted_rate_today",
    "avg_population_attempts_today",
    "wr_diff_vs_population",
    "attempts_diff_vs_population",
    "assist_rate_diff_vs_population",
    "active_days_l7d",
    "total_rounds_l7d",
    "avg_rounds_l7d",
    "avg_win_rate_l7d",
    "avg_attempts_l7d",
    "boosters_used_l7d",
    "boosters_spent_l7d",
    "avg_used_boosters_rate_l7d",
    "active_days_l14d",
    "avg_rounds_l14d",
    "avg_win_rate_l14d",
    "std_rounds_l14d",
    "std_win_rate_l14d",
    "active_days_l30d",
    "avg_rounds_l30d",
    "rounds_trend_weekly",
    "win_rate_trend_weekly",
    "boosters_usage_trend_weekly",
    "rounds_ratio_7d_vs_14_7d",
    "frequency_ratio_7d_vs_14d",
    "levels_progressed_l7d",
    "levels_progressed_l14d",
    "levels_progressed_l30d",
    "days_on_current_max_level",
    "level_diversity_ratio",
]


In [0]:
def get_safe_works_repartition(df, max_workers=32):
    """Repartition ``df`` to match the task slots currently available.

    The slot count is ``spark.executor.cores`` (default 1) multiplied by the
    number of executors, clamped to the range ``[1, max_workers]``.

    Args:
        df: Spark DataFrame to repartition.
        max_workers: Upper bound on the returned worker/partition count.
            Defaults to 32, the cap that was previously hard-coded.

    Returns:
        tuple: ``(repartitioned_df, safe_workers)`` where ``safe_workers`` is
        the partition count that was applied.
    """
    # `from pyspark.sql.functions import *` shadows max/min in this notebook.
    # Go through the `builtins` module rather than `__builtins__`, which is a
    # plain dict (no `.max` attribute) whenever this code runs from an
    # imported module instead of `__main__`.
    import builtins

    conf = spark.sparkContext.getConf()
    cores_per_exec = int(conf.get("spark.executor.cores", "1"))

    # executors = all JVMs except the driver
    num_exec = spark._jsc.sc().getExecutorMemoryStatus().size() - 1

    # Treat "no executors registered" as a single executor so the result
    # never drops below one slot.
    slots = builtins.max(1, cores_per_exec * builtins.max(1, num_exec))
    safe_workers = builtins.max(1, builtins.min(slots, max_workers))

    # Match the number of partitions to the number of workers.
    return df.repartition(safe_workers), safe_workers

In [0]:
# if num_workers > available slots, fitting fails
# determine number of workers and repartition the training data
# `safe_workers` is taken from the first call only; the other two calls
# discard their worker count and keep just the repartitioned frame.
strat_train, safe_workers = get_safe_works_repartition(strat_train)
strat_train_up, _ = get_safe_works_repartition(strat_train_up)
strat_train_under, _ = get_safe_works_repartition(strat_train_under)

# Build Pipeline

In [0]:
#### Logistic Regression Pipeline
# Imputer, VectorAssembler, StandardScaler, LogisticRegression and Pipeline
# all come from the notebook's import cell; nothing is imported here.

# --- Feature preparation ---
# Replace missing values with the column mean, writing back to the same
# column names so later stages keep using `other_features`.
# NOTE(review): columns ending in `_cd_idx` look like category indices; they
# are mean-imputed and standardized like continuous values here — confirm
# that is intended.
imputer = Imputer(strategy="mean", inputCols=other_features, outputCols=other_features)

# Pack the feature columns into one vector, then centre and scale it.
assembler = VectorAssembler(inputCols=other_features, outputCol="features_raw")
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features",
    withMean=True,
    withStd=True,
)

# Not referenced by the logistic-regression stages in this cell; kept because
# other cells may read it.
eval_metrics = ["auc", "aucpr", "logloss"]

# --- Classifier ---
lr = LogisticRegression(featuresCol="features", labelCol="label")

# Full pipeline: impute -> assemble -> scale -> classify.
lr_pipeline = Pipeline(stages=[imputer, assembler, scaler, lr])


In [0]:
# Set Spark's log level to INFO for the cells that follow.
spark.sparkContext.setLogLevel("INFO")

In [0]:

# --- Search grid and cross-validator for the logistic-regression pipeline ---
#
# Wider grids tried earlier, kept for reference:
#   regParam [1e-4, 1e-3, 1e-2, 1e-1] x elasticNetParam [0.0, 0.25, 0.5, 0.75, 1.0]
#     x maxIter [25, 50, 100, 150]
#   regParam [1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2] x elasticNetParam [0.5, 0.7, 0.8, 0.9, 1.0]
#     x maxIter [100, 150] x threshold [0.3, 0.35, 0.4, 0.45, 0.5]
# Earlier, heavier CrossValidator setting: numFolds=3, parallelism=8.

# Model selection metric: area under the precision-recall curve.
evaluator_pr = BinaryClassificationEvaluator(metricName="areaUnderPR")

# Current grid: 3 x 2 x 1 x 5 = 30 parameter combinations.
# NOTE(review): areaUnderPR is computed from the raw prediction scores, which
# `lr.threshold` does not alter, so inside CrossValidator the five threshold
# values should score identically. Unless the logging helper re-scores the
# collected sub-models on hard predictions, they only multiply the number of
# fits by five — TODO confirm.
lr_grid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [1e-5, 1e-3, 1e-2])
    .addGrid(lr.elasticNetParam, [0.5, 1.0])
    .addGrid(lr.maxIter, [100])
    .addGrid(lr.threshold, [0.3, 0.35, 0.4, 0.45, 0.5])
    .build()
)

# 2-fold CV, two fits in parallel, fixed seed for repeatable folds; every
# fitted sub-model is retained on the resulting CrossValidatorModel.
lr_cv = CrossValidator(
    estimator=lr_pipeline,
    estimatorParamMaps=lr_grid,
    evaluator=evaluator_pr,
    numFolds=2,
    parallelism=2,
    seed=42,
    collectSubModels=True,
)

In [0]:
# Fit the logistic-regression CrossValidator on the up-sampled training set
# and record the run through the tracking helper.
run_info = run_spark_cv_with_logging_spark_only(
    estimator=lr_cv,
    train_df=strat_train_up,
    val_df=strat_val,  # prefer tuning on validation
    test_df=strat_test,
    run_name="spark-ml-search-lr-cv-up",
    extra_tags={"up_sampled": True},
)

You can fit your pipeline model here with MLflow tracking...

Cross Validation

In [0]:
# Example using run_spark_cv_with_logging_spark_only() function to train a cross validation pipeline
# This will take a long time to run
#
# The example below is DISABLED: it is wrapped in a bare triple-quoted string,
# so none of it executes. Because that string is the cell's last expression,
# the notebook echoes it as the cell output.
# NOTE(review): before re-enabling, check the grid:
#   - n_estimators lists the three values 10, 500, 20; range(10, 500, 20) may
#     have been intended.
#   - max_depth spans range(4, 50), i.e. 46 values, so the grid as written has
#     3 x 3 x 2 x 3 x 46 = 828 combinations, each fitted once per fold.
#   - `xgb` and `pipeline` are not defined in the visible part of the notebook.

"""
paramGrid = (ParamGridBuilder()
             .addGrid(xgb.reg_alpha,[1e-5, 1e-2, 0.1])
             .addGrid(xgb.reg_lambda,[1e-5, 1e-2, 0.1])
             .addGrid(xgb.gamma, [i/10.0 for i in range(0,2)])
             .addGrid(xgb.n_estimators,[10,500,20])
             #.addGrid(xgb.learning_rate,[0.01,0.1])
             .addGrid(xgb.max_depth, range(4,50))
             #.addGrid(xgb.min_child_weight, [3.0, 4.0])
             #.addGrid(xgb.colsample_bytree, [i/10.0 for i in range(3,6)])
             #.addGrid(xgb.colsample_bylevel, [i/10.0 for i in range(3,6)])
             .build())


#TODO: Figure out how the evaluator is handled in the run_spark_cv_with_logging_spark_only()
cv = CrossValidator(estimator=pipeline, 
                    estimatorParamMaps=paramGrid, 
                    numFolds=3)
"""

# Disabled tracked run of the example above, on the under-sampled training set.
#run_info = run_spark_cv_with_logging_spark_only(
#    estimator = cv,
#    train_df = strat_train_under,
#    test_df = strat_test,
#    val_df = strat_val,     # prefer tuning on validation
#    run_name = "spark-ml-search-xgb-under-cv",
#    extra_tags = {'under_sampled':True,"cv":True}
#)

# NOTE(review): `bestModel` is an attribute of the fitted CrossValidatorModel,
# not of the unfitted CrossValidator `cv`; take it from the fitted model.
#best_model =  cv.bestModel

In [0]:
# ----------------------------------------------------------
# Point MLflow at the existing experiment.
# `mlflow` and `BinaryClassificationEvaluator` come from the notebook's
# import cell; they are not re-imported here.
# NOTE(review): this hard-coded experiment ID replaces the experiment selected
# earlier via mlflow.set_experiment(EXPERIMENT_NAME) — confirm which one is
# intended and move the ID to the config if it stays.
# ----------------------------------------------------------
mlflow.set_experiment(experiment_id="493970548959259")

with mlflow.start_run(run_name="logreg-cv-run") as run:
    # 2-fold cross-validation over `lr_grid`, selecting on area under PR.
    # The seed is fixed (same value as the `lr_cv` defined earlier in the
    # notebook) so fold assignment, and therefore the selected model, is
    # repeatable; without it the default seed can differ between sessions.
    lr_cv = CrossValidator(
        estimator=lr_pipeline,
        estimatorParamMaps=lr_grid,
        evaluator=BinaryClassificationEvaluator(metricName="areaUnderPR"),
        numFolds=2,
        parallelism=2,
        seed=42,
    )

    # Fit on the up-sampled training set and keep the best pipeline model.
    cv_model = lr_cv.fit(strat_train_up)
    best_model = cv_model.bestModel

    # Score the untouched validation split.
    scored = best_model.transform(strat_val)

    # Ranking metrics computed from the `probability` vector column.
    evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="probability")
    auc_roc = evaluator.evaluate(scored, {evaluator.metricName: "areaUnderROC"})
    auc_pr = evaluator.evaluate(scored, {evaluator.metricName: "areaUnderPR"})

    # Record validation metrics and the fitted pipeline on the MLflow run.
    mlflow.log_metric("auc_roc", auc_roc)
    mlflow.log_metric("auc_pr", auc_pr)
    mlflow.spark.log_model(best_model, "spark_model")

    print(f"Run loggeado en MLflow con ID: {run.info.run_id}")

In [0]:
# ----------------------------------------------------------
# Point MLflow at the existing experiment.
# `mlflow` and `BinaryClassificationEvaluator` come from the notebook's
# import cell; they are not re-imported here.
# NOTE(review): this cell repeats the preceding "logreg-cv-run" cell, adding
# only collectSubModels=True, so running the notebook top to bottom performs
# the whole search twice under the same run name. Consider keeping one of the
# two or turning them into a single parameterized function.
# NOTE(review): the hard-coded experiment ID replaces the experiment selected
# earlier via mlflow.set_experiment(EXPERIMENT_NAME) — confirm which one is
# intended.
# ----------------------------------------------------------
mlflow.set_experiment(experiment_id="493970548959259")

with mlflow.start_run(run_name="logreg-cv-run") as run:
    # 2-fold cross-validation over `lr_grid`, selecting on area under PR and
    # retaining every fitted sub-model on the resulting CrossValidatorModel.
    # The seed is fixed (same value as the `lr_cv` defined earlier in the
    # notebook) so fold assignment, and therefore the selected model, is
    # repeatable; without it the default seed can differ between sessions.
    lr_cv = CrossValidator(
        estimator=lr_pipeline,
        estimatorParamMaps=lr_grid,
        evaluator=BinaryClassificationEvaluator(metricName="areaUnderPR"),
        numFolds=2,
        parallelism=2,
        seed=42,
        collectSubModels=True,
    )

    # Fit on the up-sampled training set and keep the best pipeline model.
    cv_model = lr_cv.fit(strat_train_up)
    best_model = cv_model.bestModel

    # Score the untouched validation split.
    scored = best_model.transform(strat_val)

    # Ranking metrics computed from the `probability` vector column.
    evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="probability")
    auc_roc = evaluator.evaluate(scored, {evaluator.metricName: "areaUnderROC"})
    auc_pr = evaluator.evaluate(scored, {evaluator.metricName: "areaUnderPR"})

    # Record validation metrics and the fitted pipeline on the MLflow run.
    mlflow.log_metric("auc_roc", auc_roc)
    mlflow.log_metric("auc_pr", auc_pr)
    mlflow.spark.log_model(best_model, "spark_model")

    print(f"Run loggeado en MLflow con ID: {run.info.run_id}")