In [0]:
from pyspark.sql.functions import col

In [0]:
# --- Run configuration ---------------------------------------------------
# Column of the features table that is copied into "label" when data is loaded.
LABEL_COL = "churn3"
# Date settings; not referenced in the cells visible here
# (presumably consumed by later cells or imported helpers - TODO confirm).
DATE_FILTER = "2025-10-17"
DATE_INTERVAL = 90

# Set DEBUG to True to read the "_small" variant of the features table.
DEBUG = False
FEATURES_TABLE_NAME = (
    "teams.data_science.pp_churn_features_v3_small"
    if DEBUG
    else "teams.data_science.pp_churn_features_v3"
)

In [0]:
# Read every column of the configured features table and duplicate the
# target column (LABEL_COL) under the generic name "label".
churn_features = (
    spark.sql(f"select * from {FEATURES_TABLE_NAME}")
    .withColumn("label", col(LABEL_COL))
)

In [0]:
# Stratified down-sampling that keeps the proportion of the label classes.
sample_size = 100000
total_count = churn_features.count()

# Equivalent to min(1.0, sample_size / total_count), but written as a
# conditional so that:
#   * an empty table (total_count == 0) does not raise ZeroDivisionError;
#   * the cell can be re-run after `from pyspark.sql.functions import *`
#     has replaced the builtin `min` with the Spark column function.
sample_fraction = 1.0 if total_count <= sample_size else sample_size / total_count

# The same fraction is applied to both classes, so their ratio is kept.
# sampleBy draws per-stratum by fraction, so expect the resulting row count
# to be close to - not exactly - sample_size.
df_sample = churn_features.sampleBy(
    LABEL_COL,  # stratify on the configured label column instead of a literal
    fractions={0: sample_fraction, 1: sample_fraction},
    seed=42,
)

# Check the class proportions of the sample
df_sample.groupBy(LABEL_COL).count().show()

# NOTE: this rebinds churn_features, so re-running this cell samples the
# sample; re-run the load cell first to start again from the full table.
churn_features = df_sample

In [0]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import DataFrame, functions as F, types as T, Window

import builtins
from datetime import datetime
from typing import Optional, Dict, Union, List, Tuple, Any
import math
import random


import pandas as pd
import numpy as np
import sklearn

from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
import mlflow

from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics


from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors, DenseVector, SparseVector, VectorUDT
from pyspark.ml import Pipeline, PipelineModel

from pyspark.ml.tuning import CrossValidatorModel, TrainValidationSplitModel, ParamGridBuilder, CrossValidator, TrainValidationSplit
from pyspark.storagelevel import StorageLevel

import matplotlib.pyplot as plt

from pyspark.sql.functions import round 

In [0]:
from config import *
from sampling import *
# from tracking import *
#from Deprecated.deprecated_tracking import *

In [0]:
# Split the (sampled) features into stratified train / validation / test
# sets; P_TEST and P_VAL are both 0.2.
strat_train, strat_val, strat_test = stratified_sampling(
    churn_features,
    P_TEST=0.2,
    P_VAL=0.2,
)

In [0]:
# Undersample majority class
# strat_train_under, train_under_info = undersample_majority(churn_features)

In [0]:
# Upsample minority class
# strat_train_up, train_up_info = upsample_minority(churn_features)

Build Pipeline for classification

In [0]:
# Select the MLflow experiment for this notebook. EXPERIMENT_NAME is not
# defined in the visible cells (presumably provided by `from config import *`).
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

In [0]:
#TODO: would love to have a function that automatically sorts the columns by type - dynamic column selection/preprocessing
#drop_for_features = {"judi","date","churn3"} 
#feature_cols = [c for c in df.columns if c not in drop_for_features and c not in drop_cols]

In [0]:
# No string-typed features are used at the moment.
string_features = list()

# Columns fed to the imputer / vector assembler, in this exact order.
other_features = [
    "unique_levels_played",
    "market_idx",
    "dayofweek",
    "rounds_played",
    "avg_attempts",
    "total_attempts",
    "avg_moves",
    "win_rate",
    "assist_success_rate",
    "unassist_success_rate",
    "assist_rate",
    "total_boosters_used",
    "total_boosters_spent",
    "used_boosters_rate",
    "spend_boosters_rate",
    "avg_difficulty_score",
    "rate_hard_levels",
    "rate_superhard_levels",
    "min_room_id_int",
    "max_room_id_int",
    "daily_win_rate_ref",
    "daily_avg_boosters_used_ref",
    "daily_avg_boosters_spent_ref",
    "attribution_source_cd_idx",
    "country_cd_idx",
    "payer_type_cd_idx",
    "iap_lifetime_amt",
    "days_since_install",
    "days_since_last_purchase",
    "ad_revenue_amt",
    "iap_revenue_amt",
    "session_qty",
    "total_session_length_qty",
    "avg_session_length",
    "sessions_per_round",
    "avg_population_wr_on_levels_played_today",
    "avg_population_assisted_rate_today",
    "avg_population_attempts_today",
    "wr_diff_vs_population",
    "attempts_diff_vs_population",
    "assist_rate_diff_vs_population",
    # columns suffixed _l7d
    "active_days_l7d",
    "total_rounds_l7d",
    "avg_rounds_l7d",
    "avg_win_rate_l7d",
    "avg_attempts_l7d",
    "boosters_used_l7d",
    "boosters_spent_l7d",
    "avg_used_boosters_rate_l7d",
    # columns suffixed _l14d
    "active_days_l14d",
    "avg_rounds_l14d",
    "avg_win_rate_l14d",
    "std_rounds_l14d",
    "std_win_rate_l14d",
    # columns suffixed _l30d
    "active_days_l30d",
    "avg_rounds_l30d",
    # trend / ratio columns
    "rounds_trend_weekly",
    "win_rate_trend_weekly",
    "boosters_usage_trend_weekly",
    "rounds_ratio_7d_vs_14_7d",
    "frequency_ratio_7d_vs_14d",
    # level progression columns
    "levels_progressed_l7d",
    "levels_progressed_l14d",
    "levels_progressed_l30d",
    "days_on_current_max_level",
    "level_diversity_ratio",
]


In [0]:
def get_safe_works_repartition(df, max_workers=32):
    """Repartition ``df`` to a worker count the cluster can schedule.

    The count is ``spark.executor.cores`` (treated as 1 when unset) multiplied
    by the number of executors, clamped to the range ``[1, max_workers]``.

    Args:
        df: Object exposing ``repartition(n)`` (a Spark DataFrame in this
            notebook).
        max_workers: Upper bound for the returned worker count. Defaults to
            32, the cap that used to be hard-coded.

    Returns:
        Tuple ``(repartitioned_df, safe_workers)``.

    Note:
        Reads the notebook-global ``spark`` session.
    """
    conf = spark.sparkContext.getConf()
    cores_per_exec = int(conf.get("spark.executor.cores", "1"))
    # Per the original note: the memory-status map counts every JVM including
    # the driver, so subtract one to get the executor count.
    num_exec = spark._jsc.sc().getExecutorMemoryStatus().size() - 1

    # Use the `builtins` module explicitly:
    #   * `from pyspark.sql.functions import *` shadows max/min in this notebook;
    #   * `__builtins__` is a CPython implementation detail - it is a module
    #     only in __main__ and a plain dict in imported modules, so
    #     `__builtins__.max` breaks as soon as this helper lives in a .py file.
    slots = builtins.max(1, cores_per_exec * builtins.max(1, num_exec))
    safe_workers = builtins.max(1, builtins.min(slots, max_workers))

    # Match the number of partitions to the number of workers.
    return df.repartition(safe_workers), safe_workers

In [0]:
# Fitting fails when num_workers exceeds the available slots, so derive a
# safe worker count from the cluster and repartition the training split to it.
strat_train, safe_workers = get_safe_works_repartition(df=strat_train)
# strat_train_up, _ = get_safe_works_repartition(strat_train_up)
# strat_train_under, _ = get_safe_works_repartition(strat_train_under)

In [0]:
# Show the worker count returned by get_safe_works_repartition for the training split.
print(safe_workers)

# Build Pipeline

In [0]:
#### Logistic Regression Pipeline

#Prepare Data
from pyspark.ml.feature import Imputer, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
# --- Preprocessing stages ---
# Mean-impute missing values; output columns reuse the input column names,
# so the imputed values replace the originals.
imputer = Imputer(
    strategy="mean",
    inputCols=other_features,
    outputCols=other_features,
)
# Pack the feature columns into a single vector column ...
assembler = VectorAssembler(outputCol="features_raw", inputCols=other_features)
# ... and standardise it (centre on the mean, scale to unit std).
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features_raw",
    outputCol="features",
)


# --- Classifier ---
# Metric names; not consumed by anything in this cell.
eval_metrics = ["auc", "aucpr", "logloss"]

lr = LogisticRegression(
    family="binomial",
    labelCol="label",
    featuresCol="features",
)

# Stage order: imputer -> assembler -> scaler -> lr
lr_pipeline = Pipeline(stages=[imputer, assembler, scaler, lr])


You can fit your pipeline model here with MLflow tracking...

In [0]:
# Hyper-parameter grid for the LR stage: 5 x 5 x 2 x 2 = 100 parameter maps.
_grid_builder = ParamGridBuilder()
_grid_builder.addGrid(lr.regParam, [1e-5, 1e-4, 1e-3, 1e-2, 0.1])
_grid_builder.addGrid(lr.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])
_grid_builder.addGrid(lr.maxIter, [100, 200])
_grid_builder.addGrid(lr.fitIntercept, [True, False])
lr_grid = _grid_builder.build()

# Model selection metric: area under the precision-recall curve.
evaluator = BinaryClassificationEvaluator(metricName="areaUnderPR", labelCol="label")

# 3-fold cross-validation over the whole pipeline, two candidates fitted at a time.
lr_cv = CrossValidator(
    estimator=lr_pipeline,
    evaluator=evaluator,
    estimatorParamMaps=lr_grid,
    numFolds=3,
    seed=42,
    parallelism=2,
)

# Persist the training split (memory, spilling to disk) before the search,
# then keep the best pipeline found.
_train_cached = strat_train.persist(StorageLevel.MEMORY_AND_DISK)
lr_model = lr_cv.fit(_train_cached)
best_lr = lr_model.bestModel

In [0]:
# xgb_paramGrid = (ParamGridBuilder()
#              .addGrid(xgb.n_estimators,[10, 20])
#              #.addGrid(xgb.max_depth, range(4,50))
#              .build())


# #TODO: Figure out how the evaluator is handled in the run_spark_cv_with_logging_spark_only()
# cv_xgb = CrossValidator(estimator=pipeline, 
#                     estimatorParamMaps=xgb_paramGrid, 
#                     numFolds=3)


In [0]:
# # Training on default xgb pipeline with upsampling
# run_info_cv_upsampled = run_spark_ml_training( 
#     estimator = lr_cv,
#     train_df = strat_train,
#     test_df = strat_test,
#     val_df = strat_val,     # prefer tuning on validation
#     run_name = "spark-ml-search-lg-gp-10k-test-v3",
#     extra_tags = {'up_sampled':False,
#                   'under_sampled':False,
#                   'CV':True, 
#                   'num_workers':safe_workers},
# )

Cross Validation