In [0]:
%sql
-- Show the column names and types of the churn feature table.
DESCRIBE TABLE teams.data_science.pp_churn_features_v3

In [0]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import DataFrame, functions as F, types as T, Window

import builtins
from datetime import datetime
from typing import Optional, Dict, Union, List, Tuple, Any
import math
import random


import pandas as pd
import numpy as np
import sklearn

from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
import mlflow

from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics


from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors, DenseVector, SparseVector, VectorUDT
from pyspark.ml import Pipeline, PipelineModel

from pyspark.ml.tuning import CrossValidatorModel, TrainValidationSplitModel, ParamGridBuilder, CrossValidator, TrainValidationSplit
from pyspark.storagelevel import StorageLevel

import matplotlib.pyplot as plt

from pyspark.sql.functions import round 

In [0]:
from config import *

In [0]:
# Name of the source column holding the churn target. It is copied into a generic
# `label` column when the `dataset` view is built, and `label` is what the model
# stages are configured to train on.
LABEL_COL = "churn3"

In [0]:
# Read the feature table, expose the configured target column under the generic
# name `label`, and register the result as the temp view `dataset` for the SQL cells.
# FEATURES_TABLE_NAME is presumably provided by `from config import *` (verify).
features_df = spark.sql(f"select * from {FEATURES_TABLE_NAME}")
labelled_df = features_df.withColumn("label", F.col(LABEL_COL))
labelled_df.createOrReplaceTempView("dataset")

In [0]:
%sql
-- Hold-out window: rows dated 2025-09-22 through 2025-10-13 (BETWEEN is inclusive on both ends).
CREATE OR REPLACE TEMP VIEW VO_EVAL AS
SELECT *
FROM dataset
WHERE DATE BETWEEN '2025-09-22' AND '2025-10-13';

-- Training window: the three months ending on 2025-09-21 (inclusive),
-- i.e. it stops the day before the hold-out window begins.
CREATE OR REPLACE TEMP VIEW VO_TRAIN AS
SELECT *
FROM dataset
WHERE DATE BETWEEN DATE '2025-09-21' - INTERVAL 3 MONTHS AND '2025-09-21';

In [0]:
# Load the two temp views as DataFrames: training window first, hold-out window second.
strat_train, strat_val = (
    spark.table(view_name) for view_name in ("VO_TRAIN", "VO_EVAL")
)

In [0]:
# Make the configured experiment the active one, so runs started below are recorded under it.
# EXPERIMENT_NAME is presumably provided by `from config import *` (verify).
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

In [0]:
# No string-typed features are fed to the model in this notebook.
string_features = []

# Numeric model inputs. The order is significant: VectorAssembler builds the
# feature vector in exactly this column order, so do not re-sort the list.
other_features = [
    # --- columns without a look-back suffix ---
    'unique_levels_played', 'market_idx', 'dayofweek',
    'rounds_played', 'avg_attempts', 'total_attempts',
    'avg_moves', 'win_rate', 'assist_success_rate',
    'unassist_success_rate', 'assist_rate', 'total_boosters_used',
    'total_boosters_spent', 'used_boosters_rate', 'spend_boosters_rate',
    'avg_difficulty_score', 'rate_hard_levels', 'rate_superhard_levels',
    'min_room_id_int', 'max_room_id_int',
    # --- *_ref columns ---
    'daily_win_rate_ref', 'daily_avg_boosters_used_ref', 'daily_avg_boosters_spent_ref',
    # --- indexed categorical columns (*_idx) and account-level values ---
    'attribution_source_cd_idx', 'country_cd_idx', 'payer_type_cd_idx',
    'iap_lifetime_amt', 'days_since_install', 'days_since_last_purchase',
    'ad_revenue_amt', 'iap_revenue_amt',
    # --- session columns ---
    'session_qty', 'total_session_length_qty', 'avg_session_length',
    'sessions_per_round',
    # --- population comparison columns ---
    'avg_population_wr_on_levels_played_today', 'avg_population_assisted_rate_today',
    'avg_population_attempts_today', 'wr_diff_vs_population',
    'attempts_diff_vs_population', 'assist_rate_diff_vs_population',
    # --- *_l7d columns ---
    'active_days_l7d', 'total_rounds_l7d', 'avg_rounds_l7d',
    'avg_win_rate_l7d', 'avg_attempts_l7d', 'boosters_used_l7d',
    'boosters_spent_l7d', 'avg_used_boosters_rate_l7d',
    # --- *_l14d columns ---
    'active_days_l14d', 'avg_rounds_l14d', 'avg_win_rate_l14d',
    'std_rounds_l14d', 'std_win_rate_l14d',
    # --- *_l30d columns ---
    'active_days_l30d', 'avg_rounds_l30d',
    # --- trend and ratio columns ---
    'rounds_trend_weekly', 'win_rate_trend_weekly', 'boosters_usage_trend_weekly',
    'rounds_ratio_7d_vs_14_7d', 'frequency_ratio_7d_vs_14d',
    # --- level progression columns ---
    'levels_progressed_l7d', 'levels_progressed_l14d', 'levels_progressed_l30d',
    'days_on_current_max_level', 'level_diversity_ratio',
]


In [0]:
# Class balance of the training window, computed in ONE Spark pass
# (previously two separate full-table count() jobs).
class_counts = strat_train.agg(
    F.count(F.lit(1)).alias("n_rows"),
    # when() without otherwise() yields NULL for non-matching rows, and count()
    # skips NULLs, so this counts exactly the rows with label == 1.
    F.count(F.when(F.col('label') == 1, 1)).alias("n_churn"),
).first()

train_total = class_counts["n_rows"]
train_churn = class_counts["n_churn"]
# Everything that is not label == 1 (including NULL labels) is treated as "no churn",
# which matches the otherwise() branch used for the weight column below.
train_no_churn = train_total - train_churn

# Balanced weights are undefined when a class is absent (or the set is empty);
# fail with an explicit message instead of a bare ZeroDivisionError.
if train_churn == 0 or train_no_churn == 0:
    raise ValueError(
        "Cannot compute balanced class weights: "
        f"churn={train_churn}, no_churn={train_no_churn}, total={train_total}. "
        "Check the VO_TRAIN date window and the label column."
    )

churn_ratio = train_churn / train_total
print(f"Churn ratio en train: {churn_ratio:.2%}")

# Balanced class weights.
# Formula: weight = n_samples / (n_classes * n_samples_class)
weight_churn = train_total / (2 * train_churn)
weight_no_churn = train_total / (2 * train_no_churn)

print(f"Weight para churn=1: {weight_churn:.2f}")
print(f"Weight para churn=0: {weight_no_churn:.2f}")

# Add the `weight` column to the TRAINING set only; the hold-out set stays unweighted.
strat_train_weighted = strat_train.withColumn(
    'weight',
    F.when(F.col('label') == 1, weight_churn).otherwise(weight_no_churn),
)

In [0]:
#### Logistic Regression Pipeline

from pyspark.ml.feature import Imputer, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

# ---- Feature preparation stages ----
# Mean-impute every feature in place (output columns reuse the input names).
imputer = Imputer(strategy="mean", inputCols=other_features, outputCols=other_features)
# Pack the imputed columns into one vector, in `other_features` order.
assembler = VectorAssembler(outputCol="features_raw", inputCols=other_features)
# Standardise the assembled vector: centre on the mean and scale to unit std.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features_raw",
    outputCol="features",
)

# ---- Classifier ----
# Not referenced by any cell visible here; kept because later cells may read it.
eval_metrics = ["auc", "aucpr", "logloss"]

# Binomial logistic regression on the scaled vector, using the per-row `weight` column.
lr = LogisticRegression(
    labelCol='label',
    featuresCol='features',
    weightCol='weight',
    family='binomial',
)

# Stage order: impute -> assemble -> scale -> classify.
lr_pipeline = Pipeline(stages=[imputer, assembler, scaler, lr])


# Wider search space, currently disabled:
# lr_grid = (ParamGridBuilder()
#     .addGrid(lr.regParam, [1e-5, 1e-4, 1e-3, 1e-2, 0.1]) 
#     .addGrid(lr.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])
#     .addGrid(lr.maxIter, [100, 200])  
#     .addGrid(lr.fitIntercept, [True, False]) 
#     .build())

# Active grid: a single candidate (one value per parameter).
fixed_lr_settings = [
    (lr.regParam, [0.001]),
    (lr.elasticNetParam, [1.0]),
    (lr.maxIter, [100]),
    (lr.fitIntercept, [True]),
]
grid_builder = ParamGridBuilder()
for lr_param, candidates in fixed_lr_settings:
    grid_builder.addGrid(lr_param, candidates)
lr_grid = grid_builder.build()


# Model selection metric: area under the precision-recall curve.
evaluator = BinaryClassificationEvaluator(
    metricName='areaUnderPR',
    labelCol='label',
)

# 3-fold cross-validation over the grid, fitting up to two models in parallel.
lr_cv = CrossValidator(
    estimator=lr_pipeline,
    evaluator=evaluator,
    estimatorParamMaps=lr_grid,
    numFolds=3,
    seed=42,
    parallelism=2,
)


In [0]:
import mlflow
import mlflow.spark
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Spark
from pyspark.sql import functions as F
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# MLflow
import mlflow

# Data handling / metrics
import pandas as pd
from sklearn.metrics import confusion_matrix, log_loss

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

from pyspark.sql import functions as F
from pyspark.ml.linalg import DenseVector
from pyspark.sql.types import DoubleType

# NOTE(review): per the MLflow documentation, `mlflow.spark.autolog()` records Spark
# *datasource* reads (path / version / format) on the active run. It does not autolog
# pyspark.ml estimators or their params; `mlflow.pyspark.ml.autolog()` is the call
# for that. Confirm this is the behaviour that was intended here.
mlflow.spark.autolog()

# # Training (disabled here; the fit is executed inside the MLflow run cell instead)
# lr_model = lr_cv.fit(strat_train_weighted.persist(StorageLevel.MEMORY_AND_DISK))
# best_lr = lr_model.bestModel



In [0]:
import mlflow
import mlflow.spark

with mlflow.start_run(run_name="logistic_regression_churn_TEST") as run:
    # =======================================================
    # 1. Training
    # =======================================================
    # The weighted training set is persisted for the cross-validated fit and is
    # intentionally left cached afterwards, as before.
    lr_model = lr_cv.fit(strat_train_weighted.persist(StorageLevel.MEMORY_AND_DISK))
    best_lr = lr_model.bestModel

    # Log the fitted best pipeline (imputer -> assembler -> scaler -> LR) to MLflow.
    mlflow.spark.log_model(best_lr, artifact_path="model")

    # =======================================================
    # 2. Predictions on the hold-out window
    # =======================================================
    predictions = best_lr.transform(strat_val)

    # =======================================================
    # 3. Extract the class-1 probability from the probability vector
    # =======================================================
    @F.udf(DoubleType())
    def extract_prob_churn(v):
        """Return element 1 of an ML vector as a float, or None for a null vector."""
        if v is None:
            return None
        # Index through the vector itself: this is correct for DenseVector and
        # SparseVector alike. `.values[1]` on a SparseVector would be the second
        # *stored* (non-zero) value, not the element at position 1.
        return float(v[1])

    pred_with_prob = predictions.withColumn("prob_churn", extract_prob_churn(F.col("probability")))

    # Every metric below needs only these four columns. Cache that narrow slice so
    # the pipeline transform runs once instead of once per evaluator/count/collect.
    eval_scores = pred_with_prob.select(
        LABEL_COL, "probability", "prediction", "prob_churn"
    ).persist(StorageLevel.MEMORY_AND_DISK)

    try:
        # =======================================================
        # 4. Ranking metrics (computed once)
        # =======================================================
        evaluator_roc = BinaryClassificationEvaluator(
            labelCol=LABEL_COL, rawPredictionCol="probability", metricName="areaUnderROC"
        )
        auc = evaluator_roc.evaluate(eval_scores)

        evaluator_pr = BinaryClassificationEvaluator(
            labelCol=LABEL_COL, rawPredictionCol="probability", metricName="areaUnderPR"
        )
        auc_pr = evaluator_pr.evaluate(eval_scores)

        # =======================================================
        # 5. TP / FP / FN / TN in a single aggregation
        # =======================================================
        # Same predicates as the former four filter().count() jobs; count() ignores
        # the NULLs that when() produces for non-matching rows.
        actual_pos = F.col(LABEL_COL) == 1
        actual_neg = F.col(LABEL_COL) == 0
        predicted_pos = F.col("prediction") == 1
        predicted_neg = F.col("prediction") == 0
        confusion = eval_scores.agg(
            F.count(F.when(actual_pos & predicted_pos, 1)).alias("tp"),
            F.count(F.when(actual_neg & predicted_pos, 1)).alias("fp"),
            F.count(F.when(actual_pos & predicted_neg, 1)).alias("fn"),
            F.count(F.when(actual_neg & predicted_neg, 1)).alias("tn"),
        ).first()
        tp, fp, fn, tn = (confusion[cell] for cell in ("tp", "fp", "fn", "tn"))

        # Labels and probabilities are collected to the driver for the sklearn metrics.
        # NOTE(review): this pulls the whole hold-out set into driver memory.
        pdf = eval_scores.select(LABEL_COL, "prob_churn").toPandas()
    finally:
        # Release the temporary cache even if an evaluation step fails.
        eval_scores.unpersist()

    # The small epsilon keeps the ratios defined when a denominator would be zero.
    precision = tp / (tp + fp + 1e-9)
    recall = tp / (tp + fn + 1e-9)
    f1 = 2 * precision * recall / (precision + recall + 1e-9)
    accuracy = (tp + tn) / (tp + tn + fp + fn + 1e-9)

    # LogLoss (uses prob_churn)
    logloss = log_loss(pdf[LABEL_COL], pdf["prob_churn"])

    # =======================================================
    # 6. Confusion matrix (figure)
    # =======================================================
    cm = confusion_matrix(pdf[LABEL_COL], (pdf["prob_churn"] >= 0.5).astype(int))

    fig, ax = plt.subplots(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax,
                xticklabels=["Pred No Churn", "Pred Churn"],
                yticklabels=["Real No Churn", "Real Churn"])
    ax.set_title("Confusion Matrix (threshold=0.5)")
    ax.set_xlabel("Predicción")
    ax.set_ylabel("Real")

    # =======================================================
    # 7. Log everything to the active run, each metric exactly once
    # =======================================================
    run_metrics = {
        "auc_roc": auc,
        "auc_pr": auc_pr,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "accuracy": accuracy,
        "log_loss": logloss,
    }
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)
    mlflow.log_figure(fig, "confusion_matrix.png")

    print(f"""
    📊 Métricas en Test:
    AUC ROC:   {auc:.4f}
    AUC PR:    {auc_pr:.4f}
    Precision: {precision:.4f}
    Recall:    {recall:.4f}
    F1-score:  {f1:.4f}
    Accuracy:  {accuracy:.4f}
    LogLoss:   {logloss:.4f}
    """)

    print(f"✅ Run ID: {run.info.run_id}")

In [0]:
# NOTE(review): this entire cell is disabled. It is a commented-out copy of the
# evaluation steps that already run inside the
# `mlflow.start_run(run_name="logistic_regression_churn_TEST")` cell. It is kept
# for reference only; consider deleting it once it is confirmed to be unused.

# # =======================================================
# # 3. Predictions on the test set
# # =======================================================
# predictions = best_lr.transform(strat_val)

# # =======================================================
# # 4. Extract prob_churn from the DenseVector
# # =======================================================


# @F.udf(DoubleType())
# def extract_prob_churn(v):
#     if v is None:
#         return None
#     return float(v[1]) if isinstance(v, DenseVector) else float(v.values[1])

# pred_with_prob = predictions.withColumn("prob_churn", extract_prob_churn(F.col("probability")))

# pred_with_prob = predictions.withColumn("prob_churn", extract_prob_churn(F.col("probability")))

# # =======================================================
# # 5. Compute metrics
# # =======================================================
# # AUC ROC and AUC PR
# evaluator_roc = BinaryClassificationEvaluator(labelCol="churn3", rawPredictionCol="probability", metricName="areaUnderROC")
# auc = evaluator_roc.evaluate(predictions)

# evaluator_pr = BinaryClassificationEvaluator(labelCol="churn3", rawPredictionCol="probability", metricName="areaUnderPR")
# auc_pr = evaluator_pr.evaluate(predictions)

# # TP, FP, FN, TN
# tp = pred_with_prob.filter((F.col("churn3") == 1) & (F.col("prediction") == 1)).count()
# fp = pred_with_prob.filter((F.col("churn3") == 0) & (F.col("prediction") == 1)).count()
# fn = pred_with_prob.filter((F.col("churn3") == 1) & (F.col("prediction") == 0)).count()
# tn = pred_with_prob.filter((F.col("churn3") == 0) & (F.col("prediction") == 0)).count()

# precision = tp / (tp + fp + 1e-9)
# recall = tp / (tp + fn + 1e-9)
# f1 = 2 * precision * recall / (precision + recall + 1e-9)
# accuracy = (tp + tn) / (tp + tn + fp + fn + 1e-9)

# # LogLoss (uses prob_churn)
# pdf = pred_with_prob.select("churn3", "prob_churn").toPandas()
# logloss = log_loss(pdf["churn3"], pdf["prob_churn"])

# # =======================================================
# # 6. Confusion matrix (figure)
# # =======================================================
# cm = confusion_matrix(pdf["churn3"], (pdf["prob_churn"] >= 0.5).astype(int))

# fig, ax = plt.subplots(figsize=(5,4))
# sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax,
#             xticklabels=["Pred No Churn", "Pred Churn"],
#             yticklabels=["Real No Churn", "Real Churn"])
# plt.title("Confusion Matrix (threshold=0.5)")
# plt.xlabel("Predicción")
# plt.ylabel("Real")

# # =======================================================
# # 7. Log everything inside the same active run
# # =======================================================
# mlflow.log_metric("auc_roc", auc)
# mlflow.log_metric("auc_pr", auc_pr)
# mlflow.log_metric("precision", precision)
# mlflow.log_metric("recall", recall)
# mlflow.log_metric("f1_score", f1)
# mlflow.log_metric("accuracy", accuracy)
# mlflow.log_metric("log_loss", logloss)
# mlflow.log_figure(fig, "confusion_matrix.png")

# print(f"""
# 📊 Métricas en Test:
# AUC ROC:   {auc:.4f}
# AUC PR:    {auc_pr:.4f}
# Precision: {precision:.4f}
# Recall:    {recall:.4f}
# F1-score:  {f1:.4f}
# Accuracy:  {accuracy:.4f}
# LogLoss:   {logloss:.4f}
# """)