In [0]:
# Source feature table; the cells below drop columns from it, sample it and split it.
df = spark.table("teams.data_science.pp_churn_features_v3")

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Preprocessing: drop the columns that are not model inputs (this removes the
# churn7/churn14 columns, so churn3 is the only churn column left), cast the
# target to int and fill nulls with 0.
cols_to_drop = [
    'judi',
    'date',
    'churn7',
    'churn14',
    'subs_lifetime_amt',
    'subs_revenue_amt',
    'daily_avg_boosters_used_ref',
]
df_clean = df.drop(*cols_to_drop)
df_clean = df_clean.withColumn("churn3", F.col("churn3").cast("int"))
df_clean = df_clean.fillna(0)

# Basic sanity checks: row count, column count and target distribution.
print(f"Registros totales: {df_clean.count():,}")
print(f"Columnas: {len(df_clean.columns)}")
print("\nDistribuci√≥n de la clase target:")
target_distribution = df_clean.groupBy("churn3").count()
target_distribution.show()

# Inspect the resulting column types.
print("\nTipos de datos:")
df_clean.printSchema()

In [0]:
# Down-sample to roughly `sample_size` rows. The same fraction is applied to both
# churn3 classes, so the class ratio of the full table is approximately kept.
sample_size = 1_000_000
total_count = df_clean.count()
sample_fraction = min(sample_size / total_count, 1.0)

df_sample = df_clean.sampleBy(
    "churn3",
    fractions={label: sample_fraction for label in (0, 1)},
    seed=42,
)

# Show the per-class counts of the sample.
sample_class_counts = df_sample.groupBy("churn3").count()
sample_class_counts.show()

# Downstream cells keep reading the data under the name df_clean.
df_clean = df_sample

In [0]:
from pyspark.sql import Window

# Deterministic ~80/10/10 train/val/test split.
#
# The split bucket is derived from a hash of each row's own column values instead
# of F.rand(seed=42). Spark is lazy: every show() and every write below re-runs the
# upstream plan (including the sampleBy step), and a rand() column only reproduces
# the same values if rows arrive in the same partitions and order each time, which
# is not guaranteed. When they do not, a row can end up in two splits (train/test
# leakage) or in none. A content hash always sends the same row to the same bucket,
# no matter how many times the plan is recomputed. Identical rows share a bucket,
# so exact duplicates cannot straddle train and test either.
#
# The class ratio is preserved only approximately (the bucket is independent of
# churn3); proportions are not exact.
SPLIT_BUCKETS = 10_000
TRAIN_UPPER = 8_000   # buckets [0, 8000)    -> train (~80%)
VAL_UPPER = 9_000     # buckets [8000, 9000) -> val   (~10%); the rest -> test

df_split = df_clean.withColumn(
    "split_bucket",
    # pmod keeps the bucket non-negative even though xxhash64 returns signed longs.
    F.pmod(F.xxhash64(*df_clean.columns), F.lit(SPLIT_BUCKETS)),
)

bucket = F.col("split_bucket")
# As before, only rows whose target is exactly 0 or 1 are assigned to a split.
has_binary_label = F.col("churn3").isin(0, 1)

train_df = df_split.filter(has_binary_label & (bucket < TRAIN_UPPER)).drop("split_bucket")

val_df = df_split.filter(
    has_binary_label & (bucket >= TRAIN_UPPER) & (bucket < VAL_UPPER)
).drop("split_bucket")

test_df = df_split.filter(has_binary_label & (bucket >= VAL_UPPER)).drop("split_bucket")

# Per-class counts of every split. Each show() runs a full Spark job over the plan.
print("Distribuci√≥n aproximada por split:")
print("Train:")
train_df.groupBy("churn3").count().show()
print("Validation:")
val_df.groupBy("churn3").count().show()
print("Test:")
test_df.groupBy("churn3").count().show()

# Persist the splits as Delta tables so later cells read them back instead of
# recomputing the sampling/splitting plan.
train_df.write.option("mergeSchema", "true").mode("overwrite").format("delta").saveAsTable("teams.data_science.gp_pp_train_churn3")
val_df.write.option("mergeSchema", "true").mode("overwrite").format("delta").saveAsTable("teams.data_science.gp_pp_val_churn3")
test_df.write.option("mergeSchema", "true").mode("overwrite").format("delta").saveAsTable("teams.data_science.gp_pp_test_churn3")

print("\n‚úÖ Splits guardados en Delta")

In [0]:
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml import Pipeline

# The scaler statistics are fitted on the persisted training split only.
train_df = spark.table("teams.data_science.gp_pp_train_churn3")

# Every column except the target is treated as a feature.
feature_cols = [name for name in train_df.columns if name != 'churn3']
print(f"Features a normalizar: {len(feature_cols)}")

# Two stages: pack the feature columns into one vector, then standardise it
# (centre on the mean and scale to unit standard deviation).
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_raw")
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features_scaled",
    withMean=True,
    withStd=True,
)
pipeline = Pipeline(stages=[assembler, scaler])

# Fit on train only, so validation/test statistics never influence the scaling.
scaler_model = pipeline.fit(train_df)

# Apply the fitted pipeline to all three splits, keeping only the scaled vector and the target.
output_cols = ["features_scaled", "churn3"]
train_scaled = scaler_model.transform(train_df).select(*output_cols)
val_scaled = scaler_model.transform(spark.table("teams.data_science.gp_pp_val_churn3")).select(*output_cols)
test_scaled = scaler_model.transform(spark.table("teams.data_science.gp_pp_test_churn3")).select(*output_cols)

# Persist the scaled splits (same write options and order as before: train, val, test).
for scaled_df, target_table in (
    (train_scaled, "teams.data_science.gp_pp_train_churn3_scaled"),
    (val_scaled, "teams.data_science.gp_pp_val_churn3_scaled"),
    (test_scaled, "teams.data_science.gp_pp_test_churn3_scaled"),
):
    scaled_df.write.option("mergeSchema", "true").mode("overwrite").format("delta").saveAsTable(target_table)



In [0]:
# Turn off Delta's retention-duration check for this session; the VACUUM calls
# below ask for a 0-hour retention, which that check would otherwise reject.
spark.sql("set spark.databricks.delta.retentionDurationCheck.enabled = false")

# Tables written by the split and scaling cells above.
vacuum_lst = ["teams.data_science.gp_pp_train_churn3_scaled",
"teams.data_science.gp_pp_val_churn3_scaled",
"teams.data_science.gp_pp_test_churn3_scaled",
"teams.data_science.gp_pp_train_churn3",
"teams.data_science.gp_pp_val_churn3",
"teams.data_science.gp_pp_test_churn3"]

# WARNING: RETAIN 0 HOURS asks VACUUM to remove every file that is not part of the
# current table version. NOTE(review): this presumably makes older versions
# unreadable (no time travel) and is unsafe while another job reads or writes
# these tables — confirm before running on shared tables.
for table in vacuum_lst:
    spark.sql(f"VACUUM {table} RETAIN 0 HOURS")

In [0]:
# Persist the fitted assembler + scaler pipeline so the same transformation can
# be re-applied outside this notebook; any previous copy is overwritten.
scaler_model_writer = scaler_model.write().overwrite()
scaler_model_writer.save("/mnt/jc-analytics-databricks-work/analytics/gpereyra/pp_nn/churn3_scaler_model")

print("‚úÖ Normalizaci√≥n completada y guardada")
print("\nEjemplo de dato transformado:")
# Two untruncated rows of the scaled training data as a visual check.
train_scaled.show(n=2, truncate=False)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, DoubleType
from pyspark.sql.functions import col, udf
import math

# Native Spark ML Vector -> array<double> conversion. It keeps the name and the
# call shape (`vector_to_array(col(...))`) of the Python UDF it replaces, but is a
# built-in expression, so rows are not shipped one by one through a Python worker.
from pyspark.ml.functions import vector_to_array


def write_tfrecords_dynamic(table_name, output_path, target_rows_per_file=500_000, min_files=4, max_files=200):
    """
    Export a scaled Delta table to Parquet, choosing the number of output files from the row count.

    Despite the name, the output is Parquet (columns "features" and "churn3"),
    not TFRecord.

    Args:
        table_name: Table to read; it must have "features_scaled" (ML Vector) and "churn3".
        output_path: Destination folder; existing contents are overwritten.
        target_rows_per_file: Desired number of rows per Parquet file (must be >= 1).
        min_files: Lower bound on the number of output files.
        max_files: Upper bound on the number of output files (min_files wins if they conflict).

    Returns:
        output_path, unchanged.

    Raises:
        ValueError: If target_rows_per_file is smaller than 1.
    """
    if target_rows_per_file < 1:
        raise ValueError(f"target_rows_per_file must be >= 1, got {target_rows_per_file}")

    df = spark.table(table_name)

    # The row count drives the file count, so it costs one extra pass over the table.
    total_rows = df.count()

    # ceil(rows / target), clamped to [min_files, max_files].
    num_files = max(min_files, min(max_files, math.ceil(total_rows / target_rows_per_file)))

    # Plain arrays instead of ML Vectors so that non-Spark readers can load the files.
    df = df.withColumn("features", vector_to_array(col("features_scaled"))) \
           .select("features", "churn3")

    # One partition per output file; repartition also redistributes rows across files.
    df = df.repartition(num_files)

    df.write.mode("overwrite").parquet(output_path)

    print(f"‚úÖ Guardado: {output_path}")
    print(f"‚û°Ô∏è Total filas: {total_rows:,}")
    print(f"‚û°Ô∏è Archivos generados: {num_files}")
    return output_path

# PREFIX_PATH is declared in a configuration cell further down the notebook, so on
# a fresh top-to-bottom run it does not exist yet and this cell would fail with a
# NameError. Reuse the value when it is already defined, otherwise fall back to
# the same default ('/tmp/') that the configuration cell sets.
PREFIX_PATH = globals().get("PREFIX_PATH", '/tmp/')

# Export the three scaled splits to Parquet folders under PREFIX_PATH.
train_path = write_tfrecords_dynamic(
    "teams.data_science.gp_pp_train_churn3_scaled",
    f"{PREFIX_PATH}gpereyra/pp_nn/churn3_tfdata/train"
)
val_path = write_tfrecords_dynamic(
    "teams.data_science.gp_pp_val_churn3_scaled",
    f"{PREFIX_PATH}gpereyra/pp_nn/churn3_tfdata/val"
)
test_path = write_tfrecords_dynamic(
    "teams.data_science.gp_pp_test_churn3_scaled",
    f"{PREFIX_PATH}gpereyra/pp_nn/churn3_tfdata/test"
)

print("\nTodos los datasets convertidos")

In [0]:
# import tensorflow as tf
# import numpy as np
# from pyspark.sql.functions import col, udf
# from pyspark.sql.types import ArrayType, DoubleType

# # Funci√≥n para convertir Vector a Array
# vector_to_array = udf(lambda v: v.toArray().tolist(), ArrayType(DoubleType()))

# def write_tfrecords(table_name, output_path, num_files=50):
#     """
#     Convierte tabla Delta a TFRecords (distribuido, sin usar driver memory)
#     """
#     df = spark.table(table_name)
    
#     # Convertir Vector a Array
#     df = df.withColumn("features", vector_to_array(col("features_scaled"))) \
#            .select("features", "churn3")
    
#     # Repartir para escritura paralela
#     df = df.repartition(num_files)
    
#     # Guardar como Parquet primero (m√°s eficiente)
#     df.write.mode("overwrite").parquet(output_path)
    
#     print(f"‚úÖ Datos guardados en Parquet: {output_path}")
#     return output_path

# # Escribir los 3 datasets
# print("Convirtiendo a formato Parquet...")
# train_path = write_tfrecords(
#     "teams.data_science.gp_pp_train_churn3_scaled",
#     "/mnt/jc-analytics-databricks-work/analytics/gpereyra/pp_nn/churn3_tfdata/train",
#     num_files=100
# )
# val_path = write_tfrecords(
#     "teams.data_science.gp_pp_val_churn3_scaled",
#     "/mnt/jc-analytics-databricks-work/analytics/gpereyra/pp_nn/churn3_tfdata/val",
#     num_files=20
# )
# test_path = write_tfrecords(
#     "teams.data_science.gp_pp_test_churn3_scaled",
#     "/mnt/jc-analytics-databricks-work/analytics/gpereyra/pp_nn/churn3_tfdata/test",
#     num_files=20
# )

# print("\nTodos los datasets convertidos")

In [0]:
# import tensorflow as tf
# import numpy as np
# import pyarrow.parquet as pq

# def create_tf_dataset_from_parquet(parquet_path, batch_size=2048, shuffle=True):
#     """
#     Lee Parquet con TensorFlow de forma eficiente
#     """
#     # Normalizar path para dbutils
#     parquet_path_dbfs = parquet_path.replace('/dbfs', 'dbfs:')
    
#     # Listar archivos parquet
#     files = dbutils.fs.ls(parquet_path_dbfs)
#     parquet_files = [f.path for f in files if f.name.endswith('.parquet')]
    
#     print(f"Encontrados {len(parquet_files)} archivos parquet")
    
#     def generator():
#         """Lee Parquet files en batches"""
#         for file_path in parquet_files:
#             # Convertir de dbfs: a /dbfs/ para pyarrow
#             local_path = file_path.replace('dbfs:', '/dbfs')
            
#             # Leer archivo parquet
#             table = pq.read_table(local_path)
            
#             # Convertir a numpy
#             features = np.array([np.array(x, dtype=np.float32) for x in table['features'].to_pylist()])
#             labels = table['churn3'].to_numpy().astype(np.int32)
            
#             # Yield todos los rows del archivo
#             for i in range(len(features)):
#                 yield features[i], labels[i]
    
#     # Crear dataset
#     dataset = tf.data.Dataset.from_generator(
#         generator,
#         output_signature=(
#             tf.TensorSpec(shape=(63,), dtype=tf.float32),
#             tf.TensorSpec(shape=(), dtype=tf.int32)
#         )
#     )
    
#     if shuffle:
#         dataset = dataset.shuffle(buffer_size=10000, reshuffle_each_iteration=True)
    
#     dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    
#     return dataset

# # Crear datasets
# print("Creando TF Datasets desde Parquet...")
# BATCH_SIZE = 2048

# train_dataset = create_tf_dataset_from_parquet("/mnt/jc-analytics-databricks-work/analytics/gpereyra/pp_nn/churn3_tfdata/train", BATCH_SIZE, shuffle=True)
# val_dataset = create_tf_dataset_from_parquet("/mnt/jc-analytics-databricks-work/analytics/gpereyra/pp_nn/churn3_tfdata/val", BATCH_SIZE, shuffle=False)

# # Calcular steps
# train_size = 20_829_767
# val_size = 2_601_368
# steps_per_epoch = train_size // BATCH_SIZE
# validation_steps = val_size // BATCH_SIZE

# print(f"\n‚úÖ Steps por epoch: {steps_per_epoch}")
# print(f"‚úÖ Validation steps: {validation_steps}")

# # TEST cr√≠tico
# print("\nüß™ Probando lectura de 1 batch...")
# for x_batch, y_batch in train_dataset.take(1):
#     print(f"‚úÖ Features shape: {x_batch.shape}")
#     print(f"‚úÖ Labels shape: {y_batch.shape}")
#     print(f"‚úÖ Distribuci√≥n labels: {np.bincount(y_batch.numpy())}")
#     print(f"‚úÖ Rango features: [{x_batch.numpy().min():.2f}, {x_batch.numpy().max():.2f}]")

In [0]:
%pip uninstall -y tensorflow tensorflow-cpu

In [0]:
%pip install --upgrade pip
%pip install "tensorflow[and-cuda]==2.17.*"

In [0]:
 %restart_python

In [0]:
import tensorflow as tf
# Post-install check: list the GPUs TensorFlow can see and whether this build has CUDA support.
print("GPUs:", tf.config.list_physical_devices("GPU"))
print("Built with CUDA:", tf.test.is_built_with_cuda())

In [0]:
import tensorflow as tf
# Same GPU listing as the previous cell (kept as a quick re-check).
print("GPUs disponibles:", tf.config.list_physical_devices('GPU'))

In [0]:
# Exploratory listing of the /tmp/ folder that PREFIX_PATH points to.
dbutils.fs.ls("/tmp/")

In [0]:
# Root prefix for the notebook's output folders: the Parquet export paths and the
# model directory are built as f"{PREFIX_PATH}gpereyra/pp_nn/...".
# The commented line is the alternative mounted location.
PREFIX_PATH = '/tmp/'
# PREFIX_PATH = '/mnt/jc-analytics-databricks-work/analytics/'

In [0]:
import tensorflow as tf
import numpy as np
import pyarrow.parquet as pq
import io

def create_tf_dataset_from_parquet(parquet_path, batch_size=2048, shuffle=True, num_features=65):
    """
    Build a batched tf.data.Dataset of (features, label) pairs from a folder of Parquet files.

    Files are read one at a time inside a Python generator, so only one file's
    rows are held in memory at once.

    Args:
        parquet_path: Folder holding the Parquet part files; the "dbfs:" scheme is
            added when it is missing.
        batch_size: Number of rows per emitted batch.
        shuffle: When True, rows go through a 10,000-element shuffle buffer that is
            reshuffled on every iteration of the dataset.
        num_features: Expected length of every "features" array. Defaults to 65,
            the width that used to be hard-coded in the output signature.

    Returns:
        A finite tf.data.Dataset yielding (float32 [batch, num_features], int32 [batch]).

    Raises:
        FileNotFoundError: If the folder contains no ".parquet" files.
        ValueError: Raised inside the generator (and surfaced by TensorFlow while
            iterating) when a file's feature arrays do not have num_features entries.
    """
    # dbutils expects the dbfs: scheme; add it only when the caller did not.
    parquet_path_dbfs = parquet_path if parquet_path.startswith("dbfs:") else f"dbfs:{parquet_path}"

    files = dbutils.fs.ls(parquet_path_dbfs)
    parquet_files = [f.path for f in files if f.name.endswith(".parquet")]

    print(f"üì¶ Encontrados {len(parquet_files)} archivos parquet en {parquet_path_dbfs}")

    # Fail here instead of handing back a dataset that silently yields nothing.
    if not parquet_files:
        raise FileNotFoundError(f"No .parquet files found in {parquet_path_dbfs}")

    def generator():
        """Yield one (features, label) pair per row, file by file."""
        for file_path in parquet_files:
            # NOTE(review): confirm that `dbutils.fs.open` is available on the target
            # runtime; it is kept exactly as the original code called it.
            with dbutils.fs.open(file_path, "rb") as f:
                # Only the two columns that are actually used are decoded.
                table = pq.read_table(io.BytesIO(f.read()), columns=["features", "churn3"])

            if table.num_rows == 0:
                continue

            # Single conversion of the whole column into a 2-D float32 matrix.
            features = np.asarray(table["features"].to_pylist(), dtype=np.float32)
            if features.ndim != 2 or features.shape[1] != num_features:
                raise ValueError(
                    f"{file_path}: expected feature arrays of length {num_features}, "
                    f"got array of shape {features.shape}"
                )
            labels = table["churn3"].to_numpy().astype(np.int32)

            for feature_row, label in zip(features, labels):
                yield feature_row, label

    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            tf.TensorSpec(shape=(num_features,), dtype=tf.float32),
            tf.TensorSpec(shape=(), dtype=tf.int32),
        ),
    )

    if shuffle:
        dataset = dataset.shuffle(buffer_size=10000, reshuffle_each_iteration=True)

    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset


# --------------------------------------------------------
# Build the train / validation tf.data datasets
# --------------------------------------------------------
print("Creando TF Datasets desde Parquet...")

BATCH_SIZE = 2048
# The Parquet folder gets its own name. Reassigning PREFIX_PATH here would change
# what the training cell builds from it: model_path = f"{PREFIX_PATH}gpereyra/..."
# would become a path nested under the tfdata folder, and prefixing it with
# "dbfs:" would produce "dbfs:dbfs:/...".
TFDATA_ROOT = "dbfs:/tmp/gpereyra/pp_nn/churn3_tfdata/"

train_path = f"{TFDATA_ROOT}train"
val_path = f"{TFDATA_ROOT}val"

# Row counts come from the scaled Delta tables the Parquet files were exported from.
train_size = spark.table("teams.data_science.gp_pp_train_churn3_scaled").count()
val_size = spark.table("teams.data_science.gp_pp_val_churn3_scaled").count()

# Whole batches per pass over the data (at least 1, so tiny samples still train).
steps_per_epoch = max(1, train_size // BATCH_SIZE)
validation_steps = max(1, val_size // BATCH_SIZE)

print(f"\nüìä Filas train: {train_size:,}")
print(f"üìä Filas val: {val_size:,}")
print(f"‚úÖ Steps por epoch: {steps_per_epoch}")
print(f"‚úÖ Validation steps: {validation_steps}")

# Training data is shuffled; validation keeps the file order.
train_dataset = create_tf_dataset_from_parquet(train_path, BATCH_SIZE, shuffle=True)
val_dataset = create_tf_dataset_from_parquet(val_path, BATCH_SIZE, shuffle=False)

# Smoke test: pull a single batch and check shapes, label balance and value range.
print("\nüß™ Probando lectura de 1 batch...")
for x_batch, y_batch in train_dataset.take(1):
    x_values = x_batch.numpy()
    print(f"‚úÖ Features shape: {x_batch.shape}")
    print(f"‚úÖ Labels shape: {y_batch.shape}")
    print(f"‚úÖ Distribuci√≥n labels: {np.bincount(y_batch.numpy())}")
    print(f"‚úÖ Rango features: [{x_values.min():.2f}, {x_values.max():.2f}]")

In [0]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Per-class row counts of the scaled training table.
train_class_counts = spark.table("teams.data_science.gp_pp_train_churn3_scaled") \
    .groupBy("churn3") \
    .count() \
    .orderBy("churn3") \
    .collect()

# Look the counts up by label instead of by position: positional indexing
# assumes both classes are present and sorted, and otherwise either raises an
# IndexError or silently mixes the classes up.
counts_by_label = {row["churn3"]: row["count"] for row in train_class_counts}
missing_labels = [label for label in (0, 1) if not counts_by_label.get(label)]
if missing_labels:
    raise ValueError(
        f"Training table has no rows for churn3 class(es) {missing_labels}; "
        f"found counts {counts_by_label}"
    )

count_class_0 = counts_by_label[0]
count_class_1 = counts_by_label[1]
total = count_class_0 + count_class_1

# Method 1: inverse frequency — class 1 errors weigh as much as the 0:1 ratio.
class_weight = {
    0: 1.0,
    1: count_class_0 / count_class_1
}

# Method 2: "balanced" weights (alternative, sometimes works better)
# class_weight = {
#     0: total / (2 * count_class_0),
#     1: total / (2 * count_class_1)
# }

print(f"Distribuci√≥n train:")
print(f"  Clase 0: {count_class_0:,} ({count_class_0/total*100:.2f}%)")
print(f"  Clase 1: {count_class_1:,} ({count_class_1/total*100:.2f}%)")
print(f"\nClass weights: {class_weight}")
print(f"  Ratio: 1:{class_weight[1]:.2f}")

In [0]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print(f"Class weights: {class_weight}")

# Small fully connected network for binary classification
def create_model(input_dim=65, dropout_rate=0.3):
    """
    Build an (uncompiled) feed-forward binary classifier.

    Three hidden blocks of 128, 64 and 32 units; each block is
    Dense(relu, L2=0.001) -> BatchNormalization -> Dropout(dropout_rate).
    The output is a single sigmoid unit.

    Args:
        input_dim: Number of input features.
        dropout_rate: Dropout rate used after every hidden block.

    Returns:
        A keras.Sequential model.
    """
    stack = [layers.Input(shape=(input_dim,))]

    for units in (128, 64, 32):
        stack.append(
            layers.Dense(units, activation='relu',
                         kernel_regularizer=keras.regularizers.l2(0.001))
        )
        stack.append(layers.BatchNormalization())
        stack.append(layers.Dropout(dropout_rate))

    # Single sigmoid unit: the predicted probability of the positive class.
    stack.append(layers.Dense(1, activation='sigmoid'))

    return keras.Sequential(stack)

# Instantiate the network with its default dimensions and print the layer table.
model = create_model()
model.summary()

# Besides accuracy, track precision, recall, ROC-AUC and PR-AUC, which stay
# informative when the classes are imbalanced.
optimizer = keras.optimizers.Adam(learning_rate=0.001)
training_metrics = [
    'accuracy',
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
    keras.metrics.AUC(name='pr_auc', curve='PR'),  # area under the precision-recall curve
]
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=training_metrics)

print("\n‚úÖ Modelo creado y compilado")
print(f"‚úÖ Total par√°metros: {model.count_params():,}")

In [0]:
%python
# List the training Parquet folder to check that the exported files are there.
dbutils.fs.ls("/tmp/gpereyra/pp_nn/churn3_tfdata/train")

In [0]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
import datetime

import tensorflow as tf

# Directory for checkpoints and TensorBoard logs.
# NOTE(review): Keras receives this path as-is for `filepath` / `log_dir`; confirm
# that it resolves to the intended storage on this cluster.
model_path = f"{PREFIX_PATH}gpereyra/pp_nn/churn3_models"
# Add the dbfs: scheme only when it is missing (same rule as
# create_tf_dataset_from_parquet); blindly prefixing gave "dbfs:dbfs:/..." whenever
# PREFIX_PATH already carried the scheme.
model_path_dbfs = model_path if model_path.startswith("dbfs:") else f"dbfs:{model_path}"
dbutils.fs.mkdirs(model_path_dbfs)

BATCH_SIZE = 2048

callbacks = [
    # Stop when validation AUC has not improved for 5 epochs and restore the best weights.
    EarlyStopping(
        monitor='val_auc',
        patience=5,
        mode='max',
        restore_best_weights=True,
        verbose=1
    ),

    # Keep only the checkpoint with the best validation AUC.
    ModelCheckpoint(
        filepath=f"{model_path}/best_model.keras",
        monitor='val_auc',
        mode='max',
        save_best_only=True,
        verbose=1
    ),

    # Halve the learning rate after 3 epochs without improvement in validation AUC.
    ReduceLROnPlateau(
        monitor='val_auc',
        factor=0.5,
        patience=3,
        mode='max',
        min_lr=1e-6,
        verbose=1
    ),

    # TensorBoard logs (optional), one timestamped folder per run.
    TensorBoard(
        log_dir=f"{model_path}/logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"),
        histogram_freq=0
    )
]

print("Callbacks configurados")

print("\nIniciando entrenamiento...")
print(f"Epochs: 20 (con early stopping)")
print(f"Batch size: {BATCH_SIZE}")
print(f"Steps per epoch: {steps_per_epoch}")

gpus = tf.config.list_physical_devices('GPU')
print("GPUs found:", gpus)
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except Exception as e:
        # Best effort only: the setting is optional, so report and carry on.
        print("Could not set memory growth:", e)

# BATCH_SIZE is intentionally not changed here: train_dataset/val_dataset were
# already batched when they were built, so doubling the constant at this point
# would not change training and would only make the printed batch size wrong.

history = model.fit(
    # steps_per_epoch needs a dataset that does not run dry: a finite dataset is
    # exhausted after the first epoch and Keras then interrupts the remaining ones
    # ("Your input ran out of data"). repeat() restarts (and reshuffles) the data.
    train_dataset.repeat(),
    epochs=20,
    steps_per_epoch=steps_per_epoch,
    # No validation_steps: the finite validation set is consumed in full every epoch.
    validation_data=val_dataset,
    class_weight=class_weight,
    callbacks=callbacks,
    verbose=1
)

print("\nEntrenamiento completado!")

In [0]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, precision_recall_curve

# Load the checkpoint with the best validation AUC.
best_model = keras.models.load_model(f"{model_path}/best_model.keras")

# Read the test split from the folder that sits next to the training data the
# model was fitted on. The previous hard-coded /mnt/... folder is a different
# location from the one train/val are read from, so it could hold an older
# export with a different scaling or feature count.
test_path = f"{train_path.rsplit('/', 1)[0]}/test"
test_dataset = create_tf_dataset_from_parquet(test_path, BATCH_SIZE, shuffle=False)

In [0]:

# Row count of the scaled test table, kept for reference.
test_size = spark.table("teams.data_science.gp_pp_test_churn3_scaled").count()

print("üìä Evaluando en test set...")
# No `steps`: the dataset is finite, so every row is evaluated (test_size // BATCH_SIZE
# dropped the last partial batch and is 0 for small test sets).
# return_dict=True gives one entry per metric; zipping with metrics_names only
# printed "loss" and a single "compile_metrics" value.
test_results = best_model.evaluate(test_dataset, verbose=1, return_dict=True)

print("\nüìà M√©tricas en Test:")
for name, value in test_results.items():
    print(f"{name}: {value:.4f}")

# Collect predicted probabilities and true labels batch by batch.
print("\nüîÆ Generando predicciones...")
y_pred_proba = []
y_true = []

for x_batch, y_batch in test_dataset:
    # predict_on_batch avoids the per-call overhead of predict() inside a loop.
    preds = best_model.predict_on_batch(x_batch)
    y_pred_proba.extend(preds.flatten())
    y_true.extend(y_batch.numpy())

y_pred_proba = np.array(y_pred_proba)
y_true = np.array(y_true)

print(f"‚úÖ Predicciones obtenidas: {len(y_pred_proba):,}")

# Evaluate with the default 0.5 decision threshold.
y_pred_default = (y_pred_proba >= 0.5).astype(int)

print("\nüìä Resultados con threshold=0.5:")
print(classification_report(y_true, y_pred_default, target_names=['No Churn', 'Churn']))
print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred_default))

In [0]:
# üìà M√©tricas en Test:
# loss: 0.6047
# compile_metrics: 0.8058

# üîÆ Generando predicciones...
# ‚úÖ Predicciones obtenidas: 10,033

# üìä Resultados con threshold=0.5:
#               precision    recall  f1-score   support

#     No Churn       0.99      0.81      0.89      9701
#        Churn       0.13      0.82      0.22       332

#     accuracy                           0.81     10033
#    macro avg       0.56      0.81      0.55     10033
# weighted avg       0.96      0.81      0.87     10033


# Confusion Matrix:
# [[7828 1873]
#  [  60  272]]

In [0]:
# 10k
# ‚úÖ Predicciones obtenidas: 1,057

# üìä Resultados con threshold=0.5:
#               precision    recall  f1-score   support

#     No Churn       0.99      0.84      0.91      1013
#        Churn       0.18      0.84      0.30        44

#     accuracy                           0.84      1057
#    macro avg       0.59      0.84      0.60      1057
# weighted avg       0.96      0.84      0.88      1057


# Confusion Matrix:
# [[846 167]
#  [  7  37]]



# 100k
# üìà M√©tricas en Test:
# loss: 0.5824
# compile_metrics: 0.8002

# üîÆ Generando predicciones...
# ‚úÖ Predicciones obtenidas: 10,073

# üìä Resultados con threshold=0.5:
#               precision    recall  f1-score   support

#     No Churn       0.99      0.80      0.88      9730
#        Churn       0.12      0.81      0.22       343

#     accuracy                           0.80     10073
#    macro avg       0.56      0.81      0.55     10073
# weighted avg       0.96      0.80      0.86     10073


# Confusion Matrix:
# [[7764 1966]
#  [  64  279]]

# 100k -- red mas simple
# üìà M√©tricas en Test:
# loss: 0.5279
# compile_metrics: 0.8391

# üîÆ Generando predicciones...
# ‚úÖ Predicciones obtenidas: 10,073

# üìä Resultados con threshold=0.5:
#               precision    recall  f1-score   support

#     No Churn       0.99      0.84      0.91      9730
#        Churn       0.14      0.77      0.24       343

#     accuracy                           0.84     10073
#    macro avg       0.57      0.80      0.58     10073
# weighted avg       0.96      0.84      0.89     10073


# Confusion Matrix:
# [[8173 1557]
#  [  80  263]]

In [0]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, precision_recall_curve, f1_score

# Load the checkpoint with the best validation AUC.
best_model = keras.models.load_model(f"{model_path}/best_model.keras")

# Read the test split from the folder that sits next to the training data the
# model was fitted on. The previous hard-coded /mnt/... folder is a different
# location from the one train/val are read from, so it could hold an older
# export with a different scaling or feature count.
test_path = f"{train_path.rsplit('/', 1)[0]}/test"
test_dataset = create_tf_dataset_from_parquet(test_path, BATCH_SIZE, shuffle=False)

# Row count of the scaled test table, printed for reference.
test_size = spark.table("teams.data_science.gp_pp_test_churn3_scaled").count()

print(f"Test size: {test_size:,}")

print("\nüìä Evaluando en test set...")
# No `steps`: the dataset is finite, so every row is evaluated (test_size // BATCH_SIZE
# dropped the last partial batch and is 0 for small test sets).
# return_dict=True gives one entry per metric; zipping with metrics_names only
# printed "loss" and a single "compile_metrics" value.
test_results = best_model.evaluate(test_dataset, verbose=1, return_dict=True)

print("\nüìà M√©tricas en Test:")
for name, value in test_results.items():
    print(f"{name}: {value:.4f}")

# Collect predicted probabilities and true labels batch by batch.
print("\nüîÆ Generando predicciones...")
y_pred_proba = []
y_true = []

for x_batch, y_batch in test_dataset:
    # predict_on_batch avoids the per-call overhead of predict() inside a loop.
    preds = best_model.predict_on_batch(x_batch)
    y_pred_proba.extend(preds.flatten())
    y_true.extend(y_batch.numpy())

y_pred_proba = np.array(y_pred_proba)
y_true = np.array(y_true)

print(f"‚úÖ Predicciones obtenidas: {len(y_pred_proba):,}")
print(f"   Distribuci√≥n real - Churn: {y_true.sum():,} ({y_true.sum()/len(y_true)*100:.2f}%)")

# Results with the default 0.5 decision threshold.
print("\n" + "="*60)
print("üìä RESULTADOS CON THRESHOLD = 0.5 (default)")
print("="*60)
y_pred_default = (y_pred_proba >= 0.5).astype(int)
print(classification_report(y_true, y_pred_default, target_names=['No Churn', 'Churn'], digits=4))

cm_default = confusion_matrix(y_true, y_pred_default)
print("Confusion Matrix:")
print(cm_default)
print(f"\nFalsos Negativos (churners perdidos): {cm_default[1,0]:,}")
print(f"Verdaderos Positivos (churners detectados): {cm_default[1,1]:,}")

# Threshold search under three criteria.
# NOTE(review): the thresholds are picked on the test set, so the metrics reported
# for them are optimistic; choosing them on the validation split would keep the
# test figures unbiased.
print("\n" + "="*60)
print("üîç B√öSQUEDA DE THRESHOLD √ìPTIMO")
print("="*60)

# 1. Threshold (grid 0.10 .. 0.85, step 0.05) that maximises the F1 score.
thresholds_to_test = np.arange(0.1, 0.9, 0.05)
f1_scores = [
    f1_score(y_true, (y_pred_proba >= thresh).astype(int))
    for thresh in thresholds_to_test
]

best_f1_idx = np.argmax(f1_scores)
best_f1_threshold = thresholds_to_test[best_f1_idx]

print(f"\n1Ô∏è‚É£ Threshold que maximiza F1-Score: {best_f1_threshold:.2f}")
print(f"   F1-Score: {f1_scores[best_f1_idx]:.4f}")

# 2. Precision-recall curve, used by criteria 2 and 3. This computation had been
#    commented out, which left `precision`, `recall`, `pr_thresholds` and the
#    recall-target variables undefined and made the prints below fail.
precision, recall, pr_thresholds = precision_recall_curve(y_true, y_pred_proba)

# Curve point whose recall is closest to the 70% target. `precision`/`recall` have
# one more entry than `pr_thresholds`, hence the bounds check.
target_recall = 0.70
idx_recall_70 = np.argmin(np.abs(recall - target_recall))
threshold_recall_70 = pr_thresholds[idx_recall_70] if idx_recall_70 < len(pr_thresholds) else 0.5

print(f"\n2Ô∏è‚É£ Threshold para Recall ‚â• {target_recall:.0%}: {threshold_recall_70:.3f}")
print(f"   Recall logrado: {recall[idx_recall_70]:.4f}")
print(f"   Precision: {precision[idx_recall_70]:.4f}")

# 3. Curve point whose precision is closest to the 15% target.
target_precision = 0.15
idx_prec_15 = np.argmin(np.abs(precision - target_precision))
threshold_prec_15 = pr_thresholds[idx_prec_15] if idx_prec_15 < len(pr_thresholds) else 0.5

print(f"\n3Ô∏è‚É£ Threshold para Precision ‚â• {target_precision:.0%}: {threshold_prec_15:.3f}")
print(f"   Precision lograda: {precision[idx_prec_15]:.4f}")
print(f"   Recall: {recall[idx_prec_15]:.4f}")

# Detailed report at the F1-optimal threshold.
print("\n" + "="*60)
print(f"üìä RESULTADOS CON THRESHOLD √ìPTIMO = {best_f1_threshold:.2f}")
print("="*60)
y_pred_optimal = (y_pred_proba >= best_f1_threshold).astype(int)
print(classification_report(y_true, y_pred_optimal, target_names=['No Churn', 'Churn'], digits=4))

cm_optimal = confusion_matrix(y_true, y_pred_optimal)
print("Confusion Matrix:")
print(cm_optimal)
print(f"\nFalsos Negativos (churners perdidos): {cm_optimal[1,0]:,}")
print(f"Verdaderos Positivos (churners detectados): {cm_optimal[1,1]:,}")
print(f"Mejora en detecci√≥n: {(cm_optimal[1,1] - cm_default[1,1]):,} churners m√°s detectados")

In [0]:
# Confusion matrix at the F1-optimal threshold. The original referenced `y_pred`,
# which is never defined in this notebook; `y_pred_optimal` is the prediction
# vector computed for that threshold in the previous cell.
# NOTE(review): if a different threshold was intended here, build that prediction
# vector explicitly, e.g. (y_pred_proba >= threshold).astype(int).
cm_optimal = confusion_matrix(y_true, y_pred_optimal)
print("Confusion Matrix:")
print(cm_optimal)
print(f"\nFalsos Negativos (churners perdidos): {cm_optimal[1,0]:,}")
print(f"Verdaderos Positivos (churners detectados): {cm_optimal[1,1]:,}")
print(f"Mejora en detecci√≥n: {(cm_optimal[1,1] - cm_default[1,1]):,} churners m√°s detectados")

In [0]:
# Compute the precision-recall curve in this cell so the plot does not depend on
# variables that no live code defines.
precision, recall, pr_thresholds = precision_recall_curve(y_true, y_pred_proba)

# precision/recall have one more entry than pr_thresholds (the final point has no
# threshold), so the last element is dropped to align the arrays.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(pr_thresholds, recall[:-1], label='Recall', color='green')
ax.plot(pr_thresholds, precision[:-1], label='Precision', color='orange')
# Reference line at the 0.85 threshold under consideration.
ax.axvline(0.85, color='gray', linestyle='--', label='Threshold 0.85')
ax.set_xlabel('Threshold')
ax.set_ylabel('Score')
ax.set_title('Precision y Recall seg√∫n Threshold')
ax.legend()
ax.grid(True)
plt.show()