1. Carga del dataset desde CSV
2. Exploración y limpieza inicial
3. Transformaciones
4. Preparación de datos para ML (indexado y ensamblado)
5. Separación en train/test
6. Entrenamiento del modelo
7. Evaluación
8. (Opcional) Guardado del modelo o predicciones


In [0]:
# 1. Creamos el DataFrame desde datos simulados
data = [
    (1, "mujer", 25, 32000, "España", 0),
    (2, "hombre", 45, 57000, "México", 1),
    (3, "mujer", 31, 43000, "España", 0),
    (4, "hombre", 29, 25000, "Chile", 0),
    (5, "mujer", 38, 72000, "México", 1),
    (6, "hombre", 35, 50000, "Argentina", 0),
    (7, "mujer", 22, 28000, "Chile", 1),
    (8, "hombre", 39, 61000, "España", 0),
    (9, "mujer", 33, 46000, "México", 1),
    (10, "hombre", 28, 34000, "Argentina", 0)
]

columnas = ["id_cliente", "genero", "edad", "ingresos", "pais", "churn"]
df = spark.createDataFrame(data, columnas)
df.show()

+----------+------+----+--------+---------+-----+
|id_cliente|genero|edad|ingresos|     pais|churn|
+----------+------+----+--------+---------+-----+
|         1| mujer|  25|   32000|   España|    0|
|         2|hombre|  45|   57000|   México|    1|
|         3| mujer|  31|   43000|   España|    0|
|         4|hombre|  29|   25000|    Chile|    0|
|         5| mujer|  38|   72000|   México|    1|
|         6|hombre|  35|   50000|Argentina|    0|
|         7| mujer|  22|   28000|    Chile|    1|
|         8|hombre|  39|   61000|   España|    0|
|         9| mujer|  33|   46000|   México|    1|
|        10|hombre|  28|   34000|Argentina|    0|
+----------+------+----+--------+---------+-----+



In [0]:
# 2. Inspeccionar el esquema

df.printSchema()


root
 |-- id_cliente: long (nullable = true)
 |-- genero: string (nullable = true)
 |-- edad: long (nullable = true)
 |-- ingresos: long (nullable = true)
 |-- pais: string (nullable = true)
 |-- churn: long (nullable = true)



In [0]:
# 3. Indexar columnas categóricas
# Usaremos StringIndexer para genero y pais.

from pyspark.ml.feature import StringIndexer

index_genero = StringIndexer(inputCol="genero", outputCol="genero_idx")
index_pais = StringIndexer(inputCol="pais", outputCol="pais_idx")

df_indexado = index_genero.fit(df).transform(df)
df_indexado = index_pais.fit(df_indexado).transform(df_indexado)
df_indexado.select("genero", "genero_idx", "pais", "pais_idx").show()

+------+----------+---------+--------+
|genero|genero_idx|     pais|pais_idx|
+------+----------+---------+--------+
| mujer|       1.0|   España|     0.0|
|hombre|       0.0|   México|     1.0|
| mujer|       1.0|   España|     0.0|
|hombre|       0.0|    Chile|     3.0|
| mujer|       1.0|   México|     1.0|
|hombre|       0.0|Argentina|     2.0|
| mujer|       1.0|    Chile|     3.0|
|hombre|       0.0|   España|     0.0|
| mujer|       1.0|   México|     1.0|
|hombre|       0.0|Argentina|     2.0|
+------+----------+---------+--------+



In [0]:
# 4. VectorAssembler para preparar las features

from pyspark.ml.feature import VectorAssembler

ensamblador = VectorAssembler(
    inputCols=["genero_idx", "pais_idx", "edad", "ingresos"],
    outputCol="features"
)
df_final = ensamblador.transform(df_indexado)
df_final.select("features", "churn").show(truncate=False)

+----------------------+-----+
|features              |churn|
+----------------------+-----+
|[1.0,0.0,25.0,32000.0]|0    |
|[0.0,1.0,45.0,57000.0]|1    |
|[1.0,0.0,31.0,43000.0]|0    |
|[0.0,3.0,29.0,25000.0]|0    |
|[1.0,1.0,38.0,72000.0]|1    |
|[0.0,2.0,35.0,50000.0]|0    |
|[1.0,3.0,22.0,28000.0]|1    |
|[0.0,0.0,39.0,61000.0]|0    |
|[1.0,1.0,33.0,46000.0]|1    |
|[0.0,2.0,28.0,34000.0]|0    |
+----------------------+-----+



In [0]:
# 5. Separar en train/test

df_train, df_test = df_final.randomSplit([0.7, 0.3], seed=42)


In [0]:
# 6. Entrenar modelo

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="churn")
modelo = lr.fit(df_train)


In [0]:
# 7. Predecir y evaluar

predicciones = modelo.transform(df_test)
predicciones.select("churn", "prediction", "probability").show(truncate=False)

from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluador = BinaryClassificationEvaluator(labelCol="churn", rawPredictionCol="rawPrediction")
roc_auc = evaluador.evaluate(predicciones)
print("Área bajo la curva ROC:", roc_auc)


+-----+----------+-------------------------------------------+
|churn|prediction|probability                                |
+-----+----------+-------------------------------------------+
|1    |0.0       |[0.9999910380582799,8.961941720064814E-6]  |
|0    |0.0       |[0.918814419110038,0.08118558088996197]    |
|0    |0.0       |[0.9999999998566282,1.4337175890943854E-10]|
+-----+----------+-------------------------------------------+

Área bajo la curva ROC: 0.5


In [0]:
# 8. (Opcional) Guardar predicciones

predicciones.select("id_cliente", "prediction").write.mode("overwrite").csv("/tmp/predicciones_clientes.csv", header=True)
