In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
import mlflow
import mlflow.spark

spark = (SparkSession.builder
    .appName("SECOP_MLflow_Tracking")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "1g")
    .config("spark.executor.cores", "1")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")
print("Spark Version:", spark.version)
print("Spark Master:", spark.sparkContext.master)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/15 03:03:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/02/15 03:03:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Spark Version: 3.5.0
Spark Master: spark://spark-master:7077


26/02/15 03:03:28 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [2]:
mlflow.set_tracking_uri("http://mlflow:5000")

In [3]:
experiment_name="secop_prediccion"
mlflow.set_experiment(experiment_name)

2026/02/15 03:03:53 INFO mlflow.tracking.fluent: Experiment with name 'secop_prediccion' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///opt/mlflow/mlruns/720835341887638586', creation_time=1771124633473, experiment_id='720835341887638586', last_update_time=1771124633473, lifecycle_stage='active', name='secop_prediccion', tags={}>

In [4]:
df = spark.read.parquet("/opt/spark-data/processed/secop_features_q4_2025.parquet") \
    .select("label", "features") \
    .filter(col("label").isNotNull())

# DEV 
DEV_N = 10000
df_dev = df.limit(DEV_N)

train, test = df_dev.randomSplit([0.8, 0.2], seed=42)
print("DEV:", df_dev.count(), "Train:", train.count(), "Test:", test.count())


26/02/15 03:05:09 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
26/02/15 03:05:24 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
                                                                                

DEV: 10000 Train: 8079 Test: 1921


                                                                                

In [5]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Modelo baseline
lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
    maxIter=50,
    regParam=0.1,
    elasticNetParam=0.0  # Ridge
)

e_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
e_mae  = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae")
e_r2   = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

with mlflow.start_run(run_name="baseline_lr_dev"):

    # Log de parámetros
    mlflow.log_param("DEV_N", DEV_N)
    mlflow.log_param("split_seed", 42)
    mlflow.log_param("train_count", train.count())
    mlflow.log_param("test_count", test.count())

    mlflow.log_param("model", "LinearRegression")
    mlflow.log_param("maxIter", lr.getMaxIter())
    mlflow.log_param("regParam", lr.getRegParam())
    mlflow.log_param("elasticNetParam", lr.getElasticNetParam())

    # Entrenar
    lr_model = lr.fit(train)

    # Predecir y métricas
    pred = lr_model.transform(test)

    rmse = e_rmse.evaluate(pred)
    mae  = e_mae.evaluate(pred)
    r2   = e_r2.evaluate(pred)

    print("RMSE:", rmse)
    print("MAE :", mae)
    print("R2  :", r2)

    # Log de métricas
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    # Guardar el modelo como artefacto (Spark model)
    mlflow.spark.log_model(lr_model, artifact_path="model")

    # Mostrar una muestra rápida
    pred.select("label", "prediction").show(10, truncate=False)

print("✅ Run registrado en MLflow")


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

26/02/15 03:10:23 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
26/02/15 03:10:23 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
26/02/15 03:10:23 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                             

RMSE: 1.6528961728056981
MAE : 0.7527625593240994
R2  : 0.17901077541197752




+-----+------------------+
|label|prediction        |
+-----+------------------+
|0.0  |17.06851142316909 |
|0.0  |17.551477227321904|
|0.0  |17.462276295238695|
|0.0  |15.628315001467271|
|0.0  |14.703777126466957|
|0.0  |17.624692244205548|
|0.0  |14.470992722958641|
|0.0  |16.32400850080554 |
|0.0  |13.141104909111553|
|0.0  |18.938634678105245|
+-----+------------------+
only showing top 10 rows

✅ Run registrado en MLflow


In [6]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

e_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
e_mae  = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae")
e_r2   = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

configs = [
    ("ridge_dev",       0.10, 0.0),  # Ridge
    ("lasso_dev",       0.10, 1.0),  # Lasso
    ("elasticnet_dev",  0.01, 0.5),  # ElasticNet
]

for run_name, regParam, elasticNetParam in configs:

    lr = LinearRegression(
        featuresCol="features",
        labelCol="label",
        predictionCol="prediction",
        maxIter=100,
        regParam=regParam,
        elasticNetParam=elasticNetParam
    )

    with mlflow.start_run(run_name=run_name):

        # params
        mlflow.log_param("DEV_N", DEV_N)
        mlflow.log_param("split_seed", 42)
        mlflow.log_param("train_count", train.count())
        mlflow.log_param("test_count", test.count())

        mlflow.log_param("model", "LinearRegression")
        mlflow.log_param("maxIter", lr.getMaxIter())
        mlflow.log_param("regParam", lr.getRegParam())
        mlflow.log_param("elasticNetParam", lr.getElasticNetParam())

        # fit + predict
        model = lr.fit(train)
        pred = model.transform(test)

        rmse = e_rmse.evaluate(pred)
        mae  = e_mae.evaluate(pred)
        r2   = e_r2.evaluate(pred)

        # metrics
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

        print(f"\n=== {run_name} ===")
        print("RMSE:", rmse)
        print("MAE :", mae)
        print("R2  :", r2)

print("\n✅ Modelos comparables registrados en MLflow")


                                                                                


=== ridge_dev ===
RMSE: 1.6528961728056981
MAE : 0.7527625593240994
R2  : 0.2464019632018094

=== lasso_dev ===
RMSE: 1.7264938945381962
MAE : 0.7519771349700406
R2  : 0.17779766845005485

=== elasticnet_dev ===
RMSE: 1.5354458623810536
MAE : 0.7328499019971944
R2  : 0.24457162285380762

✅ Modelos comparables registrados en MLflow


In [7]:
# === FINAL FULL con el mejor modelo (ElasticNet) ===
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

e_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
e_mae  = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae")
e_r2   = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

print("Leyendo FULL features...")
df_full = spark.read.parquet("/opt/spark-data/processed/secop_features_q4_2025.parquet") \
    .select("label", "features") \
    .filter(col("label").isNotNull())

n_full = df_full.count()
print("FULL registros:", n_full)

train_full, test_full = df_full.randomSplit([0.8, 0.2], seed=42)
print("Train FULL:", train_full.count(), "Test FULL:", test_full.count())

# Mejor configuración según DEV (ElasticNet)
best_regParam = 0.01
best_enet = 0.5

lr_final = LinearRegression(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
    maxIter=100,
    regParam=best_regParam,
    elasticNetParam=best_enet
)

with mlflow.start_run(run_name="final_full_elasticnet"):

    mlflow.log_param("mode", "FULL")
    mlflow.log_param("full_count", n_full)
    mlflow.log_param("split_seed", 42)
    mlflow.log_param("train_count", train_full.count())
    mlflow.log_param("test_count", test_full.count())

    mlflow.log_param("model", "LinearRegression")
    mlflow.log_param("maxIter", lr_final.getMaxIter())
    mlflow.log_param("regParam", lr_final.getRegParam())
    mlflow.log_param("elasticNetParam", lr_final.getElasticNetParam())

    print("\nEntrenando modelo FINAL (FULL)...")
    model_full = lr_final.fit(train_full)

    pred_full = model_full.transform(test_full)

    rmse_full = e_rmse.evaluate(pred_full)
    mae_full  = e_mae.evaluate(pred_full)
    r2_full   = e_r2.evaluate(pred_full)

    mlflow.log_metric("rmse", rmse_full)
    mlflow.log_metric("mae", mae_full)
    mlflow.log_metric("r2", r2_full)

    print("\n=== MÉTRICAS FINAL FULL (ElasticNet) ===")
    print("RMSE:", rmse_full)
    print("MAE :", mae_full)
    print("R2  :", r2_full)

    # Guardar modelo (artifact)
    mlflow.spark.log_model(model_full, artifact_path="model")

print("\n✅ Run FINAL FULL registrado en MLflow")


Leyendo FULL features...


                                                                                

FULL registros: 59125


                                                                                

Train FULL: 47486 Test FULL: 11639

Entrenando modelo FINAL (FULL)...


                                                                                


=== MÉTRICAS FINAL FULL (ElasticNet) ===
RMSE: 1.4640181311652123
MAE : 0.7391346100035858
R2  : 0.33908001080560213





✅ Run FINAL FULL registrado en MLflow


In [7]:
### Cargar modelos registrados ####

import mlflow.spark
from pyspark.sql import SparkSession

# 1. Definir la ruta usando el nombre que elegiste
# 'models:/' le dice a MLflow que busque en el registro oficial, no en una carpeta
model_name = "elastic"
model_version = "latest" # O puedes poner "1", "2", etc.
model_uri = f"models:/{model_name}/{model_version}"

# 2. Cargar el modelo como un objeto de Spark ML
print(f"Cargando modelo '{model_name}' desde el registro...")
loaded_model = mlflow.spark.load_model(model_uri)

print("✓ Modelo cargado exitosamente")

2026/01/30 01:48:07 INFO mlflow.spark: 'models:/elastic/latest' resolved as 'file:///opt/mlflow/mlruns/402490040680732574/42e2221c257d4694a5621b582fd62aa1/artifacts/model'


Cargando modelo 'elastic' desde el registro...


2026/01/30 01:48:07 INFO mlflow.spark: URI 'models:/elastic/latest/sparkml' does not point to the current DFS.
2026/01/30 01:48:07 INFO mlflow.spark: File 'models:/elastic/latest/sparkml' not found on DFS. Will attempt to upload the file.


✓ Modelo cargado exitosamente


In [8]:
# 3. Supongamos que 'df_nuevos_contratos' son datos que acaban de llegar
# (Deben haber pasado por el mismo Pipeline de Scaler y PCA)
predictions = loaded_model.transform(df)

# 4. Mostrar los resultados
predictions.select("features", "prediction").show(5)

+--------------------+--------------------+
|            features|          prediction|
+--------------------+--------------------+
|[1.05511824914886...|1.579372016765363E11|
|[2.89965949085992...|-9.83453525732329E10|
|[1.91826045058565...|  8.51104667218737E9|
|[1.91000751697583...|3.685598427820685...|
|[1.91815735697773...|  8.53426230649346E9|
+--------------------+--------------------+
only showing top 5 rows



In [9]:
spark.stop()

In [18]:
import mlflow
from mlflow.tracking import MlflowClient

# 1. Asegúrate de configurar la URI primero
mlflow.set_tracking_uri("http://mlflow:5000")
client = MlflowClient()

# 2. Listar experimentos para verificar el nombre real
print("Experimentos disponibles en el servidor:")
for exp in client.search_experiments():
    print(f" - {exp.name}")

# 3. Intentar obtener el experimento con el nombre correcto
experiment_name = "secop_prediccion" # Verifica si coincide con la lista de arriba
experiment = client.get_experiment_by_name(experiment_name)

if experiment is None:
    raise ValueError(f"No se encontró el experimento '{experiment_name}'. Revisa la lista de arriba.")

# 4. Si existe, buscar los runs
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.rmse ASC"],
    max_results=1
)

if not runs:
    raise ValueError(f"El experimento '{experiment_name}' existe pero no tiene ninguna ejecución (runs).")

best_run = runs[0]
best_run_id = best_run.info.run_id
print(f"✓ Éxito. Mejor Run ID: {best_run_id}")

Experimentos disponibles en el servidor:
 - secop_prediccion
✓ Éxito. Mejor Run ID: 42e2221c257d4694a5621b582fd62aa1


In [19]:
import mlflow

# 1. Configuración de acceso (asegúrate de que la URI sea la correcta)
mlflow.set_tracking_uri("http://mlflow:5000")

# 2. Ruta del modelo usando el ID que ya identificamos
# (Asegúrate de que 'best_run_id' esté definido en tu sesión actual)
model_uri = f"runs:/{best_run_id}/model"

# 3. Registrar con el nombre 'mejor'
# Si ya existía uno llamado 'mejor', creará la Versión 2, 3, etc.
model_details = mlflow.register_model(model_uri, "mejor")

print("-" * 30)
print(f"MODELO REGISTRADO COMO: mejor")
print("-" * 30)
print(f"Versión: {model_details.version}")
print(f"Estado: {model_details.current_stage}")

Successfully registered model 'mejor'.
2026/01/30 01:54:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: mejor, version 1


------------------------------
MODELO REGISTRADO COMO: mejor
------------------------------
Versión: 1
Estado: None


Created version '1' of model 'mejor'.


In [9]:
print("MLflow Tracking URI:", mlflow.get_tracking_uri())
print("MLflow UI: http://localhost:5000")

exp = mlflow.get_experiment_by_name("secop_prediccion")
print("\nExperiment:", exp.name)
print("Experiment ID:", exp.experiment_id)

runs_df = mlflow.search_runs(
    experiment_ids=[exp.experiment_id],
    order_by=["metrics.r2 DESC"],
    max_results=20
)

cols = ["run_id", "status", "start_time",
        "params.mode", "params.model", "params.regParam", "params.elasticNetParam",
        "metrics.rmse", "metrics.mae", "metrics.r2"]

show_df = runs_df[cols].copy()
display(show_df)

best = runs_df.sort_values("metrics.r2", ascending=False).iloc[0]

MLflow Tracking URI: http://mlflow:5000
MLflow UI: http://localhost:5000

Experiment: secop_prediccion
Experiment ID: 720835341887638586


Unnamed: 0,run_id,status,start_time,params.mode,params.model,params.regParam,params.elasticNetParam,metrics.rmse,metrics.mae,metrics.r2
0,4deed2d5fdff456b81d9a39393edf834,FINISHED,2026-02-15 03:22:19.973000+00:00,FULL,LinearRegression,0.01,0.5,1.464018,0.739135,0.33908
1,191f1f4e70db4a989221cc37b9e9f5be,FINISHED,2026-02-15 03:14:27.122000+00:00,,LinearRegression,0.1,0.0,1.652896,0.752763,0.246402
2,20788ff55dd14348a1c0a743e73a2e01,FINISHED,2026-02-15 03:14:43.442000+00:00,,LinearRegression,0.01,0.5,1.535446,0.73285,0.244572
3,bab68cb3c7df473b9daed96f9f01ec28,FINISHED,2026-02-15 03:10:14.290000+00:00,,LinearRegression,0.1,0.0,1.652896,0.752763,0.179011
4,7159e8b2f26a4127b0334a818d811be5,FINISHED,2026-02-15 03:14:37.170000+00:00,,LinearRegression,0.1,1.0,1.726494,0.751977,0.177798


### Resultados de Regresión – Comparación en MLflow

Durante esta fase se entrenaron modelos lineales regularizados (Ridge, Lasso y ElasticNet) y cada corrida fue registrada en MLflow con sus respectivos parámetros y métricas.

Siguiendo la estrategia recomendada en clase, primero se trabajó en modo **DEV** (subconjunto) para validar estabilidad y luego se ejecutó el entrenamiento **FULL** con el mejor enfoque identificado.

El mejor modelo registrado según R² fue:

- **Run ID:** final_full_elasticnet
- **Modo:** FULL  
- **regParam:** 0.01  
- **elasticNetParam:** 0.5  
- **RMSE:** 1.4640  
- **MAE:** 0.7391  
- **R²:** 0.3391  

Desde una perspectiva estadística, el R² indica que el modelo logra explicar aproximadamente el 34% de la variabilidad del logaritmo del valor del contrato. El RMSE, al penalizar más los errores grandes, es coherente con la posible presencia de contratos con valores atípicos. El MAE refleja el error promedio absoluto, mostrando estabilidad en las predicciones.

En conjunto, estos resultados evidencian un modelo lineal regularizado estable, reproducible y correctamente versionado mediante MLflow, cumpliendo buenas prácticas de experimentación y trazabilidad.

In [10]:
spark.stop()
print("SparkSession finalizada")

SparkSession finalizada
