In [11]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
import mlflow
import mlflow.spark

# %%
# Create (or reuse) the Spark session for this notebook, running in local
# mode with the "local[*]" master.
spark = (
    SparkSession.builder
    .appName("SECOP_MLflow")
    .master("local[*]")
    .getOrCreate()
)

26/01/30 01:49:47 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [12]:
# Point the MLflow client at the tracking server reachable at http://mlflow:5000.
mlflow.set_tracking_uri("http://mlflow:5000")

In [13]:
# Experiment under which the runs in this notebook are recorded. The call is
# left as the last expression so the notebook displays the Experiment object.
experiment_name = "secop_prediccion"
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='file:///opt/mlflow/mlruns/402490040680732574', creation_time=1769736231544, experiment_id='402490040680732574', last_update_time=1769736231544, lifecycle_stage='active', name='secop_prediccion', tags={}>

In [3]:
# Load the prepared dataset, rename the target and feature columns to
# "label" / "features" (the column names passed to the estimator and
# evaluators in this notebook), and drop rows without a label.
df = (
    spark.read.parquet("/opt/spark-data/processed/secop_ml_ready.parquet")
    .withColumnRenamed("valor_del_contrato_num", "label")
    .withColumnRenamed("features_pca", "features")
    .filter(col("label").isNotNull())
)

# 80/20 train/test split with a fixed seed.
train, test = df.randomSplit([0.8, 0.2], seed=42)

print(f"Train: {train.count():,}")
print(f"Test: {test.count():,}")

# %%
# One evaluator per reported metric; all three compare the "label" column
# against the "prediction" column and differ only in the metric name.
evaluator_rmse, evaluator_mae, evaluator_r2 = (
    RegressionEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric_name,
    )
    for metric_name in ("rmse", "mae", "r2")
)

                                                                                

Train: 838
Test: 162


In [10]:
print("Experimento 1")
# Baseline run: linear regression with regParam = 0 and elasticNetParam = 0.
with mlflow.start_run(run_name="baseline_model"):
    # Hyperparameters for this run.
    reg_param = 0.0
    elastic_param = 0.0
    max_iter = 100

    # Log the hyperparameters under the same keys the other experiment runs
    # use ("elasticNetParam", "maxIter", "model_type") so that all runs line
    # up in the same columns when compared in the MLflow UI.
    mlflow.log_param("regParam", reg_param)
    mlflow.log_param("elasticNetParam", elastic_param)
    mlflow.log_param("maxIter", max_iter)
    mlflow.log_param("model_type", "Baseline")

    lr = LinearRegression(
        featuresCol="features",
        labelCol="label",
        regParam=reg_param,
        elasticNetParam=elastic_param,
        maxIter=max_iter
    )

    # Fit on the training split and score the held-out test split.
    model = lr.fit(train)
    predictions = model.transform(test)

    # Evaluate on the test predictions.
    rmse = evaluator_rmse.evaluate(predictions)
    mae = evaluator_mae.evaluate(predictions)
    r2 = evaluator_r2.evaluate(predictions)

    # Log the metrics.
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    # Store the fitted model as a run artifact under "model".
    mlflow.spark.log_model(model, "model")

    print(f"✓ RMSE: ${rmse:,.2f}")
    print(f"✓ MAE: ${mae:,.2f}")
    print(f"✓ R²: {r2:.4f}")


# %%
# =====================================
# EXPERIMENT 2: Ridge (L2)
# =====================================

print("\n=== EXPERIMENTO 2: Ridge Regression (L2) ===")

with mlflow.start_run(run_name="ridge_l2_regression"):
    # Hyperparameters for this run.
    reg_param = 0.1
    elastic_param = 0.0  # pure L2 penalty
    max_iter = 100

    mlflow.log_param("regParam", reg_param)
    mlflow.log_param("elasticNetParam", elastic_param)
    mlflow.log_param("maxIter", max_iter)
    mlflow.log_param("model_type", "Ridge")

    lr = LinearRegression(
        featuresCol="features",
        labelCol="label",
        regParam=reg_param,
        elasticNetParam=elastic_param,
        maxIter=max_iter
    )

    # Fit on the training split and score the held-out test split.
    model = lr.fit(train)
    predictions = model.transform(test)

    # Use the evaluators defined once near the top of the notebook. The
    # previous `evaluator` name is not defined anywhere in this notebook and
    # raises NameError on a fresh kernel.
    rmse = evaluator_rmse.evaluate(predictions)
    mae = evaluator_mae.evaluate(predictions)
    r2 = evaluator_r2.evaluate(predictions)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    # Store the fitted model as a run artifact under "model".
    mlflow.spark.log_model(model, "model")

    print(f"✓ RMSE: ${rmse:,.2f}")

# %%
# =====================================
# EXPERIMENT 3: Lasso (L1)
# =====================================

print("\n=== EXPERIMENTO 3: Lasso Regression (L1) ===")

with mlflow.start_run(run_name="lasso_l1_regression"):
    # Hyperparameters for this run.
    reg_param = 0.1
    elastic_param = 1.0  # pure L1 penalty
    max_iter = 100

    mlflow.log_param("regParam", reg_param)
    mlflow.log_param("elasticNetParam", elastic_param)
    mlflow.log_param("maxIter", max_iter)
    mlflow.log_param("model_type", "Lasso")

    lr = LinearRegression(
        featuresCol="features",
        labelCol="label",
        regParam=reg_param,
        elasticNetParam=elastic_param,
        maxIter=max_iter
    )

    # Fit on the training split and score the held-out test split.
    model = lr.fit(train)
    predictions = model.transform(test)

    # Use the evaluators defined once near the top of the notebook. The
    # previous `evaluator` name is not defined anywhere in this notebook and
    # raises NameError on a fresh kernel.
    rmse = evaluator_rmse.evaluate(predictions)
    mae = evaluator_mae.evaluate(predictions)
    r2 = evaluator_r2.evaluate(predictions)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    # Store the fitted model as a run artifact under "model".
    mlflow.spark.log_model(model, "model")

    print(f"✓ RMSE: ${rmse:,.2f}")

# %%
# =====================================
# EXPERIMENT 4: ElasticNet
# =====================================

print("\n=== EXPERIMENTO 4: ElasticNet (L1 + L2) ===")

with mlflow.start_run(run_name="elasticnet_l1_l2"):
    # Hyperparameters for this run.
    reg_param = 0.1
    elastic_param = 0.5  # 50/50 mix of L1 and L2
    max_iter = 100

    mlflow.log_param("regParam", reg_param)
    mlflow.log_param("elasticNetParam", elastic_param)
    mlflow.log_param("maxIter", max_iter)
    mlflow.log_param("model_type", "ElasticNet")

    lr = LinearRegression(
        featuresCol="features",
        labelCol="label",
        regParam=reg_param,
        elasticNetParam=elastic_param,
        maxIter=max_iter
    )

    # Fit on the training split and score the held-out test split.
    model = lr.fit(train)
    predictions = model.transform(test)

    # Use the evaluators defined once near the top of the notebook. The
    # previous `evaluator` name is not defined anywhere in this notebook and
    # raises NameError on a fresh kernel.
    rmse = evaluator_rmse.evaluate(predictions)
    mae = evaluator_mae.evaluate(predictions)
    r2 = evaluator_r2.evaluate(predictions)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    # Store the fitted model as a run artifact under "model".
    mlflow.spark.log_model(model, "model")

    print(f"✓ RMSE: ${rmse:,.2f}")

# %%
# Closing banner: how many runs were logged and where to look at them.
banner_rule = "=" * 60
print("\n" + banner_rule)
print("EXPERIMENTOS COMPLETADOS")
print(banner_rule)
print("✓ 4 experimentos registrados en MLflow")
print("✓ Accede a MLflow UI: http://localhost:5000")
print(f"✓ Experimento: {experiment_name}")
print(banner_rule)

# %%
# NOTE: the Spark session is deliberately NOT stopped here. Cells further
# down call `loaded_model.transform(df)`, and `df` belongs to this session,
# so stopping it at this point makes a top-to-bottom "Restart & Run All"
# fail. The session is shut down by the `spark.stop()` cell that follows
# the prediction step.



Experimento 1


26/01/30 01:38:26 WARN Instrumentation: [9ac2861f] regParam is zero, which might cause numerical instability and overfitting.


✓ RMSE: $29,056,073,785.67
✓ MAE: $16,319,057,754.43
✓ R²: -8333.6628

=== EXPERIMENTO 2: Ridge Regression (L2) ===
✓ RMSE: $29,056,073,785.62

=== EXPERIMENTO 3: Lasso Regression (L1) ===
✓ RMSE: $29,056,074,863.70

=== EXPERIMENTO 4: ElasticNet (L1 + L2) ===
✓ RMSE: $29,056,072,667.81

EXPERIMENTOS COMPLETADOS
✓ 4 experimentos registrados en MLflow
✓ Accede a MLflow UI: http://localhost:5000
✓ Experimento: secop_prediccion


In [7]:
### Load a registered model ###

import mlflow.spark
from pyspark.sql import SparkSession

# 1. Build the model URI from the registered model name. The "models:/"
#    scheme tells MLflow to look the model up in the Model Registry rather
#    than in a folder.
model_name = "elastic"
model_version = "latest"  # a specific version such as "1" or "2" also works
model_uri = f"models:/{model_name}/{model_version}"

# 2. Load the model as a Spark ML object.
print(f"Cargando modelo '{model_name}' desde el registro...")
loaded_model = mlflow.spark.load_model(model_uri=model_uri)

print("✓ Modelo cargado exitosamente")

2026/01/30 01:48:07 INFO mlflow.spark: 'models:/elastic/latest' resolved as 'file:///opt/mlflow/mlruns/402490040680732574/42e2221c257d4694a5621b582fd62aa1/artifacts/model'


Cargando modelo 'elastic' desde el registro...


2026/01/30 01:48:07 INFO mlflow.spark: URI 'models:/elastic/latest/sparkml' does not point to the current DFS.
2026/01/30 01:48:07 INFO mlflow.spark: File 'models:/elastic/latest/sparkml' not found on DFS. Will attempt to upload the file.


✓ Modelo cargado exitosamente


In [8]:
# 3. Score data with the loaded model. In a real scenario this would be a
#    DataFrame of newly arrived contracts (which must have gone through the
#    same Scaler and PCA pipeline); here the already-prepared `df` is reused.
predictions = loaded_model.transform(df)

# 4. Preview the first five predictions next to their feature vectors.
predictions.select(["features", "prediction"]).show(n=5)

+--------------------+--------------------+
|            features|          prediction|
+--------------------+--------------------+
|[1.05511824914886...|1.579372016765363E11|
|[2.89965949085992...|-9.83453525732329E10|
|[1.91826045058565...|  8.51104667218737E9|
|[1.91000751697583...|3.685598427820685...|
|[1.91815735697773...|  8.53426230649346E9|
+--------------------+--------------------+
only showing top 5 rows



In [9]:
# Shut down the Spark session; the cells after this one only use the MLflow client.
spark.stop()

In [18]:
import mlflow
from mlflow.tracking import MlflowClient

# 1. Configure the tracking URI before creating the client.
mlflow.set_tracking_uri("http://mlflow:5000")
client = MlflowClient()

# 2. List the experiments on the server so the name used below can be checked.
print("Experimentos disponibles en el servidor:")
for known_experiment in client.search_experiments():
    print(f" - {known_experiment.name}")

# 3. Look the experiment up by name and stop early if it does not exist.
experiment_name = "secop_prediccion"  # should match one of the names printed above
experiment = client.get_experiment_by_name(experiment_name)
if experiment is None:
    raise ValueError(f"No se encontró el experimento '{experiment_name}'. Revisa la lista de arriba.")

# 4. Fetch a single run, ordered by ascending RMSE (lowest first).
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.rmse ASC"],
    max_results=1,
)
if not runs:
    raise ValueError(f"El experimento '{experiment_name}' existe pero no tiene ninguna ejecución (runs).")

# Keep the run id; the registration cell builds its model URI from it.
best_run = runs[0]
best_run_id = best_run.info.run_id
print(f"✓ Éxito. Mejor Run ID: {best_run_id}")

Experimentos disponibles en el servidor:
 - secop_prediccion
✓ Éxito. Mejor Run ID: 42e2221c257d4694a5621b582fd62aa1


In [19]:
import mlflow

# 1. Make sure the client talks to the tracking server.
mlflow.set_tracking_uri("http://mlflow:5000")

# 2. URI of the model artifact logged by the selected run.
#    `best_run_id` must already be defined in the current session.
model_uri = f"runs:/{best_run_id}/model"

# 3. Register it under the name "mejor". If a model with that name already
#    exists, a new version (2, 3, ...) is created.
model_details = mlflow.register_model(model_uri, "mejor")

# 4. Report what was registered.
separator = "-" * 30
print(separator)
print("MODELO REGISTRADO COMO: mejor")
print(separator)
print(f"Versión: {model_details.version}")
print(f"Estado: {model_details.current_stage}")

Successfully registered model 'mejor'.
2026/01/30 01:54:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: mejor, version 1


------------------------------
MODELO REGISTRADO COMO: mejor
------------------------------
Versión: 1
Estado: None


Created version '1' of model 'mejor'.
