In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

# %%
# Get (or reuse) a Spark session for this notebook, running on the local master.
spark = (
    SparkSession.builder
    .appName("SECOP_RegresionLineal")
    .master("local[*]")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/30 00:23:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/01/30 00:23:41 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
26/01/30 00:23:59 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [3]:
# Load the prepared dataset, rename the target and feature-vector columns to
# the names the estimator is configured with ("label" / "features"), and drop
# rows whose target is null.
df = (
    spark.read.parquet("/opt/spark-data/processed/secop_ml_ready.parquet")
    .withColumnRenamed("valor_del_contrato_num", "label")
    .withColumnRenamed("features_pca", "features")
    .filter(col("label").isNotNull())
)

# Quick sanity check on the size of the resulting frame.
print(f"Registros: {df.count():,}")
print(f"Columnas: {len(df.columns)}")

Registros: 1,000
Columnas: 2


In [12]:
# 70/30 train/test partition with a fixed seed.
train, test = df.randomSplit(weights=[0.7, 0.3], seed=45)

# Report how many rows landed in each partition (train first, then test).
for split in (train, test):
    print(split.count())

719
281


In [13]:
# Hyperparameters for the linear-regression estimator, kept together so they
# are easy to read and tweak. regParam=0.0 means no regularization is applied.
lr_params = {
    "featuresCol": "features",
    "labelCol": "label",
    "maxIter": 100,
    "regParam": 0.0,
    "elasticNetParam": 0.0,
}
lr = LinearRegression(**lr_params)

In [14]:
# Fit the estimator on the training split only; the test split stays unseen.
lr_model=lr.fit(train)

26/01/30 00:34:16 WARN Instrumentation: [7cf1441d] regParam is zero, which might cause numerical instability and overfitting.
26/01/30 00:34:17 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
26/01/30 00:34:17 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
26/01/30 00:34:17 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [18]:
# Show the value of the fitted model's `solver` parameter.
lr_model.getSolver()

'auto'

In [19]:
# RMSE reported by the fitted model's training summary (training data, not the test split).
lr_model.summary.rootMeanSquaredError

800092203.0977905

In [20]:
# R^2 reported by the fitted model's training summary (training data, not the test split).
lr_model.summary.r2

0.039694832196779495

In [21]:
# Apply the fitted model to the held-out test split to obtain predictions.
predictions= lr_model.transform(test)

In [24]:
def _evaluator_for(metric):
    """Build a RegressionEvaluator comparing `prediction` to `label` using `metric`."""
    return RegressionEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric,
    )


# One evaluator per metric, created in the order rmse -> mae -> r2.
evaluator_rmse, evaluator_mae, evaluator_r2 = (
    _evaluator_for(metric) for metric in ("rmse", "mae", "r2")
)

# Score the test-set predictions with each evaluator, in the same order.
rmse, mae, r2 = (
    evaluator.evaluate(predictions)
    for evaluator in (evaluator_rmse, evaluator_mae, evaluator_r2)
)

print("=" * 60)
print("rmse:", rmse)
print("mae:", mae)
print("r2:", r2)

rmse: 113311038563.22714
mae: 6924231639.562357
r2: -0.003060618702814377


In [25]:
# Save the fitted model.
# Go through the writer with overwrite(): a plain `lr_model.save(path)` raises
# when the path already exists, which broke every re-run of this notebook.
# This also matches the overwrite mode used for the predictions below.
model_path = "/opt/spark-data/processed/linear_regression_model"
lr_model.write().overwrite().save(model_path)
print(f"\nModelo guardado en: {model_path}")

# %%
# Save the test-set predictions as Parquet, replacing any previous output.
predictions_path = "/opt/spark-data/processed/predictions_lr.parquet"
predictions.write.mode("overwrite").parquet(predictions_path)
print(f"Predicciones guardadas en: {predictions_path}")

# %%
# Stop the Spark session once all outputs have been written.
spark.stop()


Modelo guardado en: /opt/spark-data/processed/linear_regression_model


26/01/30 00:55:16 WARN FileUtil: Failed to delete file or dir [/opt/spark-data/processed/predictions_lr.parquet/_temporary/0]: it still exists.


Predicciones guardadas en: /opt/spark-data/processed/predictions_lr.parquet
