In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import (
    StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
)
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, when, isnull

# %%
# Configurar SparkSession
spark = SparkSession.builder \
    .appName("SECOP_FeatureEngineering") \
    .master("local[*]") \
    .config("spark.executor.memory", "8g") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/29 01:26:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/01/29 01:26:00 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Cargar datos
df = spark.read.parquet("/opt/spark-data/processed/secop_eda.parquet")
print(f"Registros cargados: {df.count():,}")

# %%
# Explorar columnas disponibles
print("Columnas disponibles:")
for col_name in df.columns:
    print(f"  - {col_name}")

                                                                                

Registros cargados: 1,000
Columnas disponibles:
  - referencia_del_contrato
  - nit_entidad
  - nombre_entidad
  - departamento
  - ciudad
  - tipo_de_contrato
  - valor_del_contrato
  - fecha_de_firma
  - estado_contrato
  - valor_del_contrato_num
  - fecha_de_firma_parsed
  - anio
  - mes


26/01/29 01:26:16 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [3]:
# Seleccionar features para el modelo
# Variables categóricas
categorical_cols = ["departamento", "tipo_de_contrato", "estado_contrato"]

# Variables numéricas
numeric_cols = ["plazo_de_ejec_del_contrato", "valor_del_contrato_num"]

# Verificar qué columnas existen
available_cat = [c for c in categorical_cols if c in df.columns]
available_num = [c for c in numeric_cols if c in df.columns]

print(f"Categóricas disponibles: {available_cat}")
print(f"Numéricas disponibles: {available_num}")

Categóricas disponibles: ['departamento', 'tipo_de_contrato', 'estado_contrato']
Numéricas disponibles: ['valor_del_contrato_num']


In [4]:
# %%
# Limpiar datos: eliminar nulos
df_clean = df.dropna(subset=available_cat + available_num)
print(f"Registros después de limpiar nulos: {df_clean.count():,}")

Registros después de limpiar nulos: 1,000


In [5]:
# Convierte strings a índices numéricos
indexers = [
    StringIndexer(inputCol=col, outputCol=col + "_idx", handleInvalid="keep")
    for col in available_cat
]

In [7]:
print("StringIndexers creados:")
for idx in indexers:
    print(f"  - {idx.getInputCol()} -> {idx.getOutputCol()}")

# %%

StringIndexers creados:
  - departamento -> departamento_idx
  - tipo_de_contrato -> tipo_de_contrato_idx
  - estado_contrato -> estado_contrato_idx


In [10]:
# PASO 2: OneHotEncoder para generar variables dummy
encoders = [
    OneHotEncoder(inputCol=col + "_idx", outputCol=col + "_vec")
    for col in available_cat
]

In [11]:
print("\nOneHotEncoders creados:")
for enc in encoders:
    print(f"  - {enc.getInputCol()} -> {enc.getOutputCol()}")


OneHotEncoders creados:
  - departamento_idx -> departamento_vec
  - tipo_de_contrato_idx -> tipo_de_contrato_vec
  - estado_contrato_idx -> estado_contrato_vec


In [14]:
# PASO 3: VectorAssembler para combinar todas las features
# Combinamos: features numéricas + features categóricas codificadas
feature_cols = available_num + [col + "_vec" for col in available_cat]

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features_raw"
)

print(f"\nVectorAssembler combinará: {feature_cols}")


VectorAssembler combinará: ['valor_del_contrato_num', 'departamento_vec', 'tipo_de_contrato_vec', 'estado_contrato_vec']


In [15]:
# PASO 4: Construir Pipeline
# Pipeline = secuencia de transformaciones
pipeline_stages = indexers + encoders + [assembler]

pipeline = Pipeline(stages=pipeline_stages)

print(f"\nPipeline con {len(pipeline_stages)} stages:")
for i, stage in enumerate(pipeline_stages):
    print(f"  Stage {i+1}: {type(stage).__name__}")


Pipeline con 7 stages:
  Stage 1: StringIndexer
  Stage 2: StringIndexer
  Stage 3: StringIndexer
  Stage 4: OneHotEncoder
  Stage 5: OneHotEncoder
  Stage 6: OneHotEncoder
  Stage 7: VectorAssembler


In [16]:
# PASO 5: Entrenar el pipeline (fit)
# Nota: StringIndexer y OneHotEncoder necesitan "aprender" del dataset
print("\nEntrenando pipeline...")
pipeline_model = pipeline.fit(df_clean)
print("Pipeline entrenado exitosamente")


Entrenando pipeline...
Pipeline entrenado exitosamente


In [17]:

# %%
# PASO 6: Aplicar transformaciones (transform)
df_transformed = pipeline_model.transform(df_clean)

print("\nTransformación completada")
print(f"Columnas después de transformar: {len(df_transformed.columns)}")



Transformación completada
Columnas después de transformar: 20


In [20]:
df_transformed.select("features_raw").printSchema()

root
 |-- features_raw: vector (nullable = true)



In [21]:
df_transformed.select("features_raw").show()

+--------------------+
|        features_raw|
+--------------------+
|(54,[0,1,32,50],[...|
|(54,[0,28,32,45],...|
|(54,[0,1,32,44],[...|
|(54,[0,11,32,44],...|
|(54,[0,1,32,44],[...|
|(54,[0,14,32,46],...|
|(54,[0,1,32,48],[...|
|(54,[0,1,32,46],[...|
|(54,[0,1,32,46],[...|
|(54,[0,2,32,46],[...|
|(54,[0,2,32,45],[...|
|(54,[0,1,32,46],[...|
|(54,[0,6,32,46],[...|
|(54,[0,1,32,44],[...|
|(54,[2,32,49],[1....|
|(54,[0,8,32,46],[...|
|(54,[0,8,32,44],[...|
|(54,[0,16,32,48],...|
|(54,[0,1,32,46],[...|
|(54,[0,1,32,46],[...|
+--------------------+
only showing top 20 rows



In [22]:
# Ver dimensión del vector de features
sample_features = df_transformed.select("features_raw").first()[0]
print(f"Dimensión del vector de features: {len(sample_features)}")

# %%
# Mostrar ejemplo de transformación
df_transformed.select(
    available_cat[0] if available_cat else "id",
    available_cat[0] + "_idx" if available_cat else "id",
    available_cat[0] + "_vec" if available_cat else "id",
    "features_raw"
).show(5, truncate=True)

Dimensión del vector de features: 54
+--------------------+----------------+----------------+--------------------+
|        departamento|departamento_idx|departamento_vec|        features_raw|
+--------------------+----------------+----------------+--------------------+
|Distrito Capital ...|             0.0|  (31,[0],[1.0])|(54,[0,1,32,50],[...|
|              Arauca|            27.0| (31,[27],[1.0])|(54,[0,28,32,45],...|
|Distrito Capital ...|             0.0|  (31,[0],[1.0])|(54,[0,1,32,44],[...|
|                Meta|            10.0| (31,[10],[1.0])|(54,[0,11,32,44],...|
|Distrito Capital ...|             0.0|  (31,[0],[1.0])|(54,[0,1,32,44],[...|
+--------------------+----------------+----------------+--------------------+
only showing top 5 rows



In [23]:
# %%
# Guardar pipeline entrenado
pipeline_path = "/opt/spark-data/processed/feature_pipeline"
pipeline_model.save(pipeline_path)
print(f"\nPipeline guardado en: {pipeline_path}")

# %%
# Guardar dataset transformado
output_path = "/opt/spark-data/processed/secop_features.parquet"
df_transformed.write.mode("overwrite").parquet(output_path)
print(f"Dataset transformado guardado en: {output_path}")

26/01/29 01:39:37 WARN FileUtil: Failed to delete file or dir [/opt/spark-data/processed/feature_pipeline/metadata/_temporary]: it still exists.
26/01/29 01:39:42 WARN FileUtil: Failed to delete file or dir [/opt/spark-data/processed/feature_pipeline/stages/2_StringIndexer_33cedbd26317/metadata/_temporary]: it still exists.
26/01/29 01:39:44 WARN FileUtil: Failed to delete file or dir [/opt/spark-data/processed/feature_pipeline/stages/3_OneHotEncoder_b31de78142b9/metadata/_temporary]: it still exists.
26/01/29 01:39:46 WARN FileUtil: Failed to delete file or dir [/opt/spark-data/processed/feature_pipeline/stages/4_OneHotEncoder_352fd5ae0221/metadata/_temporary]: it still exists.
26/01/29 01:39:48 WARN FileUtil: Failed to delete file or dir [/opt/spark-data/processed/feature_pipeline/stages/5_OneHotEncoder_9bef111cff42/metadata/_temporary]: it still exists.
26/01/29 01:39:49 WARN FileUtil: Failed to delete file or dir [/opt/spark-data/processed/feature_pipeline/stages/5_OneHotEncoder_9b


Pipeline guardado en: /opt/spark-data/processed/feature_pipeline


                                                                                

Dataset transformado guardado en: /opt/spark-data/processed/secop_features.parquet
