In [0]:
# import findspark
# findspark.init('/spark-3.5.1-bin-hadoop3')
from pyspark import *
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType, TimestampType, LongType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DecimalType
from pyspark.sql.functions import regexp_extract, split, from_unixtime, col, avg, min, max, desc, isnan, when, count
from pyspark.sql.functions import grouping, explode, array_contains
from pyspark.sql.functions import mean, stddev, skewness, kurtosis
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, Normalizer, Imputer
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import matplotlib.pyplot as plt
import pandas as pd
import random


# spark = SparkSession.builder.appName("analytics").getOrCreate()

# Pipelines de Spark MLlib

![Pipeline Spark MLlib](pipeline.png)


## Ingestión de Datos:

Los datos se cargan en un DataFrame.

In [0]:
# Load the dataset into a DataFrame.
# NOTE(review): `spark` is not created anywhere in this notebook (the builder
# call in the imports cell is commented out) — presumably the runtime provides
# the session; confirm before running outside that environment.
data_path = "data/sf-airbnb-clean.parquet"

# Parquet files embed their own schema and have no header row, so the CSV-only
# `header` / `inferSchema` options were dropped from this call.
df = spark.read.parquet(data_path)

## Exploración de Datos:

* df.describe(): Proporciona un resumen estadístico de los datos.
* df.show(): Muestra las primeras filas del DataFrame para inspección visual.


In [0]:
# Data exploration: summary statistics.
# Only two columns are described; the output is unreadable otherwise.
summary_columns = ["bedrooms", "review_scores_rating"]
df.select(*summary_columns).describe().show()

In [0]:
# Data exploration: number of missing values per column.
# A value counts as missing when it is null or, for floating-point columns, NaN.
# isnan() is only applied to float/double columns so that other column types
# are not implicitly cast just to be tested for NaN.
floating_columns = {name for name, dtype in df.dtypes if dtype in ("float", "double")}

missing_counts = [
    count(
        when(
            (col(c).isNull() | isnan(col(c))) if c in floating_columns else col(c).isNull(),
            c,
        )
    ).alias(c)
    for c in df.columns
]

df.select(missing_counts).show()

In [0]:
# Data exploration: print the first five rows for a visual check.
df.show(n=5)

## Limpieza de Datos

* df.dropna(): Elimina filas con valores nulos para asegurar la integridad de los datos.
* df.fillna(): Rellena valores nulos con un valor específico, como la mediana o media de la columna.

## Imputación de Valores Faltantes

* [Imputer](https://spark.apache.org/docs/3.1.2/api/python/reference/api/pyspark.ml.feature.Imputer.html): Sustituye valores faltantes en las columnas seleccionadas con la mediana o media de la columna.

In [0]:
# Data cleaning: drop every row that has a missing value in any column.
df = df.dropna(how="any")

# Missing-value imputation: each listed column gets a "<name>_imputed" twin
# filled with the column median.
columns_to_impute = [
    "bedrooms", "bathrooms", "beds", "review_scores_rating",
    "review_scores_accuracy", "review_scores_cleanliness",
    "review_scores_checkin", "review_scores_communication",
    "review_scores_location", "review_scores_value",
]
imputer = Imputer(
    inputCols=columns_to_impute,
    outputCols=[f"{name}_imputed" for name in columns_to_impute],
    strategy="median",
)

# Preview only: `df` is deliberately NOT overwritten with the imputed columns.
# The same `imputer` object is a stage of the Pipeline built later in this
# notebook, which fits it on the training split and produces the "*_imputed"
# columns itself. Fitting it here on the full dataset and keeping the result
# would apply the stage twice and use rows that later end up in the test split.
imputer.fit(df).transform(df).show(5)

## StringIndexer

Convierte cadenas de caracteres en índices numéricos, facilitando la manipulación de datos categóricos.

In [0]:
# Categorical transformations: one StringIndexer per categorical column.
def _make_indexer(column):
    """Build a StringIndexer that maps `column` to "<column>_indexed".

    handleInvalid="keep" assigns labels that were not seen during fit to an
    extra index instead of raising. With the default ("error"), transforming
    the test split fails whenever a rare category only appears in that split.
    """
    return StringIndexer(
        inputCol=column,
        outputCol=f"{column}_indexed",
        handleInvalid="keep",
    )


indexer_host_is_superhost = _make_indexer("host_is_superhost")
indexer_cancellation_policy = _make_indexer("cancellation_policy")
indexer_instant_bookable = _make_indexer("instant_bookable")
indexer_neighbourhood_cleansed = _make_indexer("neighbourhood_cleansed")
indexer_property_type = _make_indexer("property_type")
indexer_room_type = _make_indexer("room_type")
indexer_bed_type = _make_indexer("bed_type")


## OneHotEncoder

Convierte los índices de categorías generados por StringIndexer en vectores binarios (one-hot), de modo que el modelo no interprete esos índices como valores ordenados.

In [0]:
# One OneHotEncoder per categorical column: "<name>_indexed" -> "<name>_encoded".
# The tuple on the left is unpacked in the same order as `categorical_names`.
categorical_names = [
    "host_is_superhost",
    "cancellation_policy",
    "instant_bookable",
    "neighbourhood_cleansed",
    "property_type",
    "room_type",
    "bed_type",
]

(
    encoder_host_is_superhost,
    encoder_cancellation_policy,
    encoder_instant_bookable,
    encoder_neighbourhood_cleansed,
    encoder_property_type,
    encoder_room_type,
    encoder_bed_type,
) = [
    OneHotEncoder(inputCol=f"{name}_indexed", outputCol=f"{name}_encoded")
    for name in categorical_names
]

## VectorAssembler

Combina varias columnas de características en un solo vector de características, esencial para el entrenamiento del modelo.

In [0]:
# VectorAssembler: concatenate every model input into a single "features" vector.
# The three groups are joined in this order: imputed numerics, raw numerics,
# one-hot encoded categoricals.
imputed_inputs = [
    f"{name}_imputed"
    for name in (
        "bedrooms", "bathrooms", "beds",
        "review_scores_rating", "review_scores_accuracy",
        "review_scores_cleanliness", "review_scores_checkin",
        "review_scores_communication", "review_scores_location",
        "review_scores_value",
    )
]
raw_numeric_inputs = [
    "host_total_listings_count", "latitude", "longitude",
    "accommodates", "minimum_nights", "number_of_reviews",
]
encoded_inputs = [
    f"{name}_encoded"
    for name in (
        "host_is_superhost", "cancellation_policy", "instant_bookable",
        "neighbourhood_cleansed", "property_type", "room_type", "bed_type",
    )
]

assembler = VectorAssembler(
    inputCols=imputed_inputs + raw_numeric_inputs + encoded_inputs,
    outputCol="features",
)

## Normalizer

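Normaliza cada vector de características (es decir, cada fila) para que tenga norma unitaria, usando la norma L2 por defecto. No reescala cada columna de forma independiente; para llevar las características a una escala común se usaría un StandardScaler o un MinMaxScaler.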

In [0]:
# Normalizer stage: reads the assembled "features" vector and writes "normFeatures".
normalizer = Normalizer().setInputCol("features").setOutputCol("normFeatures")


## RandomForestRegressor

Entrena el modelo utilizando el algoritmo de Random Forest, ideal para tareas de regresión debido a su capacidad para manejar grandes conjuntos de datos y su robustez frente a datos ruidosos.

In [0]:
# Random forest regressor trained on the normalized feature vector to predict "price".
# A fixed seed makes the trained forest reproducible when the notebook is
# re-run on the same training data.
rf = RandomForestRegressor(featuresCol="normFeatures", labelCol="price", seed=42)

## Creación del Pipeline

Crea un pipeline que incluye todos los pasos anteriores.


In [0]:
# Build the pipeline. Stage order: imputer, all string indexers, all one-hot
# encoders, vector assembler, normalizer, random forest.
indexer_stages = [
    indexer_host_is_superhost,
    indexer_cancellation_policy,
    indexer_instant_bookable,
    indexer_neighbourhood_cleansed,
    indexer_property_type,
    indexer_room_type,
    indexer_bed_type,
]
encoder_stages = [
    encoder_host_is_superhost,
    encoder_cancellation_policy,
    encoder_instant_bookable,
    encoder_neighbourhood_cleansed,
    encoder_property_type,
    encoder_room_type,
    encoder_bed_type,
]

pipeline = Pipeline(
    stages=[imputer, *indexer_stages, *encoder_stages, assembler, normalizer, rf]
)


## Entrenamiento del modelo

* Divide los datos en conjuntos de entrenamiento y prueba.
* Entrena el modelo con los datos de entrenamiento

In [0]:
# Split the data into training (80%) and test (20%) sets.
# The seed keeps the split reproducible across re-runs on the same input
# DataFrame; without it every run trains and evaluates on different rows.
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

# Fit every pipeline stage on the training split only.
model = pipeline.fit(train_data)

## Evaluación del modelo

* Realiza predicciones en el conjunto de prueba.
* Evalúa el modelo utilizando la métrica de RMSE.

In [0]:
# Score the held-out test split with the fitted pipeline.
predictions = model.transform(test_data)

# Compare the "prediction" column against the "price" label using RMSE.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="price",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)

print(f"Root Mean Squared Error (RMSE) en el conjunto de prueba: {rmse}")


## Guardar y cargar el modelo

In [0]:
from pyspark.ml import PipelineModel

# Save the fitted pipeline, overwriting any previous copy at the same path.
# The fitted pipeline is bound to `model` (result of `pipeline.fit`); the
# previously referenced name `pipelineModel` is never defined in this notebook
# and raised a NameError.
pipelinePath = "data/lr-pipeline-model"
model.write().overwrite().save(pipelinePath)

# Load the saved pipeline back from disk.
savedPipelineModel = PipelineModel.load(pipelinePath)