### Day 13

#### Learning Agenda - Spark ML Pipelines
Compare scikit-learn models with a Spark ML pipeline model.

In [0]:
# sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# MLflow libraries
import mlflow
import mlflow.sklearn

# Spark ML Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler # Spark ML models expect a single feature vector combined from all the features
from pyspark.ml.regression import LinearRegression as SparkLR

Load data as a Pandas DataFrame

In [0]:
# Some platforms (e.g. Databricks) predefine `spark`, but a plain kernel has no
# session at this point. getOrCreate() returns the active session if one exists
# and otherwise starts one, so this cell survives Restart & Run All either way.
spark = (
    SparkSession.builder
    .appName("Day13_ML_Pipeline")
    .getOrCreate()
)

# Read the table, keep only rows whose total_revenue is not null, then
# convert the result to a pandas DataFrame for use with scikit-learn.
df = (
    spark.table("ecommerce_catalog.gold.category_performance")
    .filter(col("total_revenue").isNotNull())
    .toPandas()
)


Define features & target

In [0]:
# Target: the revenue column the models will learn to predict.
y = df["total_revenue"]

# Features: the three unique_* count columns, kept as a DataFrame.
X = df[
    [
        "unique_views",
        "unique_carts",
        "unique_purchases",
    ]
]

Split data into train & test

In [0]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [0]:
# Candidate scikit-learn regressors, keyed by the name used for their MLflow runs.
# The tree-based estimators are randomized, so random_state is pinned (same seed
# as the train/test split) to keep their R² scores repeatable across reruns.
models = {
    "linear": LinearRegression(),
    "decision_tree": DecisionTreeRegressor(max_depth=5, random_state=42),
    "random_forest": RandomForestRegressor(n_estimators=100, random_state=42),
}


Train each model and track it with MLflow

In [0]:
# One MLflow run per candidate: record which model it is, fit on the training
# split, score on the held-out split, then log the metric and the fitted model.
for model_name, estimator in models.items():
    run_label = f"{model_name}_model"
    with mlflow.start_run(run_name=run_label):
        mlflow.log_param("model_type", model_name)

        estimator.fit(X_train, y_train)
        holdout_r2 = estimator.score(X_test, y_test)

        mlflow.log_metric("r2_score", holdout_r2)
        mlflow.sklearn.log_model(estimator, "model")

        print(f"{model_name}: R² = {holdout_r2:.4f}")




linear: R² = 0.6927




decision_tree: R² = 0.2381




random_forest: R² = 0.1638


Spark Session (if not already running)

In [0]:
# Reuse the active Spark session when there is one; otherwise start a new
# session named "Day13_ML_Pipeline".
spark = (
    SparkSession.builder
    .appName("Day13_ML_Pipeline")
    .getOrCreate()
)


Load Spark Table (Spark ML pipelines need Spark DataFrames)

In [0]:
# Same table as the pandas load, kept as a Spark DataFrame; rows with a null
# total_revenue are dropped because that column is the regression label.
spark_df = (
    spark.table("ecommerce_catalog.gold.category_performance")
    .where(col("total_revenue").isNotNull())
)

Split Spark Data

In [0]:
# 80/20 random split with a fixed seed. This split is drawn independently of
# the sklearn train_test_split, so the two test sets contain different rows.
train, test = spark_df.randomSplit(weights=[0.8, 0.2], seed=42)


Define Spark ML Pipeline Stages

In [0]:
# Stage 1: pack the three numeric input columns into a single vector column "features".
assembler = VectorAssembler(
    inputCols=["unique_views", "unique_carts", "unique_purchases"],
    outputCol="features",
)

# Stage 2: Spark linear regression that reads "features" and fits against "total_revenue".
lr = SparkLR(
    labelCol="total_revenue",
    featuresCol="features",
)

# Chain the stages in order: assembler first, then the regressor.
pipeline = Pipeline(stages=[assembler, lr])

Fit Spark Pipeline & produce a PipelineModel (Transformer)

In [0]:
# Fit every pipeline stage on the training split; the result is the fitted pipeline model.
model = pipeline.fit(dataset=train)


Make Predictions (Spark)

In [0]:
# Run the fitted pipeline over the test split, then preview the first five rows.
predictions = model.transform(dataset=test)
predictions.show(n=5)


+--------------------+------------+------------+----------------+------------------+----------------------+-------------------+------------------+
|       category_code|unique_views|unique_carts|unique_purchases|     total_revenue|cart_to_purchase_ratio|           features|        prediction|
+--------------------+------------+------------+----------------+------------------+----------------------+-------------------+------------------+
|       apparel.jeans|       31426|           0|             137|           6163.23|                   0.0|[31426.0,0.0,137.0]|187998.26583018666|
|       apparel.shirt|        8815|           0|              35|1940.5700000000002|                   0.0|  [8815.0,0.0,35.0]| 40562.44919187637|
|apparel.shoes.bal...|        1596|           0|              11|314.55999999999995|                   0.0|  [1596.0,0.0,11.0]|-7459.886155910646|
|    apparel.trousers|       10737|           0|              22|           1102.31|                   0.0| [10737.0,0

Evaluate Spark Model

In [0]:
# RegressionEvaluator is imported in the notebook's top import cell.
# Score the Spark predictions with R², comparing the "prediction" column
# against the "total_revenue" label. The Spark test rows come from randomSplit,
# not from the sklearn train_test_split, so this R² is computed on a different
# hold-out set than the sklearn scores.
evaluator = RegressionEvaluator(
    labelCol="total_revenue",
    predictionCol="prediction",
    metricName="r2",
)
r2 = evaluator.evaluate(predictions)
print(f"Spark Pipeline R² = {r2:.4f}")


Spark Pipeline R² = 0.6736
