In [0]:
# =============================================
# DAY 13: Model Comparison & Spark ML Pipelines
# =============================================

# Setup: bring in the Spark SQL column helpers (used as `F` in later cells),
# then make the e-commerce catalog the session's current catalog.
from pyspark.sql import functions as F

spark.sql("USE CATALOG ecommerce_prod")

notebook_title = "DAY 13: Model Comparison & Spark ML Pipelines"
print(f"=== {notebook_title} ===\n")

=== DAY 13: Model Comparison & Spark ML Pipelines ===



## 1: Create ML Catalog & Volume (Unity Catalog)

In [0]:
print("1. CREATING ML CATALOG & VOLUME (Unity Catalog)")

# Create the ML catalog, its schema and the MLflow scratch volume using fully
# qualified names. Nothing here changes the session's current catalog, so a
# failure part-way through cannot leave later cells pointed at ml_catalog.
spark.sql("CREATE CATALOG IF NOT EXISTS ml_catalog")
spark.sql("CREATE SCHEMA IF NOT EXISTS ml_catalog.ml_schema")
spark.sql("CREATE VOLUME IF NOT EXISTS ml_catalog.ml_schema.mlflow_tmp")

# Make sure the data catalog is current for the cells that follow. This runs
# before the prints so the empty DataFrame returned by spark.sql is not the
# cell's last expression (which would be echoed as "DataFrame[]").
spark.sql("USE CATALOG ecommerce_prod")

print("✓ Created ml_catalog.ml_schema")
print("✓ Created volume: ml_schema.mlflow_tmp")

1. CREATING ML CATALOG & VOLUME (Unity Catalog)
✓ Created ml_catalog.ml_schema
✓ Created volume: ml_schema.mlflow_tmp


DataFrame[]

## 2: Load & Prepare Training Data (Scikit-learn)

In [0]:
from sklearn.model_selection import train_test_split

print("\n2. PREPARING TRAINING DATA (Scikit-learn)")

# Single source of truth for the modelling columns, so the selection below and
# the printed summary cannot drift apart.
FEATURE_COLS = ["views", "cart_adds"]
TARGET_COL = "purchases"

# Load product ML features; toPandas() collects the whole table to the driver.
df = spark.table("gold.product_ml_features").toPandas()

# Prepare features and target
X = df[FEATURE_COLS]
y = df[TARGET_COL]

print(f"✓ Total samples: {len(df):,}")
print(f"✓ Features: {', '.join(FEATURE_COLS)}")
print(f"✓ Target: {TARGET_COL}")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"✓ Training samples: {X_train.shape[0]:,}")
print(f"✓ Test samples: {X_test.shape[0]:,}")

# Convert to float64 to avoid MLflow schema warnings
X_train_f = X_train.astype("float64")
X_test_f = X_test.astype("float64")


2. PREPARING TRAINING DATA (Scikit-learn)
✓ Total samples: 206,876
✓ Features: views, cart_adds
✓ Target: purchases
✓ Training samples: 165,500
✓ Test samples: 41,376


## 3: MLflow Setup for Model Comparison

In [0]:
print("\n3. SETTING UP MLFLOW EXPERIMENT")

import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

# Keep the experiment path in one place so the confirmation message always
# reports the experiment that was actually selected.
EXPERIMENT_PATH = "/Shared/Day13_Model_Comparison"

mlflow.set_experiment(EXPERIMENT_PATH)
print(f"✓ MLflow experiment set: {EXPERIMENT_PATH}")




3. SETTING UP MLFLOW EXPERIMENT
✓ MLflow experiment set: /Shared/Day13_Model_Comparison


## 4: Train & Compare 3 Scikit-learn Models

In [0]:

print("\n4. TRAINING & COMPARING 3 MODELS (Scikit-learn)")

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Candidate estimators, keyed by the name used for the MLflow run
models = {
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(max_depth=5, random_state=42),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42)
}

# Hyperparameters to record per model. The values are read back from the
# estimator itself, so the logged value always matches what was trained even
# if a constructor argument above is changed.
tracked_hyperparams = {
    "DecisionTree": ("max_depth",),
    "RandomForest": ("n_estimators",),
}

# Test-set R² per model name; consumed by the comparison cells further down
results = {}

# The same five training rows serve as signature input and logged example for
# every model, so build them once outside the loop.
input_example = X_train_f.iloc[:5]

# Train and log each model in its own MLflow run
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")

    with mlflow.start_run(run_name=model_name):
        mlflow.log_param("model_type", model_name)
        # Settings of the train/test split these models are evaluated on
        # (NOTE(review): must mirror the train_test_split call — keep in sync).
        mlflow.log_param("test_size", 0.2)
        mlflow.log_param("random_state", 42)

        estimator_params = model.get_params()
        for param_name in tracked_hyperparams.get(model_name, ()):
            mlflow.log_param(param_name, estimator_params[param_name])

        # Train model
        model.fit(X_train_f, y_train)

        # Evaluate on the held-out set. Named test_r2 rather than r2_score to
        # avoid shadowing the scikit-learn metric function of that name.
        test_r2 = model.score(X_test_f, y_test)
        mlflow.log_metric("r2_score", test_r2)
        results[model_name] = test_r2

        # Infer the model signature from example inputs and their predictions
        example_predictions = model.predict(input_example)
        signature = infer_signature(input_example, example_predictions)

        # Log model
        mlflow.sklearn.log_model(
            model,
            artifact_path=f"{model_name.lower()}_model",
            signature=signature,
            input_example=input_example
        )

        print(f"✓ {model_name} R² Score: {test_r2:.4f}")


4. TRAINING & COMPARING 3 MODELS (Scikit-learn)

Training LinearRegression...
✓ LinearRegression R² Score: 0.9725

Training DecisionTree...
✓ DecisionTree R² Score: 0.8380

Training RandomForest...
✓ RandomForest R² Score: 0.9670


## 5: Compare Model Performance

In [0]:
print("\n5. MODEL COMPARISON RESULTS")
print("="*50)
# Only the R² score is tabulated: training time is never measured in this
# notebook, so a "Training Time" column could only ever show a placeholder.
print(f"{'Model':<20} {'R² Score':<15}")
print("-"*50)

# Pick the model with the highest test-set R²
best_model = max(results, key=results.get)
best_score = results[best_model]

for model_name, score in results.items():
    print(f"{model_name:<20} {score:<15.4f}")

print("="*50)
print(f"✓ Best Model: {best_model} (R²: {best_score:.4f})")
print("="*50)


5. MODEL COMPARISON RESULTS
Model                R² Score        Training Time  
--------------------------------------------------
LinearRegression     0.9725          -              
DecisionTree         0.8380          -              
RandomForest         0.9670          -              
✓ Best Model: LinearRegression (R²: 0.9725)


## 6: Build Spark ML Pipeline (Scalable Version)

In [0]:
print("\n6. BUILDING SPARK ML PIPELINE (Scalable)")

from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression as SparkLR

# Read the feature table as a Spark DataFrame, keeping product_id and casting
# the numeric columns to double for the ML stages.
double_cols = ["views", "cart_adds", "purchases"]
spark_df = spark.table("gold.product_ml_features").select(
    F.col("product_id"),
    *[F.col(col_name).cast("double").alias(col_name) for col_name in double_cols]
)

print(f"✓ Spark DataFrame loaded: {spark_df.count():,} rows")

# Pipeline stages: assemble the two inputs into a feature vector, then fit a
# regularised linear regression on it.
assembler = VectorAssembler(inputCols=["views", "cart_adds"], outputCol="features")
spark_lr = SparkLR(
    featuresCol="features", labelCol="purchases", maxIter=10, regParam=0.01
)
pipeline = Pipeline(stages=[assembler, spark_lr])

# 80/20 train/test split with a fixed seed
train_df, test_df = spark_df.randomSplit([0.8, 0.2], seed=42)

print(f"✓ Training samples: {train_df.count():,}")
print(f"✓ Test samples: {test_df.count():,}")

# Fit every stage on the training split
spark_model = pipeline.fit(train_df)
print("✓ Spark ML Pipeline trained successfully")

# Score the held-out split and compute R² on it
spark_predictions = spark_model.transform(test_df)
evaluator = RegressionEvaluator(
    labelCol="purchases", predictionCol="prediction", metricName="r2"
)
spark_r2 = evaluator.evaluate(spark_predictions)
print(f"✓ Spark Linear Regression R²: {spark_r2:.4f}")



6. BUILDING SPARK ML PIPELINE (Scalable)
✓ Spark DataFrame loaded: 206,876 rows
✓ Training samples: 165,630
✓ Test samples: 41,246
✓ Spark ML Pipeline trained successfully
✓ Spark Linear Regression R²: 0.9908


## 7: Log Spark Model to MLflow (Serverless-Safe)

In [0]:
print("\n7. LOGGING SPARK MODEL TO MLFLOW")

# Import the Spark flavor explicitly rather than relying on attribute access
# on the top-level mlflow package.
import mlflow.spark

# Feature-only sample used for signature inference and as the input example
sample = spark_df.select(
    F.col("views").cast("double"),
    F.col("cart_adds").cast("double")
).limit(20)

# Collect the sample once. limit() has no ordering here, so collecting it
# separately for the signature and for the input example could, in principle,
# yield different rows.
sample_pdf = sample.toPandas()

# Get predictions for the sample (only the column schema matters below)
pred = spark_model.transform(sample).select("prediction")

# Infer signature
signature = infer_signature(sample_pdf, pred.toPandas())

# Log to MLflow with Unity Catalog volume
with mlflow.start_run(run_name="Spark_LR_Pipeline"):
    mlflow.log_param("model_type", "SparkLinearRegressionPipeline")
    # Read hyperparameters from the estimator that was trained so the logged
    # values cannot go stale if its configuration changes.
    mlflow.log_param("maxIter", spark_lr.getMaxIter())
    mlflow.log_param("regParam", spark_lr.getRegParam())
    mlflow.log_metric("r2_score", spark_r2)

    mlflow.spark.log_model(
        spark_model,
        artifact_path="spark_pipeline_model",
        signature=signature,
        input_example=sample_pdf,
        dfs_tmpdir="/Volumes/ml_catalog/ml_schema/mlflow_tmp"
    )

    print("✓ Spark ML Pipeline logged to MLflow")
    print("✓ Using Unity Catalog volume for temporary storage")



7. LOGGING SPARK MODEL TO MLFLOW


  "dataframe_split": {
    "columns": [
      "views",
      "cart_adds"
    ],
    "data": [
      [
        178676.0,
        10642.0
      ],
      [
        6734.0,
        43.0
      ],
      [
        269.0,
        1.0
      ],
      [
        1107.0,
        19.0
      ],
      [
        2664.0,
        169.0
      ],
      [
        881.0,
        17.0
      ],
      [
        714.0,
        26.0
      ],
      [
        334.0,
        12.0
      ],
      [
        179.0,
        5.0
      ],
      [
        1668.0,
        58.0
      ],
      [
        868.0,
        3.0
      ],
      [
        1915.0,
        45.0
      ],
      [
        3064.0,
        53.0
      ],
      [
        56.0,
        0.0
      ],
      [
        19.0,
        0.0
      ],
      [
        295.0,
        3.0
      ],
      [
        1475.0,
        60.0
      ],
      [
        992.0,
        12.0
      ],
      [
        402.0,
        2.0
      ],
      [
        74.0,
        1.0
      ]
    

✓ Spark ML Pipeline logged to MLflow
✓ Using Unity Catalog volume for temporary storage


## 8: Compare All Models

In [0]:
print("\n8. FINAL MODEL COMPARISON")
print("="*60)
print(f"{'Model Type':<25} {'Framework':<15} {'R² Score':<15}")
print("-"*60)

SPARK_MODEL_NAME = "SparkLinearRegression"

# Combine the scikit-learn scores with the Spark score in a new dict instead of
# mutating `results`; otherwise re-running the scikit-learn comparison cell
# after this one would list the Spark model among the scikit-learn candidates.
all_results = {**results, SPARK_MODEL_NAME: spark_r2}

for model_name, score in all_results.items():
    framework = "Spark ML" if model_name == SPARK_MODEL_NAME else "Scikit-learn"
    print(f"{model_name:<25} {framework:<15} {score:<15.4f}")

print("="*60)

# Determine best overall model.
# NOTE: the scikit-learn models are scored on the train_test_split hold-out
# while the Spark model is scored on its own randomSplit hold-out, so the R²
# values come from different test rows and are only roughly comparable.
overall_best = max(all_results, key=all_results.get)
print(f"✓ Overall Best Model: {overall_best} (R²: {all_results[overall_best]:.4f})")


8. FINAL MODEL COMPARISON
Model Type                Framework       R² Score       
------------------------------------------------------------
LinearRegression          Scikit-learn    0.9725         
DecisionTree              Scikit-learn    0.8380         
RandomForest              Scikit-learn    0.9670         
SparkLinearRegression     Spark ML        0.9908         
✓ Overall Best Model: SparkLinearRegression (R²: 0.9908)


## SUMMARY

In [0]:

# Recap of the notebook, printed as a banner followed by one tick per step.
banner = "=" * 60
completed_steps = [
    "1. Created ML catalog & volume in Unity Catalog",
    "2. Prepared training data (scikit-learn)",
    "3. Set up MLflow experiment for comparison",
    "4. Trained & compared 3 scikit-learn models",
    "5. Built scalable Spark ML pipeline",
    "6. Logged Spark model to MLflow with UC volume",
    "7. Compared all 4 models performance",
    f"8. Best model identified: {overall_best}",
]

print("\n" + banner)
print("DAY 13 COMPLETED: Model Comparison & Spark ML Pipelines")
print(banner)
for step in completed_steps:
    print(f"✓ {step}")
print(banner)


DAY 13 COMPLETED: Model Comparison & Spark ML Pipelines
✓ 1. Created ML catalog & volume in Unity Catalog
✓ 2. Prepared training data (scikit-learn)
✓ 3. Set up MLflow experiment for comparison
✓ 4. Trained & compared 3 scikit-learn models
✓ 5. Built scalable Spark ML pipeline
✓ 6. Logged Spark model to MLflow with UC volume
✓ 7. Compared all 4 models performance
✓ 8. Best model identified: SparkLinearRegression


In [0]:
# Check if predictions are reasonable on a handful of rows
sample_results = spark_model.transform(spark_df.limit(5))
sample_results.select("views", "cart_adds", "purchases", "prediction").show()

# Summarise predicted and actual ranges in a single Spark aggregation. Reading
# the actual range from the same DataFrame removes the dependency on the pandas
# copy (`y`) created in the scikit-learn section.
predictions = spark_model.transform(spark_df)
pred_stats = predictions.agg(
    F.min("prediction").alias("min_pred"),
    F.max("prediction").alias("max_pred"),
    F.avg("prediction").alias("avg_pred"),
    F.min("purchases").alias("min_actual"),
    F.max("purchases").alias("max_actual")
).collect()

# A global aggregate yields exactly one row
stats = pred_stats[0]

print(f"\nPrediction Range: {stats['min_pred']:.2f} to {stats['max_pred']:.2f}")
print(f"Average Prediction: {stats['avg_pred']:.2f}")
# purchases is stored as double in spark_df; print it without decimals
print(f"Actual Purchase Range: {stats['min_actual']:.0f} to {stats['max_actual']:.0f}")

+--------+---------+---------+------------------+
|   views|cart_adds|purchases|        prediction|
+--------+---------+---------+------------------+
|178676.0|  10642.0|   3663.0|4454.9553568537885|
|  6734.0|     43.0|    100.0| 9.271401167031895|
|   269.0|      1.0|      3.0|0.3424399207831458|
|  1107.0|     19.0|      7.0| 7.072282776367371|
|  2664.0|    169.0|     55.0|  71.3052441277684|
+--------+---------+---------+------------------+


Prediction Range: -44.93 to 57465.28
Average Prediction: 7.99
Actual Purchase Range: 0 to 61265
