##DAY 12: MLflow Basics & Machine Learning

In [0]:
spark.sql("USE CATALOG ecommerce_prod")
from pyspark.sql import functions as F

print("=== DAY 12: MLflow Basics & Machine Learning ===\n")

=== DAY 12: MLflow Basics & Machine Learning ===



## 1: Load & Combine Data (Bronze Layer)

In [0]:
print("1. LOADING & COMBINING DATA (Bronze Layer)")

# Load October and November data
oct_df = spark.read.options(
    header=True, 
    inferSchema=True
).csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv")

nov_df = spark.read.options(
    header=True, 
    inferSchema=True
).csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")

# Combine datasets
raw_df = oct_df.unionByName(nov_df)

print(f"✓ October data: {oct_df.count():,} rows")
print(f"✓ November data: {nov_df.count():,} rows")
print(f"✓ Combined data: {raw_df.count():,} rows")

# Create temp view
raw_df.createOrReplaceTempView("raw_events")
print("✓ Created temp view: raw_events")

1. LOADING & COMBINING DATA (Bronze Layer)
✓ October data: 42,448,764 rows
✓ November data: 67,501,979 rows
✓ Combined data: 109,950,743 rows
✓ Created temp view: raw_events


##2: Feature Engineering (Gold Layer)

In [0]:
print("\n2. FEATURE ENGINEERING (Gold Layer)")

# Create gold schema if not exists
spark.sql("CREATE SCHEMA IF NOT EXISTS gold")

# Aggregate product-level features for ML
gold_df = raw_df.groupBy("product_id").agg(
    F.count("event_type").alias("views"),
    F.sum((F.col("event_type") == 'cart').cast('int')).alias("cart_adds"),
    F.sum((F.col("event_type") == 'purchase').cast('int')).alias("purchases")
)

# Save to gold layer
gold_df.write.format("delta").mode("overwrite").saveAsTable("gold.product_ml_features")
print("✓ Created gold.product_ml_features table")
print(f"✓ Products with features: {gold_df.count():,}")

# Show sample features
print("\nSample of product features:")
gold_df.show(10)


2. FEATURE ENGINEERING (Gold Layer)
✓ Created gold.product_ml_features table
✓ Products with features: 206,876

Sample of product features:
+----------+------+---------+---------+
|product_id| views|cart_adds|purchases|
+----------+------+---------+---------+
|   1005159|178676|    10642|     3663|
|   5701087|  6734|       43|      100|
|  26402159|   269|        1|        3|
|  27300009|  1107|       19|        7|
|   8500290|  2664|      169|       55|
|   6902812|   881|       17|        8|
|  15200176|   714|       26|        8|
|  13800287|   334|       12|        1|
|  22500128|   179|        5|        1|
|   9800341|  1668|       58|       21|
+----------+------+---------+---------+
only showing top 10 rows


##3: Prepare Training Data

In [0]:
print("\n3. PREPARING TRAINING DATA")

# Load features from gold layer
ml_df = spark.table("gold.product_ml_features").toPandas()

# Prepare features (X) and target (y)
X = ml_df[["views", "cart_adds"]]
y = ml_df["purchases"]

print(f"✓ Total samples: {len(ml_df):,}")
print(f"✓ Features shape: {X.shape}")
print(f"✓ Target shape: {y.shape}")

# Basic statistics
print("\nFeature Statistics:")
print(f"Views - Mean: {X['views'].mean():.0f}, Max: {X['views'].max():,}")
print(f"Cart adds - Mean: {X['cart_adds'].mean():.0f}, Max: {X['cart_adds'].max():,}")
print(f"Purchases - Mean: {y.mean():.0f}, Max: {y.max():,}")


3. PREPARING TRAINING DATA
✓ Total samples: 206,876
✓ Features shape: (206876, 2)
✓ Target shape: (206876,)

Feature Statistics:
Views - Mean: 531, Max: 1,136,760
Cart adds - Mean: 19, Max: 133,328
Purchases - Mean: 8, Max: 61,265


##4: Train Models with MLflow Tracking

In [0]:
print("\n4. TRAINING MODELS WITH MLFLOW TRACKING")

import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Set up MLflow experiment
mlflow.set_experiment("/Shared/Day12_MLflow_Regression")
print("✓ MLflow experiment set: /Shared/Day12_MLflow_Regression")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"✓ Training samples: {X_train.shape[0]:,}")
print(f"✓ Test samples: {X_test.shape[0]:,}")


4. TRAINING MODELS WITH MLFLOW TRACKING


2026/01/20 12:12:52 INFO mlflow.tracking.fluent: Experiment with name '/Shared/Day12_MLflow_Regression' does not exist. Creating a new experiment.


✓ MLflow experiment set: /Shared/Day12_MLflow_Regression
✓ Training samples: 165,500
✓ Test samples: 41,376


##4a: Train Linear Regression

In [0]:
print("\n4a. TRAINING LINEAR REGRESSION")

with mlflow.start_run(run_name="linear_regression_v1"):
    # Log parameters
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("test_size", 0.2)
    mlflow.log_param("random_state", 42)
    
    # Train model
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    
    # Evaluate
    lr_score = lr_model.score(X_test, y_test)
    mlflow.log_metric("r2_score", lr_score)
    
    # Log model with input example
    input_example = X_train.iloc[:5]
    mlflow.sklearn.log_model(
        lr_model, 
        "linear_regression_model",
        input_example=input_example
    )
    
    print(f"✓ Linear Regression R² Score: {lr_score:.4f}")



4a. TRAINING LINEAR REGRESSION




✓ Linear Regression R² Score: 0.9725


##4b: Train Random Forest (Comparison)


In [0]:
print("\n4b. TRAINING RANDOM FOREST (Comparison)")

with mlflow.start_run(run_name="random_forest_v1"):
    # Log parameters
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_param("test_size", 0.2)
    mlflow.log_param("random_state", 42)
    mlflow.log_param("n_estimators", 100)
    
    # Train model
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    # Evaluate
    rf_score = rf_model.score(X_test, y_test)
    mlflow.log_metric("r2_score", rf_score)
    
    # Log model with input example
    input_example = X_train.iloc[:5]
    mlflow.sklearn.log_model(
        rf_model, 
        "random_forest_model",
        input_example=input_example
    )
    
    print(f"✓ Random Forest R² Score: {rf_score:.4f}")


4b. TRAINING RANDOM FOREST (Comparison)




✓ Random Forest R² Score: 0.9670


## 5: Model Comparison & Analysis

In [0]:
print("\n5. MODEL COMPARISON & ANALYSIS")

# Compare performance
print(f"\n{'='*50}")
print("MODEL COMPARISON RESULTS:")
print(f"{'='*50}")
print(f"Linear Regression R²: {lr_score:.4f}")
print(f"Random Forest R²:     {rf_score:.4f}")
print(f"{'='*50}")

if rf_score > lr_score:
    improvement = ((rf_score - lr_score) / lr_score) * 100
    print(f"Random Forest is {improvement:.1f}% better")
else:
    improvement = ((lr_score - rf_score) / rf_score) * 100
    print(f"Linear Regression is {improvement:.1f}% better")

# Feature importance (Random Forest)
if hasattr(rf_model, 'feature_importances_'):
    print("\nRandom Forest Feature Importance:")
    for feature, importance in zip(X.columns, rf_model.feature_importances_):
        print(f"  {feature}: {importance:.3f}")



5. MODEL COMPARISON & ANALYSIS

MODEL COMPARISON RESULTS:
Linear Regression R²: 0.9725
Random Forest R²:     0.9670
Linear Regression is 0.6% better

Random Forest Feature Importance:
  views: 0.268
  cart_adds: 0.732


##6: Generate Predictions & Insights

In [0]:
print("\n6. GENERATING PREDICTIONS & INSIGHTS")

# Make predictions on test set
y_pred_lr = lr_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)

# Show sample predictions
print("Sample Predictions (first 5 test samples):")
print(f"{'Views':>10} {'Cart Adds':>10} {'Actual':>10} {'LR Pred':>10} {'RF Pred':>10}")
print("-" * 60)

for i in range(min(5, len(X_test))):
    print(f"{X_test.iloc[i,0]:>10} {X_test.iloc[i,1]:>10} {y_test.iloc[i]:>10.0f} "
          f"{y_pred_lr[i]:>10.0f} {y_pred_rf[i]:>10.0f}")


6. GENERATING PREDICTIONS & INSIGHTS
Sample Predictions (first 5 test samples):
     Views  Cart Adds     Actual    LR Pred    RF Pred
------------------------------------------------------------
        61          0          0          0          0
         1          0          0          0          0
        43          0          0          0          0
        65          2          0          1          0
       325          3          1          1          2
