In [0]:
from pyspark.sql import Row
from pyspark.sql import functions as F

data = [
    Row(user_id="u1", product_id="p1", event_type="view", event_time="2024-01-01 10:00:00", price=None, user_session="s1"),
    Row(user_id="u1", product_id="p1", event_type="purchase", event_time="2024-01-01 10:05:00", price=25.0, user_session="s1"),
    Row(user_id="u2", product_id="p2", event_type="view", event_time="2024-01-02 11:00:00", price=None, user_session="s2"),
    Row(user_id="u2", product_id="p2", event_type="purchase", event_time="2024-01-02 11:10:00", price=60.0, user_session="s2"),
    Row(user_id="u3", product_id="p3", event_type="view", event_time="2024-01-03 12:00:00", price=None, user_session="s3")
]

raw_df = spark.createDataFrame(data) \
    .withColumn("event_time", F.to_timestamp("event_time")) \
    .withColumn("ingestion_ts", F.current_timestamp())

raw_df.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("ecommerce.bronze.bronze_events")


In [0]:
%sql
SELECT * FROM ecommerce.bronze.bronze_events;


user_id,product_id,event_type,event_time,price,user_session,ingestion_ts
u1,p1,view,2024-01-01T10:00:00.000Z,,s1,2026-01-20T17:46:35.713Z
u1,p1,purchase,2024-01-01T10:05:00.000Z,25.0,s1,2026-01-20T17:46:35.713Z
u2,p2,view,2024-01-02T11:00:00.000Z,,s2,2026-01-20T17:46:35.713Z
u2,p2,purchase,2024-01-02T11:10:00.000Z,60.0,s2,2026-01-20T17:46:35.713Z
u3,p3,view,2024-01-03T12:00:00.000Z,,s3,2026-01-20T17:46:35.713Z


In [0]:
from pyspark.sql import functions as F

bronze_df = spark.table("ecommerce.bronze.bronze_events")

silver_df = (
    bronze_df
    .filter(F.col("user_id").isNotNull())
    .filter(F.col("event_time").isNotNull())
    .withColumn("price", F.coalesce(F.col("price"), F.lit(0.0)))
    .withColumn("event_date", F.to_date("event_time"))
    .withColumn(
        "price_tier",
        F.when(F.col("price") < 10, "budget")
         .when(F.col("price") < 50, "mid")
         .otherwise("premium")
    )
)

silver_df.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("ecommerce.silver.events")


In [0]:
%sql
SELECT COUNT(*) FROM ecommerce.silver.events;


COUNT(*)
5


In [0]:
from pyspark.sql import functions as F

silver_df = spark.table("ecommerce.silver.events")

gold_df = (
    silver_df
    .groupBy("event_date", "product_id")
    .agg(
        F.sum(F.when(F.col("event_type") == "view", 1).otherwise(0)).alias("views"),
        F.sum(F.when(F.col("event_type") == "purchase", 1).otherwise(0)).alias("purchases"),
        F.sum(F.when(F.col("event_type") == "purchase", F.col("price")).otherwise(0)).alias("revenue")
    )
    .filter(F.col("views") > 0)
    .withColumn("conversion_rate", F.col("purchases") / F.col("views"))
)

gold_df.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("ecommerce.gold.products")


In [0]:
%sql
SELECT * FROM ecommerce.gold.products;


event_date,product_id,views,purchases,revenue,conversion_rate
2024-01-03,p3,1,0,0.0,0.0
2024-01-02,p2,1,1,60.0,1.0
2024-01-01,p1,1,1,25.0,1.0


In [0]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load Gold data
df = spark.table("ecommerce.gold.products").toPandas()

# Safety check
assert len(df) > 0, "Gold table is empty"

# Features & label
X = df[["views", "revenue"]]
y = df["purchases"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [0]:
from sklearn.metrics import r2_score, mean_absolute_error
import numpy as np


In [0]:
with mlflow.start_run(run_name="linear_regression_v1"):

    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("features", "views,revenue")
    mlflow.log_param("test_size", 0.2)

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # Metrics
    if len(y_test) >= 2:
        r2 = r2_score(y_test, y_pred)
        mlflow.log_metric("r2_score", r2)
        print(f"R² Score: {r2:.4f}")
    else:
        mae = mean_absolute_error(y_test, y_pred)
        mlflow.log_metric("mae", mae)
        print(f"Too few samples for R², MAE: {mae:.4f}")

    # Log model with input example
    input_example = X_train.iloc[:1]
    mlflow.sklearn.log_model(
        model,
        artifact_path="model",
        input_example=input_example
    )


Too few samples for R², MAE: 1.0000




In [0]:
X = df[["views", "conversion_rate"]]
