In [0]:
from pyspark.sql import functions as F

silver = spark.table("ecommerce.silver.events")

gold = silver.groupBy("user_id", "product_id").agg(
    F.sum(F.when(F.col("event_type") == "view", 1).otherwise(0)).alias("views"),
    F.sum(F.when(F.col("event_type") == "cart", 1).otherwise(0)).alias("cart_adds"),
    F.sum(F.when(F.col("event_type") == "purchase", 1).otherwise(0)).alias("purchases"),
    F.sum(F.when(F.col("event_type") == "purchase", F.col("price")).otherwise(0)).alias("revenue")
)

gold.write.mode("overwrite").option("mergeSchema", "true").saveAsTable("ecommerce.gold.products")


In [0]:
spark.table("ecommerce.gold.products").count()
#spark.table("ecommerce.gold.products").show(5)


3

In [0]:
df = spark.table("ecommerce.gold.products").toPandas()


In [0]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn

X = df[["views", "cart_adds"]]
y = df["purchases"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

models = {
    "linear": LinearRegression(),
    "decision_tree": DecisionTreeRegressor(max_depth=5),
    "random_forest": RandomForestRegressor(n_estimators=50, random_state=42)
}

for name, model in models.items():
    with mlflow.start_run(run_name=f"{name}_model"):
        mlflow.log_param("model_type", name)

        model.fit(X_train, y_train)
        r2 = model.score(X_test, y_test)

        mlflow.log_metric("r2_score", r2)
        mlflow.sklearn.log_model(
            model,
            "model",
            input_example=X_train.iloc[:2]
        )

        print(f"✅ {name}: R² = {r2:.4f}")




✅ linear: R² = nan




✅ decision_tree: R² = nan




✅ random_forest: R² = nan
