# Day 13 of 14 Days Databricks Challenge

In [0]:
from datetime import datetime
import mlflow
import mlflow.sklearn

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

def log(msg):
    """Print ``msg`` to stdout, prefixed with the current local timestamp.

    Args:
        msg: Text to display after the timestamp and marker.
    """
    # "\U0001F539" is the small blue diamond marker. It is written as an
    # escape so the source stays ASCII and cannot be mangled by a wrong
    # file-encoding round trip (which previously turned it into mojibake).
    print(f"[{datetime.now()}] \U0001F539 {msg}")


In [0]:
# Announce the step, then pull the Gold product metrics table onto the driver.
log("Loading Gold data for sklearn models")

gold_metrics = spark.table("default.gold_product_metrics")
df = gold_metrics.toPandas()  # materialises the Spark table as a pandas DataFrame

# Single-feature regression: "views" is the only predictor, "purchases" the target.
# The double brackets keep X as a one-column DataFrame; y is a Series.
y = df["purchases"]
X = df[["views"]]

log(f"Dataset size: {len(df)}")


In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

log("Train-test split completed")


In [0]:
# Candidate regressors keyed by a short name; the key is reused as the
# MLflow "model_type" parameter and as the run-name prefix in the training loop.
models = {
    "linear": LinearRegression(),
    # Seeded like the forest below so that repeated fits are reproducible.
    "decision_tree": DecisionTreeRegressor(max_depth=5, random_state=42),
    "random_forest": RandomForestRegressor(n_estimators=100, random_state=42),
}


In [0]:
# Train, score and log every candidate model, one MLflow run per model.
for name, model in models.items():
    log(f"Starting MLflow run for {name}")

    # Each model gets its own run so parameters, metric and artifact stay grouped.
    with mlflow.start_run(run_name=f"{name}_model"):
        mlflow.log_param("model_type", name)

        log("Training model")
        model.fit(X_train, y_train)

        log("Evaluating model")
        y_pred = model.predict(X_test)
        score = r2_score(y_test, y_pred)

        mlflow.log_metric("r2_score", score)

        log("Logging model artifact")
        mlflow.sklearn.log_model(model, "model")

        # "\u00b2" is the superscript two; it is written as an escape so the
        # label prints as R-squared instead of mis-decoded bytes.
        print(f"{name}: R\u00b2 = {score:.4f}")

    log(f"Completed run for {name}")


In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression as SparkLR


In [0]:
log("Loading Gold data for Spark ML pipeline")

# Keep the data as a Spark DataFrame this time (no conversion to pandas).
gold_table_name = "default.gold_product_metrics"
spark_df = spark.table(gold_table_name)

# Print the column names and types for a quick check before modelling.
spark_df.printSchema()


In [0]:
log("Creating feature vector")

# Pack the "views" column into the single "features" vector column
# that the regression stage is configured to read.
feature_columns = ["views"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")


In [0]:
# Spark linear regression stage: predicts "purchases" from the "features" vector.
lr = SparkLR(labelCol="purchases", featuresCol="features")


In [0]:
# Stage order matters: the assembler must create "features" before the regressor reads it.
pipeline_stages = [assembler, lr]
pipeline = Pipeline(stages=pipeline_stages)


In [0]:
log("Splitting Spark data")

# Split with 0.8 / 0.2 weights for train / test, seeded with 42.
split_frames = spark_df.randomSplit([0.8, 0.2], seed=42)
train_df, test_df = split_frames


In [0]:
log("Training Spark ML pipeline")

# Fit every stage on the training split; the result is the fitted pipeline model.
spark_model = pipeline.fit(dataset=train_df)

log("Spark ML pipeline training completed")


In [0]:
log("Running predictions on test data")

# Run the fitted pipeline on the test split and keep the result for evaluation.
predictions = spark_model.transform(test_df)

# Preview the input, the label and the model output side by side (first 5 rows).
preview_columns = ["views", "purchases", "prediction"]
predictions.select(*preview_columns).show(5)


In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

log("Evaluating Spark ML model")

# Score the test-split predictions with the R-squared metric.
evaluator = RegressionEvaluator(
    labelCol="purchases",
    predictionCol="prediction",
    metricName="r2",
)

r2 = evaluator.evaluate(predictions)
# "\u00b2" is the superscript two; it is written as an escape so the label
# reads R-squared instead of mis-decoded bytes.
log(f"Spark ML R\u00b2 Score: {r2}")


In [0]:
log("Saving Spark ML pipeline to Volume")

# Persist the fitted pipeline, replacing any copy already stored at this path.
model_path = "/Volumes/workspace/ecommerce/ecommerce_data/models/spark_lr_pipeline"
spark_model.write().overwrite().save(model_path)

log("Spark ML pipeline saved successfully")
