# Day 12 of the 14 Days Databricks Challenge

In [0]:
import mlflow
import mlflow.sklearn

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from datetime import datetime

def log(msg: object) -> None:
    """Print *msg* to stdout, prefixed with a timestamp and a marker glyph.

    The timestamp is ``datetime.now()`` rendered with its default string
    format (naive local time), wrapped in square brackets.

    Args:
        msg: The message to display; it is interpolated with ``str()``
            semantics, so any object is accepted.
    """
    # The marker was previously stored as the mis-decoded byte sequence of
    # U+1F539 (small blue diamond); emit the real character instead.
    print(f"[{datetime.now()}] 🔹 {msg}")


In [0]:
# Read the `default.gold_product_metrics` table through the notebook's
# `spark` session and convert it to a pandas DataFrame for scikit-learn.
log("Loading gold_product_metrics table")

gold_metrics_sdf = spark.table("default.gold_product_metrics")
df = gold_metrics_sdf.toPandas()

record_count = len(df)
log(f"Total records loaded: {record_count}")


In [0]:
log("Preparing features and target")

# Selecting with a list of column names keeps X a one-column DataFrame,
# while selecting with a single name makes y a Series.
feature_columns = ["views"]
target_column = "purchases"

X = df[feature_columns]
y = df[target_column]


In [0]:
log("Splitting data into train and test")

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

train_rows, test_rows = len(X_train), len(X_test)
log(f"Train size: {train_rows}, Test size: {test_rows}")

In [0]:
log("Starting MLflow run")

# Everything logged inside the `with` block is attached to this one run;
# the context manager ends the run when the block exits.
with mlflow.start_run(run_name="linear_regression_views_to_purchases"):
    log("Logging model parameters")
    # Log all parameters in one batched call.
    # NOTE: 0.2 mirrors the test_size passed to train_test_split earlier in
    # this notebook; keep the two values in sync.
    mlflow.log_params(
        {
            "model_type": "LinearRegression",
            "feature": "views",
            "test_size": 0.2,
        }
    )

    log("Training Linear Regression model")
    model = LinearRegression()
    model.fit(X_train, y_train)

    log("Evaluating model")
    # Score on the held-out rows only, so the metric reflects unseen data.
    y_pred = model.predict(X_test)
    score = r2_score(y_test, y_pred)
    mlflow.log_metric("r2_score", score)

    log("Logging model to MLflow")
    mlflow.sklearn.log_model(model, artifact_path="model")

log("MLflow run completed")
# The label was previously the mis-decoded bytes of "R²"; print the real
# superscript-two character.
print(f"R² Score: {score:.4f}")

