In [0]:
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

### Convert Spark → Pandas (for sklearn)

In [0]:
# Read the e-commerce events table through the Spark session.
df = spark.table("workspace.ecommerce.ecommerce_events_delta")

# Integer label column: the comparison `event_type == "purchase"` cast to int.
is_purchase = (df["event_type"] == "purchase").cast("int")

# Keep only the columns needed for modelling and attach the label.
labelled_events = df.select("price", "event_type").withColumn("label", is_purchase)

# Drop rows containing nulls, then materialize the result as a pandas
# DataFrame so scikit-learn can consume it.
purchase_df = labelled_events.dropna().toPandas()

### Create label & features

In [0]:
# Feature matrix: the single "price" column, kept two-dimensional as a DataFrame.
X = purchase_df.loc[:, ["price"]]
# Target vector: the "label" column.
y = purchase_df.loc[:, "label"]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

### Train the Logistic Regression model and log everything

In [0]:
# Train a logistic-regression classifier on the training split and record
# its parameters, held-out accuracy, and the fitted model in one MLflow run.
with mlflow.start_run(run_name="logistic_reg_v1"):

    # Parameters: logged in a single batch call; keys and values are the
    # same ones that were previously logged one by one.
    mlflow.log_params(
        {
            "model_type": "LogisticRegression",
            "test_size": 0.2,
            "features": "price",
        }
    )

    # Train with scikit-learn's default hyperparameters.
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Predict & evaluate on the held-out split.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Metric
    mlflow.log_metric("accuracy", accuracy)

    # Log the model together with a signature inferred from the training
    # inputs and the model's predictions on them, so the artifact stored in
    # this run records its input/output schema.
    mlflow.sklearn.log_model(
        model,
        "model",
        signature=infer_signature(X_train, model.predict(X_train)),
    )

In [0]:
# Show the accuracy value formatted to four decimal places.
accuracy_report = f"Model Accuracy: {accuracy:.4f}"
print(accuracy_report)

### Registering the model

In [0]:
# Infer the input/output schema from the training inputs and the model's
# predictions on them, so the registered model carries a signature.
signature = infer_signature(X_train, model.predict(X_train))

# Log and register inside an explicit run. Calling log_model with no active
# run can make the fluent API start one implicitly and leave it open, after
# which a later `mlflow.start_run(...)` fails because a run is already
# active. The context manager guarantees the run is ended.
# NOTE(review): this run is separate from the training run that holds the
# params/metric; consider registering from that run instead — confirm intent.
with mlflow.start_run(run_name="logistic_reg_v1_registration"):
    mlflow.sklearn.log_model(
        model,
        artifact_path="model",
        signature=signature,
        registered_model_name="ecommerce_purchase_logreg",
    )