# Notebook 4: Model Deployment

### The AutoML run in the previous notebook indicated that a random forest regressor with the hyperparameters used below is a strong modeling choice, so we train that model here and deploy it.

In [0]:
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import os

# Look up the current user through Spark SQL (per the original author this is
# the user's email). `spark` is not defined in this notebook — presumably the
# notebook runtime provides it; confirm before running elsewhere.
current_user_rows = spark.sql("SELECT current_user()").collect()
username = current_user_rows[0][0]

# The experiment lives under the user's Workspace folder (NOT inside the Repo).
experiment_name = "ames_housing_mlops"
experiment_path = "/Users/{}/{}".format(username, experiment_name)

# Make this the active experiment for the MLflow calls in the cells below.
mlflow.set_experiment(experiment_path)


In [0]:
# Preprocessed data, addressed relative to the notebook:
# '..' steps up out of 'notebooks/' and then down into 'data/'.
input_path = "../data/ames_preprocessed_numeric_unscaled.csv"
df = pd.read_csv(input_path)

# Show which file was actually resolved and preview the first rows.
print(f"Successfully loaded data from: {os.path.abspath(input_path)}")
print(df.head())

# Name of the target column; every other column is used as a feature.
target = "SalePrice_log"
y = df[target]
X = df.drop(columns=[target])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Sanity check on the shapes of the four pieces.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


In [0]:
# Relative output locations, so the notebook also works from a GitHub clone.
test_features_path = "../data/X_test.csv"
test_target_path = "../data/y_test.csv"

# Write the held-out features and target to separate CSV files (index omitted).
for frame, destination in (
    (X_test, test_features_path),
    (y_test, test_target_path),
):
    frame.to_csv(destination, index=False)

# Print the resolved absolute paths to confirm where the files landed.
print(f"Test features saved to: {os.path.abspath(test_features_path)}")
print(f"Test target saved to:   {os.path.abspath(test_target_path)}")

In [0]:
from mlflow.models.signature import infer_signature
from sklearn.ensemble import RandomForestRegressor
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

# Client used at the end of this cell to set the registry alias.
client = MlflowClient()

# Hyperparameters taken from the AutoML result of the previous notebook.
# Keeping them in one dict lets the same values build the estimator and be
# logged to the MLflow run, so the registered model stays traceable to them.
rf_params = {
    "max_features": 0.9269,
    "max_leaf_nodes": 357,
    "n_estimators": 8,
    "n_jobs": -1,
    "random_state": 42,
}
rf = RandomForestRegressor(**rf_params)

# Fit on the training split produced earlier in the notebook.
rf.fit(X_train, y_train)

# Infer the model signature from the training features and the predictions
# the fitted model makes on them.
train_preds = rf.predict(X_train)
signature = infer_signature(X_train, train_preds)

# Everything logged inside this block is attached to one MLflow run.
with mlflow.start_run(run_name="rf_ames_model") as run:
    # Record the hyperparameters alongside the model artifact.
    mlflow.log_params(rf_params)

    # Log the fitted model, together with its signature, under "rf_model".
    mlflow.sklearn.log_model(rf, "rf_model", signature=signature)

    # Root-mean-squared error on the held-out test split.
    test_preds = rf.predict(X_test)
    rmse = ((test_preds - y_test)**2).mean()**0.5
    mlflow.log_metric("rmse", rmse)

    print(f"Run ID: {run.info.run_id}, Test RMSE: {rmse}")

    # Register the artifact logged above as a version of "rf_ames_model".
    model_name = "rf_ames_model"
    model_uri = f"runs:/{run.info.run_id}/rf_model"
    registered_model = mlflow.register_model(model_uri, model_name)

    print(f"Model registered: {registered_model.name}, version: {registered_model.version}")

print(f"Deploying version {registered_model.version} to 'prod'...")

# Point the 'prod' alias at the version that was just registered.
client.set_registered_model_alias(
    name=model_name,
    alias="prod",
    version=registered_model.version
)

# The check-mark was mojibake ("âœ…") in the original message; restored here.
print(f"✅ SUCCESS: Model '{model_name}' (Version {registered_model.version}) is now deployed as '@prod'.")