# Notebook 4: Model Deployment

### AutoML in the previous notebook showed that a random forest regressor with the hyperparameters set below is a strong modeling choice. We use that configuration here to train, log, and deploy the model.

In [0]:
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Make the Ames housing experiment the active MLflow experiment so that the
# runs started later in this notebook are recorded under it.
mlflow.set_experiment(
    experiment_name="/Users/iakidwell@uchicago.edu/ames_housing_experiment"
)


<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/4007138355986532', creation_time=1763573445696, experiment_id='4007138355986532', last_update_time=1763578712016, lifecycle_stage='active', name='/Users/iakidwell@uchicago.edu/ames_housing_experiment', tags={'mlflow.experiment.sourceName': '/Users/iakidwell@uchicago.edu/ames_housing_experiment',
 'mlflow.experimentKind': 'custom_model_development',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'iakidwell@uchicago.edu',
 'mlflow.ownerId': '70831446628778'}>

In [0]:
# Load the preprocessed Ames housing table from the workspace
# (the file name suggests numeric, unscaled features — TODO confirm).
df = pd.read_csv(
    "/Workspace/Users/iakidwell@uchicago.edu/ames_preprocessed_numeric_unscaled.csv"
)

# Name of the column the model will predict
target = "SalePrice_log"

# Separate the response from the predictors: every other column is a feature
y = df[target]
X = df.drop(columns=[target])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity check: shapes of the four partitions, in the order they were returned
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))


(2344, 53) (586, 53) (2344,) (586,)


In [0]:

# Destination files for the held-out split
test_features_path = "/Workspace/Users/iakidwell@uchicago.edu/X_test.csv"
test_target_path = "/Workspace/Users/iakidwell@uchicago.edu/y_test.csv"

# Write features and target to separate CSV files; the pandas index is not
# written, so only the data columns end up in each file
for frame, path in ((X_test, test_features_path), (y_test, test_target_path)):
    frame.to_csv(path, index=False)

# Report where each file was written
print(f"Test features saved to: {test_features_path}")
print(f"Test target saved to: {test_target_path}")

Test features saved to: /Workspace/Users/iakidwell@uchicago.edu/X_test.csv
Test target saved to: /Workspace/Users/iakidwell@uchicago.edu/y_test.csv


In [0]:
from mlflow.models.signature import infer_signature
from sklearn.ensemble import RandomForestRegressor
import mlflow
import mlflow.sklearn

# Train the random forest chosen by AutoML, then log it to MLflow and register it.

# AutoML-best hyperparameters. They live in one dict so the exact values used
# to build the estimator are also the ones recorded on the MLflow run.
rf_params = {
    "max_features": 0.9269,
    "max_leaf_nodes": 357,
    "n_estimators": 8,
    "n_jobs": -1,
    "random_state": 42,
}
rf = RandomForestRegressor(**rf_params)

# Fit on training data
rf.fit(X_train, y_train)

# Infer model signature based on training data and predictions
train_preds = rf.predict(X_train)
signature = infer_signature(X_train, train_preds)

# Start MLflow run
with mlflow.start_run(run_name="rf_ames_model") as run:
    # Record the hyperparameters next to the model and metric so the run
    # documents how the model was produced
    mlflow.log_params(rf_params)

    # Log model with signature
    mlflow.sklearn.log_model(rf, "rf_model", signature=signature)

    # Log the hold-out RMSE, computed in the same units as y_test
    # NOTE(review): the recorded run reports an RMSE of about 0.0068 on the
    # log-price target, which looks unusually low — confirm that no feature
    # column is derived from the sale price.
    test_preds = rf.predict(X_test)
    rmse = ((test_preds - y_test)**2).mean()**0.5
    mlflow.log_metric("rmse", rmse)

    print(f"Run ID: {run.info.run_id}, Test RMSE: {rmse}")

    # Register the model logged above by pointing at its artifact within this run
    model_name = "rf_ames_model"
    model_uri = f"runs:/{run.info.run_id}/rf_model"
    registered_model = mlflow.register_model(model_uri, model_name)

    print(f"Model registered: {registered_model.name}, version: {registered_model.version}")




Run ID: a9960f2cb37f4c6c93b98d8768c2e469, Test RMSE: 0.006775868494260694


Registered model 'rf_ames_model' already exists. Creating a new version of this model...


Model registered: workspace.default.rf_ames_model, version: 1


Created version '1' of model 'workspace.default.rf_ames_model'.
