# Notebook 4: Model Deployment

### AutoML in the previous notebook identified a random forest regressor, with the hyperparameters used below, as a strong modeling choice. In this notebook we train that model, log it to MLflow, and register it for deployment.

In [3]:
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import os

# Point MLflow at the experiment that all runs in this notebook are logged under.
# Left as the cell's last expression so the resulting Experiment object is displayed.
mlflow.set_experiment(experiment_name="ames_housing_experiment")


2025/11/30 16:34:34 INFO mlflow.tracking.fluent: Experiment with name 'ames_housing_experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/iankidwell/Documents/Housing-Price-MLops/notebooks/mlruns/928281827655018339', creation_time=1764542074349, experiment_id='928281827655018339', last_update_time=1764542074349, lifecycle_stage='active', name='ames_housing_experiment', tags={}>

In [4]:
# The '..' goes up one level from 'notebooks/', then into 'data/'
input_path = "../data/ames_preprocessed_numeric_unscaled.csv"

# Load the CSV
df = pd.read_csv(input_path)

# Verification
print(f"Successfully loaded data from: {os.path.abspath(input_path)}")
print(df.head())

# Define target column
target = "SalePrice_log"

# Columns that would hand the answer to the model if left among the features.
# NOTE(review): `SalePrice_log` is presumably log(SalePrice). If the raw
# `SalePrice` column is still in this CSV, keeping it as a feature is direct
# target leakage and makes test metrics meaningless. Confirm against the
# preprocessing notebook. The drop below is a no-op when the column is absent.
leakage_columns = ["SalePrice"]
leaked_present = [col for col in leakage_columns if col in df.columns]
if leaked_present:
    print(f"Excluded target-leaking columns from features: {leaked_present}")

# Split features and target.
# The first drop still raises KeyError if the target column is missing;
# the second one silently skips leakage columns that are not in the frame.
# NOTE(review): `Order` and `PID` look like row/parcel identifiers rather than
# predictive features -- TODO confirm whether they should be kept.
X = df.drop(columns=[target]).drop(columns=leakage_columns, errors="ignore")
y = df[target]

# Split into train/test sets (fixed seed so the held-out rows are reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Quick check
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


Successfully loaded data from: /Users/iankidwell/Documents/Housing-Price-MLops/data/ames_preprocessed_numeric_unscaled.csv
   Order        PID  MS SubClass  Lot Frontage  Lot Area  Overall Qual  \
0      1  526301100           20         141.0     31770             6   
1      2  526350040           20          80.0     11622             5   
2      3  526351010           20          81.0     14267             6   
3      4  526353030           20          93.0     11160             7   
4      5  527105010           60          74.0     13830             5   

   Overall Cond  Year Built  Year Remod/Add  Mas Vnr Area  ...  \
0             5        1960            1960         112.0  ...   
1             6        1961            1961           0.0  ...   
2             6        1958            1958         108.0  ...   
3             5        1968            1968           0.0  ...   
4             5        1997            1998           0.0  ...   

   Enclosed Porch  3Ssn Porch  Scre

In [5]:
# Destination files for the held-out split. Both paths are relative to the
# notebook's working directory, so nothing machine-specific is baked in.
test_features_path = "../data/X_test.csv"
test_target_path = "../data/y_test.csv"

# Write the features and the target to their own CSVs, without the index column.
X_test.to_csv(path_or_buf=test_features_path, index=False)
y_test.to_csv(path_or_buf=test_target_path, index=False)

# Echo the resolved absolute locations so the reader can see where the files landed.
print(f"Test features saved to: {os.path.abspath(test_features_path)}")
print(f"Test target saved to:   {os.path.abspath(test_target_path)}")

Test features saved to: /Users/iankidwell/Documents/Housing-Price-MLops/data/X_test.csv
Test target saved to:   /Users/iankidwell/Documents/Housing-Price-MLops/data/y_test.csv


In [None]:
from mlflow.models.signature import infer_signature
from sklearn.ensemble import RandomForestRegressor
import mlflow
import mlflow.sklearn

# Hyperparameters taken from the AutoML search in the previous notebook.
# Kept in one dict so exactly the same values are used to build the model
# and recorded on the MLflow run.
rf_params = {
    "max_features": 0.9269,
    "max_leaf_nodes": 357,
    "n_estimators": 8,
    "n_jobs": -1,
    "random_state": 42,
}
rf = RandomForestRegressor(**rf_params)

# Fit on training data
rf.fit(X_train, y_train)

# Infer model signature based on training data and predictions
train_preds = rf.predict(X_train)
signature = infer_signature(X_train, train_preds)

# Evaluate before opening the run so a failure here cannot leave a
# half-populated run behind. The RMSE is in the units of y_test, i.e. the
# `SalePrice_log` target defined earlier in this notebook, not in dollars.
test_preds = rf.predict(X_test)
rmse = ((test_preds - y_test)**2).mean()**0.5

# Start MLflow run
with mlflow.start_run(run_name="rf_ames_model") as run:
    # Record the hyperparameters so the run documents what was trained
    mlflow.log_params(rf_params)

    # Log model with signature
    mlflow.sklearn.log_model(rf, "rf_model", signature=signature)

    # Log metrics (cast to a plain float rather than a NumPy scalar)
    mlflow.log_metric("rmse", float(rmse))

    print(f"Run ID: {run.info.run_id}, Test RMSE: {rmse}")

    # Register the model logged above under a stable registry name
    model_name = "rf_ames_model"
    model_uri = f"runs:/{run.info.run_id}/rf_model"
    registered_model = mlflow.register_model(model_uri, model_name)

    print(f"Model registered: {registered_model.name}, version: {registered_model.version}")




Run ID: a9960f2cb37f4c6c93b98d8768c2e469, Test RMSE: 0.006775868494260694


Registered model 'rf_ames_model' already exists. Creating a new version of this model...


Model registered: workspace.default.rf_ames_model, version: 1


Created version '1' of model 'workspace.default.rf_ames_model'.
