In [1]:
# Q1: print the version of the installed MLflow command-line tool
!mlflow --version


mlflow, version 3.1.0


In [None]:
# Q2: run the preprocessing script on ./data and write its results to ./output
!python preprocess_data.py --raw_data_path ./data --dest_path ./output
# Answer: 4 files were saved to the output folder

In [1]:
# Q3 
import os
import pickle
import click

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error


def load_pickle(filename: str):
    """Return the object stored in the pickle file at ``filename``.

    Unpickling can execute arbitrary code, so only point this at files that
    were produced locally (here: the preprocessing output).
    """
    with open(filename, mode="rb") as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded


In [3]:
import mlflow 

mlflow.set_tracking_uri("sqlite:///mlflow.db")  # tracking URI: keep run metadata in a local SQLite file
mlflow.set_experiment("homework")  # make "homework" the active experiment (the log below shows it being created)

2025/07/13 03:09:02 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/07/13 03:09:02 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2025/07/13 03:09:02 INFO mlflow.tracking.fluent: Experiment with name 'homework' does not exist. Creating a new experiment.


<Experiment: artifact_location='/workspaces/mlopszoomcamp/week2/mlruns/4', creation_time=1752376142321, experiment_id='4', last_update_time=1752376142321, lifecycle_stage='active', name='homework', tags={}>

In [None]:
def run_train(data_path: str):
    """Train a random forest on the preprocessed data inside an MLflow run.

    Args:
        data_path: Directory containing ``train.pkl`` and ``val.pkl``; each
            file is unpickled into an ``(X, y)`` pair.

    Returns:
        The RMSE of the fitted model on the validation split.
    """
    # Enable autologging before fitting so the estimator is captured by MLflow.
    mlflow.autolog()
    with mlflow.start_run():
        X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
        X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

        rf = RandomForestRegressor(max_depth=10, random_state=0)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)

        rmse = root_mean_squared_error(y_val, y_pred)
        # The score used to be computed and thrown away; record it under the
        # same metric name the later hyperopt runs use.
        mlflow.log_metric("rmse", rmse)

    return rmse


run_train("./output")
# Q3 answer: the min_samples_split parameter recorded for this run is 2

2025/07/13 03:09:03 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [1]:
# Q4: launch a local MLflow tracking server with a SQLite backend store and a
# local folder for artifacts.
# NOTE: a server started with `!` runs in the foreground, so this cell does not
# return while it is up. The captured output below shows 127.0.0.1:5000 was
# already in use when this cell was executed.
!mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts_local

# Answer: besides --backend-store-uri, the server needs --default-artifact-root

2025/07/13 03:24:40 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/07/13 03:24:40 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2025/07/13 03:24:40 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/07/13 03:24:40 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
[2025-07-13 03:24:40 +0000] [14657] [INFO] Starting gunicorn 23.0.0
[2025-07-13 03:24:40 +0000] [14657] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2025-07-13 03:24:40 +0000] [14657] [ERROR] connection to ('127.0.0.1', 5000) failed: [Errno 98] Address already in use
[2025-07-13 03:24

In [4]:
# Q5 
import os
import pickle
import click
import mlflow
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

# Send runs to the tracking server at 127.0.0.1:5000 (it must already be running)
# and group the hyperopt trials under one experiment.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("random-forest-hyperopt")


def load_pickle(filename: str):
    """Read ``filename`` in binary mode and return the unpickled object.

    Only use on trusted, locally generated files: unpickling can run
    arbitrary code.
    """
    with open(filename, "rb") as source:
        unpickled = pickle.load(source)
    return unpickled

def run_optimization(data_path: str, num_trials: int):
    """Search RandomForestRegressor hyperparameters with hyperopt, one MLflow run per trial.

    Args:
        data_path: Directory containing ``train.pkl`` and ``val.pkl``; each
            file is unpickled into an ``(X, y)`` pair.
        num_trials: Number of hyperopt evaluations (passed as ``max_evals``).

    Each trial logs its sampled parameters and the validation ``rmse`` to the
    active MLflow experiment and prints the process memory before and after
    fitting.
    """
    # Imported once here instead of on every trial. A missing psutil now fails
    # immediately rather than being swallowed by the objective's error handler.
    import gc

    import psutil
    from hyperopt import STATUS_FAIL

    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    # Memory debugging: size of the training matrix's underlying data buffer.
    print(f"X_train shape: {X_train.shape}")
    print(f"X_train memory usage: {X_train.data.nbytes / (1024**2):.2f} MB")
    print(f"y_train shape: {y_train.shape}")

    process = psutil.Process(os.getpid())

    def _rss_mb() -> float:
        """Resident set size of this process in MiB."""
        return process.memory_info().rss / (1024**2)

    def objective(params):
        """Fit one forest with ``params`` and report its validation RMSE to hyperopt."""
        try:
            with mlflow.start_run():
                mlflow.log_params(params)

                print(f"Memory before RF: {_rss_mb():.2f} MB")

                # n_jobs=1 limits the fit to a single worker.
                rf = RandomForestRegressor(**params, n_jobs=1)
                rf.fit(X_train, y_train)

                print(f"Memory after RF: {_rss_mb():.2f} MB")

                y_pred = rf.predict(X_val)
                rmse = root_mean_squared_error(y_val, y_pred)
                mlflow.log_metric("rmse", rmse)

                # Drop the fitted model before the next trial starts.
                del rf
                gc.collect()

                return {'loss': rmse, 'status': STATUS_OK}
        except Exception as e:
            # Best effort: report the problem and let the search continue.
            # The status must be one of hyperopt's status strings; the previous
            # 'Not ok' value is not, so a failing trial aborted fmin.
            print(f"Error in objective: {e}")
            return {'loss': float('inf'), 'status': STATUS_FAIL}

    # Deliberately small search space (author's note: max_depth was reduced
    # from 20 and n_estimators from 50 for testing).
    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 1, 10, 1)),
        'n_estimators': scope.int(hp.quniform('n_estimators', 10, 20, 1)),
        'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
        'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
        'random_state': 42
    }

    # Fixed generator so the sequence of sampled configurations is repeatable.
    rstate = np.random.default_rng(42)

    trials = Trials()
    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=num_trials,
        trials=trials,
        rstate=rstate
    )


In [None]:
run_optimization("./output",15)
# Q5 answer: best validation RMSE across the trials = 5.4138

X_train shape: (65946, 5702)
X_train memory usage: 1.01 MB
y_train shape: (65946,)
Memory before RF: 263.31 MB                           
Memory after RF: 263.44 MB                            
🏃 View run nervous-goose-659 at: http://127.0.0.1:5000/#/experiments/1/runs/cfa3e665b5bd44459a188abe23862227

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1

Memory before RF: 263.44 MB                                                    
Memory after RF: 263.44 MB                                                     
🏃 View run aged-kit-233 at: http://127.0.0.1:5000/#/experiments/1/runs/e657fc54ade74c6c943341ce0d9d04bf

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                   

Memory before RF: 263.44 MB                                                    
Memory after RF: 263.44 MB                                                     
🏃 View run incongruous-pig-963 at: http://127.0.0.1:5000/#/experiments/1/runs/95ab3299e76840ae8e7c19189e61c076

🧪 View experiment a

In [1]:
# Q6 
import os
import pickle
import click
import mlflow

from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

# Experiment whose runs are searched for the best hyperparameter sets.
HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
# Experiment that receives the retrained candidate models.
EXPERIMENT_NAME = "random-forest-best-models"
# Run parameters that are copied into the RandomForestRegressor constructor.
RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state']

# Use the tracking server at 127.0.0.1:5000, make EXPERIMENT_NAME the active
# experiment for new runs, and turn on scikit-learn autologging.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.sklearn.autolog()


def load_pickle(filename):
    """Open ``filename`` as binary and return the single pickled object it holds.

    Intended for locally produced files only; unpickling untrusted data can
    execute arbitrary code.
    """
    with open(filename, "rb") as stream:
        contents = pickle.load(stream)
    return contents

def train_and_log_model(data_path, params):
    """Retrain a random forest with ``params`` and log its scores and model to MLflow.

    Args:
        data_path: Directory containing ``train.pkl``, ``val.pkl`` and
            ``test.pkl``; each file is unpickled into an ``(X, y)`` pair.
        params: Mapping of hyperparameter names to values. Only the names
            listed in ``RF_PARAMS`` are used, and values that parse as
            integers are converted to ``int``.

    Logs ``val_rmse``, ``test_rmse`` and ``rmse`` (a copy of the test score)
    plus the fitted model under the artifact path ``model``.
    """
    import gc

    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    with mlflow.start_run():
        new_params = {}
        for param in RF_PARAMS:
            if param not in params:
                continue
            try:
                new_params[param] = int(params[param])
            except (TypeError, ValueError):
                # Not an integer literal: pass the value through unchanged.
                # Only conversion errors are caught, so unrelated failures
                # are no longer hidden by a bare except.
                new_params[param] = params[param]

        # n_jobs=1 limits the fit to a single worker.
        rf = RandomForestRegressor(**new_params, n_jobs=1)
        rf.fit(X_train, y_train)

        # Evaluate model on the validation and test sets
        val_rmse = root_mean_squared_error(y_val, rf.predict(X_val))
        mlflow.log_metric("val_rmse", val_rmse)
        test_rmse = root_mean_squared_error(y_test, rf.predict(X_test))
        mlflow.log_metric("test_rmse", test_rmse)
        # Same value under the plain "rmse" name so runs can be ordered by it.
        mlflow.log_metric("rmse", test_rmse)

        # The "model" artifact path is what a runs:/<run_id>/model URI points at.
        # NOTE(review): mlflow.sklearn.autolog() is also enabled in this
        # notebook and may already log the model - confirm this explicit call
        # is still needed.
        mlflow.sklearn.log_model(rf, "model")

        # Release the fitted model before the next candidate is trained.
        del rf
        gc.collect()



In [7]:
def run_register_model(data_path: str, top_n: int):
    """Retrain the best hyperopt candidates and register the one with the lowest test RMSE.

    Args:
        data_path: Directory with the pickled splits, forwarded to
            ``train_and_log_model``.
        top_n: How many runs of ``HPO_EXPERIMENT_NAME`` (lowest ``rmse``
            first) to retrain.

    Raises:
        ValueError: If either experiment does not exist on the tracking
            server, or no retrained run is available to register.
    """
    client = MlflowClient()

    # Retrieve the top_n model runs and log the models
    hpo_experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    if hpo_experiment is None:
        # get_experiment_by_name returns None for an unknown name; fail with a
        # clear message instead of an AttributeError on the next line.
        raise ValueError(
            f"Experiment '{HPO_EXPERIMENT_NAME}' not found; run the hyperparameter search first."
        )
    runs = client.search_runs(
        experiment_ids=[hpo_experiment.experiment_id],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.rmse ASC"]
    )
    for run in runs:
        train_and_log_model(data_path=data_path, params=run.data.params)

    # Select the model with the lowest test RMSE
    experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is None:
        raise ValueError(f"Experiment '{EXPERIMENT_NAME}' not found on the tracking server.")
    candidates = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        # Order by the test score itself rather than its "rmse" alias.
        order_by=["metrics.test_rmse ASC"]
    )
    if not candidates:
        raise ValueError(f"No runs found in experiment '{EXPERIMENT_NAME}'; nothing to register.")
    best_run = candidates[0]

    print(f"run id: {best_run.info.run_id}, rmse: {best_run.data.metrics['test_rmse']:.4f}")

    # Register the best model
    model_uri = f"runs:/{best_run.info.run_id}/model"
    mlflow.register_model(model_uri=model_uri, name="nyc-taxi-model")



In [8]:
# Retrain the 5 best hyperopt runs and register the best resulting model.
run_register_model("./output",5)

: 