In [1]:
# ! pip install mlflow

In [2]:
import mlflow
import os

In [3]:
! mlflow --version

mlflow, version 2.13.0


In [4]:
# Local directory that the download step below writes the parquet files into.
TAXI_DATA_DIR = "taxi_data"
# exist_ok=True keeps this cell re-runnable when the directory is already there.
os.makedirs(TAXI_DATA_DIR, exist_ok=True)

### Downloading data

In [5]:
# Green-taxi trip data for January, February and March 2023, in month order.
download_links = [
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-03.parquet",
]

# Per-month aliases, kept because the commented-out wget lines refer to them.
link_taxi_jan, link_taxi_feb, link_taxi_mar = download_links


In [6]:

import subprocess

def download_parquet_files(links=None, dest_dir=None):
    """Download parquet files with ``wget``, skipping ones already on disk.

    Each file is stored under ``dest_dir`` using the last path segment of its
    URL as the file name. One status line is printed per link.

    Args:
        links: Iterable of URLs to fetch. Defaults to the notebook-level
            ``download_links`` list.
        dest_dir: Directory to store the files in. Defaults to the
            notebook-level ``TAXI_DATA_DIR``.

    Returns:
        None.
    """
    # Resolve defaults at call time so the notebook globals are only needed
    # when the caller does not pass explicit values.
    if links is None:
        links = download_links
    if dest_dir is None:
        dest_dir = TAXI_DATA_DIR

    for link in links:
        parquet_file_name = link.split("/")[-1]
        output_path = os.path.join(dest_dir, parquet_file_name)

        if os.path.exists(output_path):
            print(f"{parquet_file_name} already exists.")
            continue

        # Capture output as text: without it `result.stderr` is always None
        # and the failure message below carries no information.
        result = subprocess.run(
            ["wget", "-O", output_path, link],
            capture_output=True,
            text=True,
        )

        if result.returncode == 0:
            print(f"Downloaded {parquet_file_name}")
        else:
            # `wget -O` creates the output file before the transfer starts, so
            # a failed download leaves an empty/partial file behind. Remove it,
            # otherwise the existence check above would skip the file forever.
            if os.path.exists(output_path):
                os.remove(output_path)
            print(f"Failed to download {parquet_file_name}: {result.stderr.strip()}")

# Fetch every file in download_links that is not already in TAXI_DATA_DIR.
download_parquet_files()

# Uncomment to download directly without loop
#! wget {link_taxi_jan} -P {TAXI_DATA_DIR}
#! wget {link_taxi_feb} -P {TAXI_DATA_DIR}
#! wget {link_taxi_mar} -P {TAXI_DATA_DIR}

green_tripdata_2023-01.parquet already exists.
green_tripdata_2023-02.parquet already exists.
green_tripdata_2023-03.parquet already exists.


In [7]:
! ls {TAXI_DATA_DIR}

green_tripdata_2023-01.parquet	green_tripdata_2023-03.parquet
green_tripdata_2023-02.parquet


### Running Preprocess script

In [8]:
# Run the preprocessing script with TAXI_DATA_DIR as the raw-data path and
# ./output as the destination directory.
! python homework/preprocess_data.py --raw_data_path {TAXI_DATA_DIR} --dest_path ./output

In [9]:
# Count, then list, the files in ./output (the destination passed to the
# preprocessing script).
! ls output | wc -l
! ls output

4
dv.pkl	test.pkl  train.pkl  val.pkl


### Answer to Q2 : 4 files

## Q3: Train a model with autolog

In [10]:
from homework import train
from pathlib import Path

In [11]:
# Directory the preprocessing step wrote its .pkl files to; passed to the trainer below.
data_path = Path("./output")
# Last expression of the cell, so the directory contents are displayed.
os.listdir(data_path)

['val.pkl', 'test.pkl', 'dv.pkl', 'train.pkl']

### We use the modified train script: `train.trainer` returns the RMSE and `min_samples_split`

In [12]:
# Train inside one MLflow run so the metric and parameter logged below are
# recorded against that run.
with mlflow.start_run():
    rmse, min_samples_split = train.trainer(data_path=str(data_path))
    mlflow.log_metric(key="rmse", value=rmse)
    mlflow.log_param(key="min_samples_split", value=min_samples_split)




In [13]:
rmse

5.431162180141208

In [14]:
min_samples_split

2

### Answer to Q3 : min_samples_split is 2

## Q4. Launch the tracking server locally

In [15]:
# ! mlflow ui --backend-store-uri sqlite:///taxi_duration.db --default-artifact-root artifacts

In [16]:
# To stop the MLflow server listening on port 5000, run: kill $(lsof -t -i :5000)

### Answer to Q4 : default-artifact-root

## Q5. Tune model hyperparameters

In [17]:
# ! pip install hyperopt

In [18]:
# Run the hyperparameter-tuning script. Per the log output below, it creates the
# 'random-forest-hyperopt' MLflow experiment when that experiment does not exist yet.
! python homework/hpo.py

2024/05/24 17:18:46 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-hyperopt' does not exist. Creating a new experiment.















100%|██████████| 15/15 [01:00<00:00,  4.03s/trial, best loss: 5.335419588556921]


### Answer to Q5 : best RMSE = 5.335

## Q6. Promote the best model to the model registry

In [19]:
# Run the model-registration script. Per the log output below, it uses the
# 'random-forest-best-models' experiment and registers 'rf-duration-prediction'.
! python homework/register_model.py

2024/05/24 17:19:49 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-best-models' does not exist. Creating a new experiment.
Successfully registered model 'rf-duration-prediction'.
2024/05/24 17:20:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rf-duration-prediction, version 1
Created version '1' of model 'rf-duration-prediction'.


### Answer to Q6 : test RMSE of the best model is 5.567