In [1]:
# Record the Python interpreter version this notebook was run with.
!python -V

Python 3.11.7


In [2]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import mlflow

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from mlflow import MlflowClient
from mlflow.entities import ViewType

# MLflow tracking store handed to MlflowClient in the cells below.
# The SQLite path is relative, so it resolves against the kernel's working directory.
TRACKING_URI = "sqlite:///mlflow.db"

## Q1. Run Mage

In [32]:
# Show the working directory the download and extraction commands below run in.
!pwd

/workspaces/mlops-zoomcamp/03-orchestration


In [33]:
# Scratch directory for the downloaded archive.
# -p keeps the cell re-runnable: no error if homework_3 already exists.
!mkdir -p homework_3

In [35]:
# Check that homework_3 now exists next to the notebook.
!ls -l

total 60
drwxrwxrwx+ 2 codespace codespace  4096 May 30 13:07 homework_3
-rw-rw-rw-  1 codespace codespace 56021 May 30 13:06 homework_3.ipynb


In [36]:
# Download a zip snapshot of the master branch of the mage-ai/mlops repository (saved as master.zip).
!wget https://github.com/mage-ai/mlops/archive/refs/heads/master.zip

--2024-05-30 13:08:01--  https://github.com/mage-ai/mlops/archive/refs/heads/master.zip
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/mage-ai/mlops/zip/refs/heads/master [following]
--2024-05-30 13:08:01--  https://codeload.github.com/mage-ai/mlops/zip/refs/heads/master
Resolving codeload.github.com (codeload.github.com)... 140.82.112.9
Connecting to codeload.github.com (codeload.github.com)|140.82.112.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘master.zip’

master.zip              [ <=>                ]  93.72K  --.-KB/s    in 0.01s   

2024-05-30 13:08:01 (7.58 MB/s) - ‘master.zip’ saved [95967]



In [37]:
# Move the downloaded archive into the homework_3 scratch directory.
!mv master.zip homework_3

In [38]:
# Verify that master.zip is now inside homework_3.
!ls -l homework_3

total 96
-rw-rw-rw- 1 codespace codespace 95967 May 30 13:08 master.zip


In [39]:
# Extract the archive into the current directory (creates mlops-master/).
# -n never overwrites existing files, so re-running this cell neither stalls on
# unzip's interactive "replace?" prompt nor clobbers files already edited under
# mlops-master/.
!unzip -n homework_3/master.zip

Archive:  homework_3/master.zip
64ab08ebe90022cdf742dfcddb634b1b5f296945
   creating: mlops-master/
  inflating: mlops-master/.env.dev   
  inflating: mlops-master/.gitignore  
  inflating: mlops-master/Dockerfile  
  inflating: mlops-master/README.md  
  inflating: mlops-master/docker-compose.yml  
   creating: mlops-master/mlops/
 extracting: mlops-master/mlops/__init__.py  
  inflating: mlops-master/mlops/design.yaml  
  inflating: mlops-master/mlops/metadata.yaml  
   creating: mlops-master/mlops/presenters/
   creating: mlops-master/mlops/presenters/overview/
   creating: mlops-master/mlops/presenters/overview/dashboard/
  inflating: mlops-master/mlops/presenters/overview/dashboard/block_layout.yaml  
   creating: mlops-master/mlops/presenters/pipelines/
   creating: mlops-master/mlops/presenters/pipelines/sklearn_training/
   creating: mlops-master/mlops/presenters/pipelines/sklearn_training/dashboard/
  inflating: mlops-master/mlops/presenters/pipelines/sklearn_training/dashboar

In [40]:
# The archive has been extracted; remove the scratch directory together with master.zip inside it.
!rm -r homework_3

In [6]:
# Check the result: homework_3 should be gone and mlops-master/ present.
!ls -l

total 212
-rw-rw-rw-  1 codespace codespace 209855 May 30 14:38 homework_3.ipynb
drwxr-xr-x+ 9 codespace codespace   4096 Jun 10 02:58 mlops-master


### Run these commands in the terminal
```
cd mlops-master
./scripts/start.sh
```

Then open http://127.0.0.1:6789 in a browser.


![image.png](attachment:38d7675f-9dfb-4cb6-aea6-50cf969730d0.png)

## Q2. Creating a project

In [7]:
# Confirm the working directory that the relative paths below are resolved against.
!pwd

/workspaces/mlops-zoomcamp/03-orchestration


In [17]:
# List the working directory; mlops-master/ is the extracted repository.
!ls -l 

total 208
-rw-rw-rw-  1 codespace codespace 208226 Jun 10 03:24 homework_3.ipynb
drwxr-xr-x+ 9 codespace codespace   4096 Jun 10 02:58 mlops-master


In [16]:
# Inspect the mlops/ folder of the extracted repository; the homework_03 project directory is listed here.
!ls -l mlops-master/mlops

total 56
-rw-r--r--   1 codespace codespace    0 May 29 14:22 __init__.py
drwxrwxrwx+  2 root      root      4096 May 31 14:16 __pycache__
-rw-r--r--   1 codespace codespace  160 May 29 14:22 design.yaml
drwxr-xr-x+ 16 root      root      4096 Jun  9 05:27 homework_03
-rw-r--r--   1 codespace codespace  479 May 30 13:23 metadata.yaml
drwxrwxrwx+  2 root      root      4096 May 30 13:23 pipelines
drwxr-xr-x+  4 codespace codespace 4096 May 29 14:22 presenters
-rwxr-xr-x   1 codespace codespace  100 May 29 14:22 requirements.txt
drwxr-xr-x+ 19 root      root      4096 Jun  6 14:26 rj92_data_preparation
-rw-r--r--   1 codespace codespace  202 Jun  9 04:51 settings.yaml
drwxr-xr-x+  7 codespace codespace 4096 May 30 13:23 unit_0_setup
drwxr-xr-x+ 14 root      root      4096 May 29 14:32 unit_1_data_preparation
drwxr-xr-x+ 12 codespace codespace 4096 May 29 14:22 unit_3_observability
drwxrwxrwx+  8 root      root      4096 Jun  4 16:10 utilidades
drwxr-xr-x+  8 codespace codespace 4096 Jun 

![image.png](attachment:d8ba89da-dcfe-4484-8c17-53002386bd96.png)

In [15]:
# Look inside the homework_03 project to locate its metadata.yaml.
!ls -l mlops-master/mlops/homework_03/

total 60
-rwxr-xr-x  1 root root    0 May 29 14:31 __init__.py
drwxr-xr-x+ 2 root root 4096 May 29 14:32 __pycache__
drwxr-xr-x+ 3 root root 4096 May 29 14:32 charts
drwxr-xr-x+ 3 root root 4096 May 29 14:32 custom
drwxr-xr-x+ 3 root root 4096 Jun  9 19:05 data_exporters
drwxr-xr-x+ 3 root root 4096 Jun  9 05:00 data_loaders
drwxr-xr-x+ 2 root root 4096 May 29 14:31 dbt
drwxr-xr-x+ 3 root root 4096 May 29 14:32 extensions
drwxr-xr-x+ 3 root root 4096 May 29 14:32 interactions
-rwxr-xr-x  1 root root 4723 May 29 14:31 io_config.yaml
-rwxr-xr-x  1 root root 1970 Jun  9 04:49 metadata.yaml
drwxr-xr-x+ 5 root root 4096 Jun  9 05:01 pipelines
-rwxr-xr-x  1 root root    0 May 29 14:31 requirements.txt
drwxr-xr-x+ 3 root root 4096 May 29 14:32 scratchpads
drwxr-xr-x+ 3 root root 4096 Jun  9 18:21 transformers
drwxr-xr-x+ 3 root root 4096 May 29 14:32 utils


In [19]:
# Count the lines in the homework_03 project's metadata.yaml.
!wc -l mlops-master/mlops/homework_03/metadata.yaml

55 mlops-master/mlops/homework_03/metadata.yaml


## Q3. Creating a pipeline

![image.png](attachment:013054a1-fce0-4cbb-a1f8-bb808557d836.png)

![image.png](attachment:fc41ba89-7b17-4e9c-8ad3-64df91147ea6.png)

## Q4. Data preparation

![image.png](attachment:dbfcf823-7920-43e1-9ec9-8f6b32607eae.png)

![image.png](attachment:bb1a171b-1bcf-4f74-a85f-5a65348b9347.png)

## Q5. Tune model hyperparameters

In [23]:
# Create the client here so the cell works on a fresh kernel (Restart & Run All);
# in document order `client` was otherwise first defined by a later cell.
client = MlflowClient(tracking_uri=TRACKING_URI)

# Peek at up to five active runs of experiment 1. No order_by is passed, so the
# tracking store's default ordering applies; to rank by training RMSE instead, add
#     order_by=["metrics.training_root_mean_squared_error ASC"]
runs = client.search_runs(
    experiment_ids=["1"],
    filter_string="",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
)

# Print the full repr of each run (metrics, params and tags).
for run in runs:
    print(run)

<Run: data=<RunData: metrics={'training_mean_absolute_error': 3.4244701942312354,
 'training_mean_squared_error': 27.083054499499358,
 'training_r2_score': 0.6673983775155525,
 'training_root_mean_squared_error': 5.204138209108148,
 'training_score': 0.6673983775155525}, params={'bootstrap': 'True',
 'ccp_alpha': '0.0',
 'criterion': 'squared_error',
 'max_depth': '10',
 'max_features': '1.0',
 'max_leaf_nodes': 'None',
 'max_samples': 'None',
 'min_impurity_decrease': '0.0',
 'min_samples_leaf': '1',
 'min_samples_split': '2',
 'min_weight_fraction_leaf': '0.0',
 'monotonic_cst': 'None',
 'n_estimators': '100',
 'n_jobs': 'None',
 'oob_score': 'False',
 'random_state': '0',
 'verbose': '0',
 'warm_start': 'False'}, tags={'estimator_class': 'sklearn.ensemble._forest.RandomForestRegressor',
 'estimator_name': 'RandomForestRegressor',
 'mlflow.log-model.history': '[{"run_id": "195568ec67154979b1b1200431ca6d4e", '
                             '"artifact_path": "model", "utc_time_created":

In [28]:
# Open a client on the tracking store and print every experiment it returns,
# one per paragraph.
client = MlflowClient(tracking_uri=TRACKING_URI)
all_experiments = client.search_experiments()

for experiment in all_experiments:
    print(experiment, end="\n\n")

<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-tracking/artifacts/2', creation_time=1716910749068, experiment_id='2', last_update_time=1716910749068, lifecycle_stage='active', name='random-forest-hyperopt', tags={}>

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1716910319322, experiment_id='1', last_update_time=1716910319322, lifecycle_stage='active', name='homework_2', tags={}>

<Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1716910191707, experiment_id='0', last_update_time=1716910191707, lifecycle_stage='active', name='Default', tags={}>



In [31]:
# Four active runs of experiment 2 with the lowest logged "rmse" metric, best first.
runs = client.search_runs(
    experiment_ids=["2"],
    order_by=["metrics.rmse ASC"],
    max_results=4,
    run_view_type=ViewType.ACTIVE_ONLY,
    filter_string="",
)

In [32]:
# One paragraph per run: identifiers on the first line, then RMSE and hyperparameters indented.
for run in runs:
    print(
        f"run id: {run.info.run_id}, run name: {run.info.run_name}\n"
        f"\trmse: {run.data.metrics['rmse']:.4f}\n"
        f"\tparams: {run.data.params}",
        end="\n\n",
    )

run id: 23eea0670bee43368624f68474525578, run name: stylish-cat-847
	rmse: 5.3354
	params: {'max_depth': '19', 'min_samples_leaf': '2', 'min_samples_split': '2', 'n_estimators': '11', 'random_state': '42'}

run id: da524a7ac3f648ddb620dec4ef792664, run name: gaudy-gnu-606
	rmse: 5.3547
	params: {'max_depth': '15', 'min_samples_leaf': '2', 'min_samples_split': '3', 'n_estimators': '40', 'random_state': '42'}

run id: e4a8dc0a7df84bd99e1cb0ea15997aaf, run name: carefree-duck-743
	rmse: 5.3550
	params: {'max_depth': '20', 'min_samples_leaf': '1', 'min_samples_split': '9', 'n_estimators': '19', 'random_state': '42'}

run id: 7b69efd69a174404ab117490f0d09d24, run name: carefree-bat-43
	rmse: 5.3575
	params: {'max_depth': '14', 'min_samples_leaf': '3', 'min_samples_split': '4', 'n_estimators': '26', 'random_state': '42'}



## Q6. Promote the best model to the model registry

In [36]:
# Re-open the client and print the experiments again, one per paragraph.
client = MlflowClient(tracking_uri=TRACKING_URI)
all_experiments = client.search_experiments()

for experiment in all_experiments:
    print(experiment, end="\n\n")

<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-tracking/artifacts/3', creation_time=1716911118470, experiment_id='3', last_update_time=1716911118470, lifecycle_stage='active', name='random-forest-best-models', tags={}>

<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-tracking/artifacts/2', creation_time=1716910749068, experiment_id='2', last_update_time=1716910749068, lifecycle_stage='active', name='random-forest-hyperopt', tags={}>

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1716910319322, experiment_id='1', last_update_time=1716910319322, lifecycle_stage='active', name='homework_2', tags={}>

<Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1716910191707, experiment_id='0', last_update_time=1716910191707, lifecycle_stage='active', name='Default', tags={}>



In [37]:
# Five active runs of experiment 3 with the lowest "test_rmse", best first.
best_runs = client.search_runs(
    experiment_ids=["3"],
    order_by=["metrics.test_rmse ASC"],
    max_results=5,
    run_view_type=ViewType.ACTIVE_ONLY,
    filter_string="",
)

# One line per run: metric first, then the identifiers.
for run in best_runs:
    print(
        f"test_rmse: {run.data.metrics['test_rmse']:.4f}, "
        f"run id: {run.info.run_id}, "
        f"run name: {run.info.run_name}"
    )

test_rmse: 5.5674, run id: 07ac4b9435504737bf4407e3df0603ec, run name: nimble-donkey-780
test_rmse: 5.5853, run id: bc0aa0aa834d456093d7408a6952f99a, run name: selective-rook-571
test_rmse: 5.5895, run id: ecd6a9569b7d4d9480ae9e92a377ab78, run name: invincible-yak-504
test_rmse: 5.5921, run id: 0810eeedda184f2a88b411b6061ea736, run name: bold-shrimp-931
test_rmse: 5.5942, run id: 37e5afc2f4cb4442b1b8d99cb69e0f4e, run name: stately-squid-757


In [47]:
# Look up every version registered under the model name and report, per version,
# its stage, artifact source and originating run.
model_name = 'ml_regressor_green_taxi_trip'
versions = client.search_model_versions(f"name='{model_name}'")

for version in versions:
    print(
        f"version: {version.version} \n"
        f"\tstage: {version.current_stage}, "
        f"source {version.source}, "
        f"rund id {version.run_id}"
    )

version: 1 
	stage: None, source /workspaces/mlops-zoomcamp/02-experiment-tracking/artifacts/3/07ac4b9435504737bf4407e3df0603ec/artifacts/model, rund id 07ac4b9435504737bf4407e3df0603ec
