# Start

In [1]:
# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
# Attach Google Drive to the Colab VM so the project folder is reachable as a local path.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [3]:
%pwd
%cd drive/My Drive/Works/X-ops/ml-ops/zCamp/monitor_evidently
!python --version

'/content'

/content/drive/My Drive/Works/X-ops/ml-ops/zCamp/monitor_evidently
Python 3.10.12


# 💻 Installation - Environment

In [None]:
# Install Evidently in Colab
!pip install evidently -q

In [9]:
# Needed Libraries
import datetime
import os

import pandas as pd
import requests

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric, ColumnQuantileMetric

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# EDA to Monitor-5!!

* [Data Source](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
* Ref1: _(TODO: add the reference link)_

In [6]:
# Download the March 2024 Green Taxi data - #Q1 - Ans
# The parquet file is read straight from the TLC CDN; the trailing `df.shape`
# displays (rows, columns) of the raw frame.
green_2024_03_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet"
df = pd.read_parquet(green_2024_03_url)
df.shape

(57457, 20)

In [7]:
# create target & filter outliers
# Trip duration in minutes, computed with the vectorised timedelta accessor
# instead of a per-row Python lambda.
trip_duration = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
df = df.assign(duration_min=trip_duration.dt.total_seconds() / 60)

# Keep trips lasting 0-60 minutes (inclusive) that carried 1-8 passengers.
# Rows with a missing passenger_count fail the comparison and are dropped.
keep_rows = (
    df.duration_min.between(0, 60)
    & (df.passenger_count > 0)
    & (df.passenger_count <= 8)
)
# .copy() gives an independent frame, so later column assignments (including a
# re-run of this cell) do not write into a slice of the unfiltered data.
df = df.loc[keep_rows].copy()
df.shape

(54135, 21)

In [8]:
# data labeling
# Name of the column holding the prediction target (trip duration in minutes).
target = "duration_min"

# Numerical input columns.
num_features = [
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "total_amount",
]

# Categorical input columns (pickup / drop-off location ids).
cat_features = [
    "PULocationID",
    "DOLocationID",
]

In [None]:
# Split data


In [16]:
# Evidently Metrics
# A report with a single metric: the 0.5 quantile (median) of `fare_amount`.
fare_median_metric = ColumnQuantileMetric(column_name='fare_amount', quantile=0.5)
report = Report(metrics=[fare_median_metric])

In [None]:
# Check Reports
# Reference = the first 30,000 rows of the filtered frame; current = the whole
# frame. NOTE(review): the current data therefore also contains the reference
# rows — confirm this overlap is intended.
reference_rows = df[:30000]
report.run(reference_data=reference_rows, current_data=df)
report.show(mode='inline')

## Useful Code Chunks

In [None]:
# Save files with url/names into destination folder - DE1
# Each entry is (file name on the CDN, local destination folder).
files = [('green_tripdata_2022-02.parquet', './data'), ('green_tripdata_2022-01.parquet', './data')]

print("Download files:")
for file, path in files:
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    save_path = f"{path}/{file}"

    # The destination folder may not exist yet on a fresh runtime.
    os.makedirs(path, exist_ok=True)

    # `with` closes the streamed connection even if the download fails midway.
    with requests.get(url, stream=True, timeout=60) as resp:
        # Fail loudly on 4xx/5xx instead of saving an error page as a .parquet file.
        resp.raise_for_status()

        # Content-Length can be absent; tqdm then shows a bar without a total.
        total_bytes = int(resp.headers.get("Content-Length", 0)) or None

        with open(save_path, "wb") as handle, tqdm(
            desc=f"{file}",
            postfix=f"save to {save_path}",
            total=total_bytes,
            unit="B",
            unit_scale=True,
        ) as progress:
            # iter_content() defaults to 1-byte chunks, which is extremely slow;
            # read 64 KiB at a time and advance the bar by the bytes written.
            for chunk in resp.iter_content(chunk_size=64 * 1024):
                handle.write(chunk)
                progress.update(len(chunk))

In [None]:
# Running pre-existing files: Q2-Ans
# Runs preprocess_data.py (resolved relative to the current working directory)
# with --raw_data_path set to the mlflow_colab folder on Drive and --dest_path
# set to ./output. Presumably the processed data ends up in ./output — verify
# against the script.
!python preprocess_data.py --raw_data_path '/content/drive/My Drive/Works/X-ops/ml-ops/zCamp/mlflow_colab' --dest_path ./output

In [None]:
# Running a model without HyperOpt - BASE MODEL: Q3-Ans
# Runs train.py with --data_path pointing at the mlflow_colab/output folder on
# Drive; the saved output of this cell shows the script printing an RMSE value.
!python train.py --data_path '/content/drive/My Drive/Works/X-ops/ml-ops/zCamp/mlflow_colab/output'

RMSE:  5.431162180141208


In [None]:
# Running a model with HyperOpt and getting the best params:
# hpo.py is resolved relative to the current working directory; the saved output
# shows a 15-trial progress bar followed by a printed `Best_Params` dict.
!python hpo.py

100% 15/15 [01:41<00:00,  6.76s/trial, best loss: 5.335419588556921]
Best_Params:  {'max_depth': 19.0, 'min_samples_leaf': 2.0, 'min_samples_split': 2.0, 'n_estimators': 11.0}


In [None]:
# Test Model:
# test.py is resolved relative to the current working directory; the saved
# output shows it printing an RMSE value.
!python test.py

RMSE:  5.567408012462019


In [None]:
# Let's use Databricks: https://community.cloud.databricks.com/
# Username: 2021fc04004@wilp.bits-pilani.ac.in
# mlflow.login() prompts interactively for the Databricks host, username and
# password when no valid credentials are found (see the saved output below).
# NOTE(review): the account e-mail appears in this comment and in the saved
# output — consider clearing outputs before sharing the notebook.
import mlflow
mlflow.login()

2024/05/29 13:40:16 INFO mlflow.utils.credentials: No valid Databricks credentials found, please enter your credentials...


Databricks Host (should begin with https://): https://community.cloud.databricks.com/
Username: 2021fc04004@wilp.bits-pilani.ac.in
Password: ··········


2024/05/29 13:42:10 INFO mlflow.utils.credentials: Successfully connected to MLflow hosted tracking server! Host: https://community.cloud.databricks.com.


In [None]:
# Mlflow in Data Bricks from Colab!
# Send tracking data to the Databricks-hosted server and select the experiment
# that subsequent runs are logged under.
databricks_experiment = "/Users/2021fc04004@wilp.bits-pilani.ac.in/zCamp02HW"

mlflow.set_tracking_uri("databricks")
mlflow.set_experiment(databricks_experiment)

In [None]:
# Running the pre-trained model with HyperOpt and tracking the runs in MLflow on "Databricks":
# hpo1.py is resolved relative to the current working directory; the saved
# output shows a 15-trial progress bar.
!python hpo1.py # Q5-Ans

100% 15/15 [01:49<00:00,  7.33s/trial, best loss: 5.335419588556921]


In [None]:
# Register the models and select the best model on test data (March-2023 data):
# register_model1.py is resolved relative to the current working directory.
# NOTE(review): the trailing comment records a KeyError on 'max_depth' for this
# run — TODO investigate inside register_model1.py.
!python register_model1.py # 'max_depth' KeyError