In [1]:
import pandas as pd
import os
import mlflow
import yaml

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# Tracking configuration: where runs are recorded and under which experiment.
TRACKING_URI = "http://localhost:5000"
EXPERIMENT_NAME = "nyc-taxi-experiment"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# Columns treated as categorical model features.
categorical = ['PULocationID', 'DOLocationID']

In [3]:
# Load the raw file as-is and report how many rows it has before any filtering.
df = pd.read_parquet("./data/yellow_tripdata_2023-03.parquet")
num_rows = df.shape[0]
print(num_rows)

3403766


In [4]:
def read_dataframe(filename, categorical=('PULocationID', 'DOLocationID')):
    """Load a trip-record parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off time minus pick-up time, in minutes),
    keeps only rows whose duration is between 1 and 60 minutes inclusive, and
    casts the categorical columns to strings.

    Parameters
    ----------
    filename
        Path (or anything accepted by ``pd.read_parquet``) of the file to load.
    categorical : iterable of str, optional
        Names of the columns to cast to ``str``. Defaults to the pick-up and
        drop-off location ID columns.

    Returns
    -------
    pandas.DataFrame
        The filtered frame, independent of the frame that was read from disk.
    """
    df = pd.read_parquet(filename)

    trip_length = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = trip_length.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame so the column
    # assignment below writes to a real frame rather than a slice (which would
    # raise pandas' SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    # A list is required here: a tuple key would be read as a single label.
    columns = list(categorical)
    df[columns] = df[columns].astype(str)

    return df

In [5]:
# Same file, this time passed through read_dataframe(); print the row count
# that remains after its duration filter.
df_train = read_dataframe("./data/yellow_tripdata_2023-03.parquet")
num_rows = df_train.shape[0]
print(num_rows)

3316216


In [6]:
# `categorical` is defined in the configuration cell, and read_dataframe() has
# already cast those columns to str, so df_train is used as-is here rather than
# being mutated in place a second time.
train_dicts = df_train[categorical].to_dict(orient='records')

# One-hot encode the location IDs into a sparse feature matrix.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

target = 'duration'
y_train = df_train[target].values

with mlflow.start_run() as run:
    lr = LinearRegression()
    lr.fit(X_train, y_train)

    mlflow.sklearn.log_model(lr, artifact_path="model")

    # Training RMSE. The `squared=False` keyword of mean_squared_error is
    # deprecated/removed in recent scikit-learn, so take the root explicitly.
    y_pred = lr.predict(X_train)
    rmse = sqrt(mean_squared_error(y_train, y_pred))
    mlflow.log_metric("rmse", rmse)
    print(lr.intercept_)

    # Register the model that was just logged under this run.
    model_uri = f"runs:/{run.info.run_id}/model"
    mlflow.register_model(model_uri=model_uri, name="Homework_3")


Successfully registered model 'Homework_3'.
2025/10/31 15:10:24 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Homework_3, version 1
Created version '1' of model 'Homework_3'.
2025/10/31 15:10:24 INFO mlflow.tracking._tracking_service.client: 🏃 View run zealous-wasp-926 at: http://localhost:5000/#/experiments/1/runs/660fc0dcdcc1438a93810b9f8b63d864.
2025/10/31 15:10:24 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


24.774803905297286


In [7]:
# Fetch the artifacts behind `model_uri` to a local directory and locate the
# MLmodel file inside it.
local_path = mlflow.artifacts.download_artifacts(model_uri)
mlmodel_path = os.path.join(local_path, "MLmodel")

# MLmodel is parsed as YAML; safe_load avoids constructing arbitrary objects.
with open(mlmodel_path) as f:
    mlmodel_data = yaml.safe_load(f)

# .get() yields None when the key is absent.
model_size = mlmodel_data.get("model_size_bytes")
print("Model size (bytes):", model_size)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Model size (bytes): 4500
