In [2]:
# Shell escape: print the version of whichever `python` the shell resolves to.
!python -V

Python 3.9.23


In [3]:
import pandas as pd

In [4]:
import pickle

In [5]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error

In [6]:
import mlflow

# Send all tracking calls to the MLflow server on localhost:5000.
mlflow.set_tracking_uri(uri="http://localhost:5000")
# Select the experiment by name; kept as the last expression so the cell
# still displays the returned Experiment object.
mlflow.set_experiment(experiment_name="nyc-taxi-experiment-hmwrk")



<Experiment: artifact_location='mlflow-artifacts:/2', creation_time=1760640144799, experiment_id='2', last_update_time=1760640144799, lifecycle_stage='active', name='nyc-taxi-experiment-hmwrk', tags={}>

In [7]:
# Load the unfiltered 2023-03 yellow-taxi trip file directly from its URL.
df = pd.read_parquet(
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet"
)

In [8]:
# Row count of the raw frame, before any duration filtering.
print(df.shape[0])

3403766


In [9]:
def read_dataframe(filename):
    """Load a trip parquet file and prepare it for duration modelling.

    Steps performed:
      1. Read the parquet file at ``filename`` with ``pd.read_parquet``.
      2. Add a ``duration`` column: drop-off minus pick-up time, in minutes.
      3. Keep only trips whose duration is between 1 and 60 minutes inclusive.
      4. Cast ``PULocationID`` and ``DOLocationID`` to ``str``.

    Parameters
    ----------
    filename : path or URL accepted by ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        An independent frame containing the filtered rows, with the added
        ``duration`` column and string-typed location ids.
    """
    df = pd.read_parquet(filename)

    # Trip length as a timedelta, converted to fractional minutes.
    trip_time = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Explicit .copy(): the column assignment below must write to a frame we
    # own, not to the result of a boolean-mask selection, otherwise pandas may
    # raise SettingWithCopyWarning.
    within_range = (df['duration'] >= 1) & (df['duration'] <= 60)
    df = df[within_range].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [10]:
# Training frame: the 2023-03 trips after read_dataframe's duration filter
# and string cast of the location ids.
df_x_train = read_dataframe(
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet"
)

In [11]:
# Row count that survived the duration filter.
print(df_x_train.shape[0])

3316216


In [13]:
from sklearn.linear_model import LinearRegression

# Expects df_x_train (from read_dataframe) to carry the 'PULocationID',
# 'DOLocationID' and 'duration' columns.
categorical = ['PULocationID', 'DOLocationID']
target = 'duration'

# One {column: value} dict per trip, the input format DictVectorizer consumes.
train_dicts = df_x_train.loc[:, categorical].to_dict(orient='records')

# Fit the vectorizer on the training dicts and encode them in one pass.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
y_train = df_x_train[target].to_numpy()

# fit() returns the estimator itself, so construction and training chain.
lr = LinearRegression().fit(X_train, y_train)

print(f"Model intercept: {lr.intercept_}")


Model intercept: 24.77203445209766


In [18]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Predict on the same matrix the model was fitted on, so this is the
# *training* RMSE, not a validation score.
y_pred = lr.predict(X_train)
# Use the root_mean_squared_error helper already imported at the top of the
# notebook instead of hand-rolling sqrt(MSE).
rmse = root_mean_squared_error(y_train, y_pred)

In [19]:
import xgboost as xgb

In [20]:
from pathlib import Path

In [21]:
# Local directory for serialized artifacts; exist_ok=True keeps the cell
# re-runnable when the directory is already there.
models_folder = Path("models")
models_folder.mkdir(parents=False, exist_ok=True)

In [22]:
# Serialize the fitted DictVectorizer so it can be logged alongside the model.
with (models_folder / "preprocessor.b").open("wb") as f_out:
    pickle.dump(dv, f_out)

In [23]:
with mlflow.start_run():
    # Metric computed in the RMSE cell above.
    mlflow.log_metric(key="rmse", value=rmse)

    # Upload the pickled DictVectorizer under the run's "preprocessor" folder.
    mlflow.log_artifact(
        local_path=str(models_folder / "preprocessor.b"),
        artifact_path="preprocessor",
    )

    # Store the fitted linear regression with MLflow's sklearn flavor.
    mlflow.sklearn.log_model(lr, artifact_path="models_mlflow")



🏃 View run bemused-sloth-524 at: http://localhost:5000/#/experiments/2/runs/14e12949b1934dfd8204499b724c1ba6
🧪 View experiment at: http://localhost:5000/#/experiments/2


In [29]:
import mlflow

# Prefer the run created earlier in this kernel session so the cell keeps
# working after the logging cell is re-run (each re-run creates a new run id).
# Fall back to the originally recorded run id when no run has been started in
# this session, e.g. after a kernel restart.
last_run = mlflow.last_active_run()
run_id = (
    last_run.info.run_id
    if last_run is not None
    else "14e12949b1934dfd8204499b724c1ba6"
)

# Fetch only the MLmodel descriptor from the run's logged model folder.
local_path = mlflow.artifacts.download_artifacts(
    run_id=run_id,
    artifact_path="models_mlflow/MLmodel"
)

with open(local_path, "r") as f:
    mlmodel_content = f.read()

print(mlmodel_content)


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 