In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import root_mean_squared_error

In [2]:
import mlflow

# Point the MLflow client at the tracking server on localhost and select the
# experiment that runs started from this notebook are recorded under.
TRACKING_URI = "http://localhost:5000"
EXPERIMENT_NAME = "nyc-taxi-experiment"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1749335674203, experiment_id='1', last_update_time=1749335674203, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [3]:

# Yellow-taxi trip records (2023-03, per the file name), read directly from the remote parquet file.
TRIP_DATA_URL = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet'
df = pd.read_parquet(TRIP_DATA_URL)

print("total records:", df.shape[0])

total records: 3403766


In [4]:
# Trip duration in minutes: drop-off timestamp minus pick-up timestamp,
# converted to seconds and divided by 60.
df['duration'] = (
    df['tpep_dropoff_datetime']
    .sub(df['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

# Keep trips lasting at least 1 minute and strictly less than 60 minutes;
# .copy() gives a frame that is independent of `df`.
df_filtered = df.query('duration >= 1 and duration < 60').copy()

total_rows = len(df_filtered)

print(f"Total rows after removing outliers: {total_rows}")

Total rows after removing outliers: 3316138


In [5]:
# Down-sample to at most 1,000,000 rows, using a fixed seed so the sample is reproducible.
# n is capped at the frame length because DataFrame.sample raises ValueError when n
# exceeds the number of rows and replace=False (the default); with >= 1,000,000 rows
# available the result is identical to sampling exactly 1,000,000.
df_sample = df_filtered.sample(n=min(1_000_000, len(df_filtered)), random_state=42)

In [6]:
# Pick-up / drop-off location IDs are converted to strings so they can be
# handled as categorical values rather than numbers.
categorical = ['PULocationID', 'DOLocationID']

df_sample = df_sample.astype(dict.fromkeys(categorical, str))

In [7]:
# Target vector: the trip duration column as a NumPy array.
y_train = df_sample['duration'].to_numpy()

# Feature matrix: one dict per row holding the categorical columns, then
# fit the DictVectorizer and encode the dicts in a single step.
train_dicts = df_sample[categorical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [8]:
with mlflow.start_run():

    # Fit a plain linear regression on the vectorized features
    lr = LinearRegression().fit(X_train, y_train)

    # Score on the same data the model was fitted on, so this is a training RMSE
    y_pred = lr.predict(X_train)
    rmse = root_mean_squared_error(y_train, y_pred)

    # Record the run's parameters, then its metric
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("num_samples", len(df_sample))
    mlflow.log_metric("rmse", rmse)

    # Pickle the fitted DictVectorizer to a local file and attach it to the run
    with open("preprocessor.b", "wb") as dv_file:
        pickle.dump(dv, dv_file)
    mlflow.log_artifact("preprocessor.b", artifact_path="preprocessor")

    # Store the fitted model itself as an MLflow sklearn model
    mlflow.sklearn.log_model(lr, artifact_path="models_mlflow")

    print(f'\nThe intercept of the model is: {lr.intercept_}')




The intercept of the model is: 24.956777553604446
🏃 View run overjoyed-skink-317 at: http://localhost:5000/#/experiments/1/runs/53c8061a3d6a448a8b74d2eb5cc5a3dd
🧪 View experiment at: http://localhost:5000/#/experiments/1
