In [2]:
import os
import uuid
import pickle
from typing import Union, Tuple, List, Dict
import numpy as np


import pandas as pd

import mlflow

from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

In [3]:
# Google client libraries read GOOGLE_APPLICATION_CREDENTIALS to locate a
# service-account key file. Use setdefault so credentials already configured in
# the environment are respected; the local path below is only a fallback.
# NOTE(review): presumably needed to fetch the model artifacts - confirm.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    "/home/habeeb/dprof-dezoomfinal-b4d188529d18.json",
)

# Public IP/hostname of the MLflow tracking server. Set the TRACKING_SERVER_HOST
# environment variable to point the notebook at another server without editing it.
TRACKING_SERVER_HOST = os.environ.get("TRACKING_SERVER_HOST", "35.224.212.79")
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

# Load the registered model version that currently carries the "production" alias.
# `model` is a module-level global used by predict() and apply_model().
model_name = "nyc-taxi-regressor-weighted-main9"
model = mlflow.pyfunc.load_model(f"models:/{model_name}@production")

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [4]:
def generate_uuids(n):
    """Return a list of ``n`` freshly generated random (version 4) UUID strings."""
    ids = []
    for _ in range(n):
        ids.append(str(uuid.uuid4()))
    return ids

def download_data(taxi = "green", year = 2021, month = 3):
    """
    Read one month of trip records into a DataFrame.

    The Parquet file is read directly from the CloudFront URL built from
    ``taxi``, ``year`` and the zero-padded ``month``; the URL is printed
    before the download starts.
    """
    source_url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{taxi}_tripdata_{year}-{month:02d}.parquet"
    print(f"Downloading data from {source_url}")
    return pd.read_parquet(source_url)

In [5]:
def prepare_df(data: Union[pd.DataFrame, List[Dict], Dict]) -> Tuple[pd.DataFrame, Union[np.ndarray, None]]:
    """
    Prepare taxi trip data for model input.

    Accepts a DataFrame, a list of record dicts, or a single record dict and
    returns a new DataFrame (a DataFrame passed in is copied, not modified) with:

      - ``duration``: trip length in minutes, computed when a matching
        pickup/dropoff pair (``lpep_*`` or ``tpep_*``) is present. Rows whose
        duration falls outside the 1-60 minute range are dropped. When no
        pair is present the column is filled with ``None``.
      - ``PULocationID`` / ``DOLocationID``: cast to ``str`` when present.
      - ``PU_DO``: ``"<PULocationID>_<DOLocationID>"`` when both are present.
      - ``ride_id``: a fresh random UUID string for every remaining row.

    Rows removed by the duration filter are not re-indexed; the returned
    frame keeps the original index labels of the surviving rows.

    Args:
        data: Input data.

    Returns:
        Tuple of (processed DataFrame, durations as a NumPy array, or None
        when no duration could be computed).

    Raises:
        ValueError: If ``data`` is not a DataFrame, list, or dict.
    """
    # Normalize input to DataFrame
    if isinstance(data, dict):
        df = pd.DataFrame([data])
    elif isinstance(data, list):
        df = pd.DataFrame(data)
    elif isinstance(data, pd.DataFrame):
        df = data.copy()
    else:
        raise ValueError("Input must be a DataFrame, list of dicts, or single dict.")

    # Determine pickup/dropoff column names (green = lpep_*, yellow = tpep_*).
    # Both columns of a pair must be present for it to be used.
    pickup_col = dropoff_col = None
    for prefix in ("lpep", "tpep"):
        candidate_pickup = f"{prefix}_pickup_datetime"
        candidate_dropoff = f"{prefix}_dropoff_datetime"
        if candidate_pickup in df.columns and candidate_dropoff in df.columns:
            pickup_col, dropoff_col = candidate_pickup, candidate_dropoff
            break

    has_duration = pickup_col is not None

    # Handle datetime conversion and duration calculation
    if has_duration:
        for col in (pickup_col, dropoff_col):
            # is_datetime64_any_dtype understands pandas extension dtypes
            # (tz-aware datetimes, string/Arrow-backed columns), on which
            # np.issubdtype raises TypeError instead of returning False.
            if not pd.api.types.is_datetime64_any_dtype(df[col]):
                df[col] = pd.to_datetime(df[col])

        df["duration"] = (df[dropoff_col] - df[pickup_col]).dt.total_seconds() / 60
        # Comparisons with NaN are False, so rows with a missing timestamp
        # are dropped here together with the out-of-range trips.
        df = df[(df["duration"] >= 1) & (df["duration"] <= 60)]
    else:
        df["duration"] = None

    # Convert categorical columns to string if present
    for col in ["PULocationID", "DOLocationID"]:
        if col in df.columns:
            df[col] = df[col].astype(str)

    # Create combined PU_DO feature
    if "PULocationID" in df.columns and "DOLocationID" in df.columns:
        df["PU_DO"] = df["PULocationID"] + "_" + df["DOLocationID"]

    df['ride_id'] = generate_uuids(len(df))

    # Return the target only when durations were actually computed. After the
    # range filter above the column cannot contain missing values. Checking the
    # flag (rather than notna().all()) keeps an empty input without datetime
    # columns from yielding an empty array instead of None.
    target = df["duration"].values if has_duration else None
    return df, target

def predict(features):
    """Run the module-level ``model`` on ``features`` and return its predictions."""
    return model.predict(features)

In [8]:
def apply_model(taxi: str = "green", year: int = 2021, month: int = 2, verbose: bool = True):
    """
    Score one month of taxi trips with the loaded model and save the results.

    Downloads the trip file for ``taxi``/``year``/``month``, prepares it with
    ``prepare_df``, predicts trip durations, and writes one row per trip to
    ``output/<taxi>/<YYYY>-<MM>.parquet`` (relative to the working directory;
    the directory is created if needed).

    Output columns: ``ride_id``, the pickup timestamp (named after the source
    column, ``lpep_pickup_datetime`` or ``tpep_pickup_datetime``),
    ``PULocationID``, ``DOLocationID``, ``actual_duration``,
    ``predicted_duration``, ``diff`` (actual minus predicted) and
    ``model_version`` (the MLflow run id from the loaded model's metadata).

    Args:
        taxi: Taxi type used in the download URL and output path.
        year: Year of the trip file.
        month: Month of the trip file.
        verbose: When True, print progress messages.

    Raises:
        KeyError: If the prepared data has no recognised pickup timestamp column.
    """
    if verbose:
        print(f"▶️ Starting model application for {taxi} taxi - {year:04d}-{month:02d}")

    # Download and prepare data
    df = download_data(taxi=taxi, year=year, month=month)
    dfs, targ = prepare_df(df)

    if verbose:
        print("📦 Data downloaded and prepared")

    # prepare_df accepts both lpep_* (green) and tpep_* (yellow) schemas, so
    # pick whichever pickup column is present instead of assuming lpep.
    # Checked before predicting so a schema problem fails fast.
    pickup_col = next(
        (c for c in ("lpep_pickup_datetime", "tpep_pickup_datetime") if c in dfs.columns),
        None,
    )
    if pickup_col is None:
        raise KeyError(
            "Prepared data has neither 'lpep_pickup_datetime' nor 'tpep_pickup_datetime'"
        )

    # prepare_df returns None for the target when no duration could be
    # computed; fall back to NaN so the results can still be written.
    if targ is None:
        targ = np.full(len(dfs), np.nan)

    # Make predictions
    y_pred = predict(dfs)

    # Construct results DataFrame
    df_result = pd.DataFrame({
        'ride_id': dfs['ride_id'],
        pickup_col: dfs[pickup_col],
        'PULocationID': dfs['PULocationID'],
        'DOLocationID': dfs['DOLocationID'],
        'actual_duration': targ,
        'predicted_duration': y_pred,
        'diff': targ - y_pred,
        'model_version': model.metadata.run_id
    })

    # Define and create output directory
    output_file = f'output/{taxi}/{year:04d}-{month:02d}.parquet'
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # Save to Parquet (index dropped: it only holds the pre-filter row labels)
    df_result.to_parquet(output_file, index=False)

    if verbose:
        print(f"✅ Results saved to {output_file}")


In [9]:
# Score the February 2021 green-taxi trips and write the results to Parquet.
apply_model(taxi="green", year=2021, month=2, verbose=True)

▶️ Starting model application for green taxi - 2021-02
Downloading data from https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet
📦 Data downloaded and prepared
✅ Results saved to output/green/2021-02.parquet


In [11]:
# List the output directory to confirm the Parquet file was written.
!ls output/green/

2021-02.parquet


In [12]:
# Read the saved results back from disk to inspect them.
pd.read_parquet("output/green/2021-02.parquet")

Unnamed: 0,ride_id,lpep_pickup_datetime,PULocationID,DOLocationID,actual_duration,predicted_duration,diff,model_version
0,8d77d765-fd95-4811-a726-88cc1cdd2130,2021-02-01 00:34:03,130,205,17.916667,16.572079,1.344588,131087963e1f42d682b3dd1aff9230cf
1,653033b7-2049-4fe1-a431-5a24b18ea5e4,2021-02-01 00:04:00,152,244,6.500000,7.200581,-0.700581,131087963e1f42d682b3dd1aff9230cf
2,ee0bc7f7-a09c-4791-818d-d434dcc6a28b,2021-02-01 00:18:51,152,48,15.250000,17.245272,-1.995272,131087963e1f42d682b3dd1aff9230cf
3,29ddcfe3-02c0-4b3e-a1d6-29960134abd4,2021-02-01 00:53:27,152,241,18.233333,24.199507,-5.966173,131087963e1f42d682b3dd1aff9230cf
4,25fbf825-c4bf-4a10-8d1a-66e256b25a07,2021-02-01 00:57:46,75,42,8.966667,10.122248,-1.155581,131087963e1f42d682b3dd1aff9230cf
...,...,...,...,...,...,...,...,...
61916,aa1467e7-b9e5-4d00-bffd-edb309059c00,2021-02-28 22:19:00,129,7,10.000000,13.721963,-3.721963,131087963e1f42d682b3dd1aff9230cf
61917,e99c5e2b-d197-4bc7-8e30-119047a67d90,2021-02-28 23:18:00,116,166,9.000000,10.243826,-1.243826,131087963e1f42d682b3dd1aff9230cf
61918,9a735180-982d-4820-9a3d-42edc4cae53f,2021-02-28 23:44:00,74,151,14.000000,13.407579,0.592421,131087963e1f42d682b3dd1aff9230cf
61919,0c66891a-fd85-4128-a4f6-f61d487fd54e,2021-02-28 23:07:00,42,42,7.000000,6.543267,0.456733,131087963e1f42d682b3dd1aff9230cf
