In [1]:
import pickle
import pandas as pd

In [2]:
# Location of the pickled (vectorizer, model) pair and the feature columns
# that are fed to the vectorizer.
model_path = "model.bin"
categorical_columns = ['PULocationID', 'DOLocationID']

# Period to score; both values are interpolated into the input URL below and
# into the output file name / ride ids produced later in the notebook.
year, month = 2022, 2
filename = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet'

In [3]:
def load_model(path: str) -> tuple:
    """Load a pickled two-item artifact from ``path``.

    Args:
        path: Location of the pickle file to open in binary mode.

    Returns:
        A 2-tuple with the two unpickled objects, in the order they were
        stored (used by the rest of this notebook as vectorizer, model).

    Raises:
        ValueError / TypeError: if the unpickled object does not unpack into
            exactly two items.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    with open(path, 'rb') as model_file:
        vectorizer, estimator = pickle.load(model_file)

    return vectorizer, estimator

def read_data(filename: str, categorical_columns: list) -> pd.DataFrame:
    """Read a trip Parquet file and prepare it for scoring.

    Steps:
      1. Read ``filename`` with ``pd.read_parquet``.
      2. Add a ``duration`` column: drop-off minus pick-up time, in minutes.
      3. Keep only rows whose duration is between 1 and 60 minutes, inclusive.
      4. In ``categorical_columns``, replace missing values with -1, then cast
         to int and finally to str.

    Args:
        filename: Path or URL of the Parquet file.
        categorical_columns: Names of the columns to convert to strings.

    Returns:
        A new DataFrame holding the filtered rows (original index labels are
        kept) with the extra ``duration`` column.
    """
    trips = pd.read_parquet(filename)

    # Trip length in minutes.
    elapsed = trips["tpep_dropoff_datetime"] - trips["tpep_pickup_datetime"]
    trips["duration"] = elapsed.dt.total_seconds() / 60

    # `between` is inclusive on both ends by default, i.e. 1 <= duration <= 60.
    in_range = trips["duration"].between(1, 60)
    trips = trips.loc[in_range].copy()

    # Missing ids become the sentinel -1 before the int -> str conversion.
    as_strings = trips[categorical_columns].fillna(-1).astype("int").astype("str")
    trips[categorical_columns] = as_strings

    return trips

def model_prediction(dict_vectorizer, model, dataframe, categorical_columns):
    """Predict from the categorical columns of ``dataframe``.

    Args:
        dict_vectorizer: Object whose ``transform`` accepts a list of dicts.
        model: Object whose ``predict`` accepts the output of ``transform``.
        dataframe: DataFrame containing ``categorical_columns``.
        categorical_columns: Columns to turn into one dict per row.

    Returns:
        Whatever ``model.predict`` returns for the transformed rows.
    """
    # One {column: value} dict per row, in row order.
    feature_records = dataframe[categorical_columns].to_dict(orient='records')
    return model.predict(dict_vectorizer.transform(feature_records))

def save_results(dataframe, predictions, year, month) -> None:
    """Write one prediction per ride to a Parquet file.

    The output has two columns, ``ride_id`` and ``predictions``, and is
    written to ``predictions_yellow_<YYYY>_<MM>.parquet`` (a path relative to
    the current working directory) with the pyarrow engine, no compression and
    no index column.

    The input ``dataframe`` is not modified; only its index is read.

    Args:
        dataframe: Scored rows; its index labels become part of ``ride_id``.
        predictions: One prediction per row of ``dataframe``.
        year: Year of the scored period, zero-padded to 4 digits.
        month: Month of the scored period, zero-padded to 2 digits.

    Raises:
        ValueError: if ``predictions`` does not have one entry per row.
    """
    # NOTE(review): the id is "<YYYY>/<MM>" immediately followed by the index
    # label, with no separator (e.g. "2022/0217"). Kept as-is so existing
    # consumers are unaffected; confirm whether "<YYYY>/<MM>_<index>" was meant.
    ride_ids = f"{year:04d}/{month:02d}" + dataframe.index.astype("str")

    # Assemble the output in a fresh frame so the caller's DataFrame does not
    # silently gain "ride_id" / "predictions" columns as a side effect.
    df_result = pd.DataFrame(index=dataframe.index)
    df_result["ride_id"] = ride_ids
    df_result["predictions"] = predictions

    output_file = f"predictions_yellow_{year:04d}_{month:02d}.parquet"
    df_result.to_parquet(output_file, engine="pyarrow", compression=None, index=False)

In [4]:
# Restore the pickled vectorizer / model pair.
dv, model = load_model(model_path)

# Read and prepare the trips for the configured period.
df = read_data(filename, categorical_columns)

# Score every remaining trip, then persist the predictions to Parquet.
y_pred = model_prediction(dv, model, df, categorical_columns)
save_results(df, y_pred, year, month)