In [1]:
# Print the scikit-learn version installed in this environment.
# NOTE(review): presumably recorded because the pickled artifacts loaded from
# model.bin need a matching scikit-learn version — confirm.
!pip freeze | grep scikit-learn

scikit-learn==1.2.2


In [9]:
import pickle
import pandas as pd
import numpy

In [3]:
# Restore the (dv, model) pair that was pickled together into model.bin.
# pickle.load can execute arbitrary code, so only load artifacts you trust.
with open('model.bin', mode='rb') as model_file:
    dv, model = pickle.load(model_file)

In [4]:
# Columns that are converted to string categories before vectorization.
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Load a trip-record parquet file and prepare it for scoring.

    Steps:
      1. Read the parquet file at ``filename`` (anything ``pd.read_parquet``
         accepts).
      2. Add a ``duration`` column: drop-off minus pick-up time, in minutes.
      3. Keep only trips whose duration is between 1 and 60 minutes,
         both bounds included.
      4. For every column listed in the module-level ``categorical`` list,
         replace missing values with -1, then cast to int and finally to str.

    Returns a filtered copy of the frame; the original index labels of the
    surviving rows are kept (the index is not reset).
    """
    trips = pd.read_parquet(filename)

    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # between() is inclusive on both ends by default, i.e. 1 <= duration <= 60.
    within_bounds = trips['duration'].between(1, 60)
    trips = trips.loc[within_bounds].copy()

    for column in categorical:
        trips[column] = trips[column].fillna(-1).astype('int').astype('str')

    return trips

In [5]:
# Source parquet file for the trips to score (yellow taxi, 2022-02).
input_file = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet'
df = read_data(input_file)

In [6]:
# One feature dict per trip, built from the categorical columns only,
# then vectorized with the loaded `dv` and scored with the loaded `model`.
trip_features = df[categorical].to_dict(orient='records')
X_val = dv.transform(trip_features)
y_pred = model.predict(X_val)

In [10]:
# Store the model output next to each trip; this adds a column to `df` in place.
df['predicted_duration'] = y_pred

In [12]:
# Summary statistics for the numeric columns, including `duration` and the
# newly added `predicted_duration`, as a quick sanity check of the predictions.
df.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration,predicted_duration
count,2918187.0,2822437.0,2918187.0,2822437.0,2918187.0,2918187.0,2918187.0,2918187.0,2918187.0,2918187.0,2918187.0,2918187.0,2822437.0,2822437.0,2918187.0,2918187.0
mean,1.702265,1.39166,5.747751,1.350555,1.168508,12.68422,1.021348,0.4942594,2.463432,0.3529895,0.2975203,19.00824,2.325968,0.06890632,13.79372,12.51342
std,0.4693961,0.9841901,643.2019,5.562606,0.4829941,10.64428,1.224293,0.0698627,2.596137,1.592588,0.03836547,13.46242,0.672831,0.2865852,9.643098,5.281404
min,1.0,0.0,0.0,1.0,0.0,-600.0,-4.5,-0.5,-188.0,-23.0,-0.3,-600.3,-2.5,-1.25,1.0,-7.485578
25%,1.0,1.0,1.1,1.0,1.0,6.5,0.0,0.5,1.0,0.0,0.3,11.8,2.5,0.0,7.033333,9.945231
50%,2.0,1.0,1.8,1.0,1.0,9.5,0.5,0.5,2.06,0.0,0.3,15.18,2.5,0.0,11.26667,10.66465
75%,2.0,1.0,3.15,1.0,1.0,14.5,2.5,0.5,3.09,0.0,0.3,20.75,2.5,0.0,17.66667,12.1328
max,6.0,9.0,348798.5,99.0,5.0,650.0,10.3,3.3,380.8,95.0,0.3,650.3,2.75,1.25,60.0,80.13667


In [13]:
# Period being scored; used to build per-ride identifiers.
year = 2022
month = 2

# ride_id looks like "2022/02_<index label>": the period prefix followed by
# the row's index label in `df`.
ride_prefix = f'{year:04d}/{month:02d}_'
df['ride_id'] = ride_prefix + df.index.astype('str')

In [15]:
# Only the identifier and the prediction are exported.
output_columns = ['ride_id', 'predicted_duration']
result_df = df[output_columns]

In [17]:
# Write the predictions as an uncompressed parquet file without the index.
# NOTE(review): the output name has no ".parquet" extension — confirm this is
# intended before anything downstream starts relying on the name.
output_file = f"results_{year:04d}-{month:02d}"
parquet_options = {
    'engine': 'pyarrow',
    'compression': None,
    'index': False,
}
result_df.to_parquet(output_file, **parquet_options)

In [18]:
# List the working directory with sizes rounded up to whole mebibytes,
# to check the size of the results file that was just written.
!ls -l --block-size=M

total 58M
-rw-rw-r-- 1 inris inris  1M Jun 21 20:37 Dockerfile
-rw-rw-r-- 1 inris inris  1M Jun 27 13:07 Pipfile
-rw-r--r-- 1 inris inris  1M Jun 27 13:05 Pipfile.lock
-rw-rw-r-- 1 inris inris  1M Jun 21 20:37 model.bin
-rw-rw-r-- 1 inris inris 58M Jun 27 13:27 results_2022-02
-rw-rw-r-- 1 inris inris  1M Jun 27 13:28 starter.ipynb


In [20]:
# Export this notebook (starter.ipynb) as a plain Python script.
!jupyter nbconvert --to script starter.ipynb

[NbConvertApp] Converting notebook starter.ipynb to script
[NbConvertApp] Writing 1459 bytes to starter.py
