In [1]:
# Record the installed scikit-learn version next to the results.
# NOTE(review): `!pip` runs whichever pip is first on PATH, which may not belong
# to the kernel's environment; `%pip` targets the kernel — confirm they match.
!pip freeze | grep scikit-learn

scikit-learn==1.4.1.post1


In [2]:
# Print the version of the `python` executable found on PATH.
# NOTE(review): this may differ from the interpreter running the kernel — confirm.
!python -V

Python 3.10.13


In [11]:
import pickle
import pandas as pd
import numpy as np

In [4]:
# Load the (dv, model) pair that was pickled together into model.bin.
# NOTE(review): unpickling executes arbitrary code and the artifact is coupled to
# the library version that produced it — only load trusted files and keep the
# installed scikit-learn version aligned with the one used for training.
with open('model.bin', 'rb') as model_file:
    dv, model = pickle.load(model_file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [5]:
# Columns treated as categorical features: read_data casts them to strings and
# they are later passed, as records, to `dv.transform`.
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Load a trip-record parquet file and prepare it for scoring.

    Adds a ``duration`` column (drop-off time minus pickup time, in minutes),
    keeps only trips lasting between 1 and 60 minutes inclusive, and converts
    the ``categorical`` columns to strings, encoding missing values as ``'-1'``.

    Parameters
    ----------
    filename : str
        Path or URL accepted by ``pandas.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of the input with the extra ``duration`` column. The
        index is not reset, so row labels are those of the unfiltered frame.
    """
    trips = pd.read_parquet(filename)

    # Trip length as a timedelta first, then expressed in minutes.
    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # `between` is inclusive on both ends, i.e. 1 <= duration <= 60.
    keep = trips['duration'].between(1, 60)
    trips = trips[keep].copy()

    # Missing IDs become -1 before the int -> str cast so they map to '-1'.
    as_text = trips[categorical].fillna(-1).astype('int').astype('str')
    trips[categorical] = as_text

    return trips

In [6]:
# Year and month select which monthly yellow_tripdata file is scored. Keeping
# them as variables means the download URL and the ride_id prefix built later
# in the notebook are derived from the same values instead of two literals.
year = 2023
month = 3
input_file = (
    'https://d37ci6vzurychx.cloudfront.net/trip-data/'
    f'yellow_tripdata_{year:04d}-{month:02d}.parquet'
)

# Download, filter and type-convert the trips (see read_data).
df = read_data(input_file)

In [8]:
# Preview the first rows of the loaded, filtered trips.
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.0,1.0,N,238,42,2,8.6,1.0,0.5,0.0,0.0,1.0,11.1,0.0,0.0,10.0
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.4,1.0,N,138,231,1,52.7,6.0,0.5,12.54,0.0,1.0,76.49,2.5,1.25,31.083333
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.3,1.0,N,140,186,1,18.4,3.5,0.5,4.65,0.0,1.0,28.05,2.5,0.0,14.366667
3,1,2023-03-01 00:49:37,2023-03-01 01:01:05,1.0,2.9,1.0,N,140,43,1,15.6,3.5,0.5,4.1,0.0,1.0,24.7,2.5,0.0,11.466667
4,2,2023-03-01 00:08:04,2023-03-01 00:11:06,1.0,1.23,1.0,N,79,137,1,7.2,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.0,3.033333


In [9]:
# One dict per ride holding only the categorical columns (already strings).
dicts = df[categorical].to_dict(orient='records')
# Encode the records with the `dv` object loaded from model.bin.
# NOTE(review): named X_val although this looks like the data being scored — confirm.
X_val = dv.transform(dicts)
# One prediction per row of df, in the same row order as `dicts`.
y_pred = model.predict(X_val)

In [10]:
# Inspect the prediction array (numpy abbreviates the repr of long arrays).
y_pred

array([16.24590642, 26.1347962 , 11.88426424, ..., 11.59533603,
       13.11317847, 12.89999218])

In [12]:
# Standard deviation of the predictions; np.std defaults to ddof=0 (population std).
np.std(y_pred)

6.247488852238703

In [14]:
# Preview the last rows; the index is not reset after the filtering in read_data.
df.tail()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
3403761,2,2023-03-31 23:24:25,2023-03-31 23:40:54,,3.16,,,163,75,0,12.13,0.0,0.5,4.23,0.0,1.0,20.36,,,16.483333
3403762,2,2023-03-31 23:24:50,2023-04-01 00:04:12,,6.89,,,125,198,0,40.92,0.0,0.5,8.98,0.0,1.0,53.9,,,39.366667
3403763,2,2023-03-31 23:26:31,2023-03-31 23:49:39,,4.01,,,50,224,0,24.02,0.0,0.5,0.0,0.0,1.0,28.02,,,23.133333
3403764,2,2023-03-31 23:07:51,2023-03-31 23:15:56,,1.31,,,113,158,0,8.51,0.0,0.5,3.5,0.0,1.0,16.01,,,8.083333
3403765,2,2023-03-31 23:26:12,2023-03-31 23:31:47,,0.88,,,41,166,0,13.51,0.0,0.5,2.25,0.0,1.0,17.26,,,5.583333


In [18]:
# Year and month used as the prefix of every ride identifier.
year = 2023
month = 3
# Build an artificial id "YYYY/MM_<index label>" per row. The index is the one
# left by read_data (filtered, not reset), so labels can have gaps.
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str') 

In [21]:
# Preview the frame after adding ride_id.
# NOTE(review): the stored output already shows a `prediction` column, which is
# only assigned in the following cell (In [20]); the cells were executed out of
# order. Move this preview below that assignment so Restart & Run All matches.
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration,ride_id,prediction
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.0,1.0,N,238,42,2,...,0.5,0.0,0.0,1.0,11.1,0.0,0.0,10.0,2023/03_0,16.245906
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.4,1.0,N,138,231,1,...,0.5,12.54,0.0,1.0,76.49,2.5,1.25,31.083333,2023/03_1,26.134796
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.3,1.0,N,140,186,1,...,0.5,4.65,0.0,1.0,28.05,2.5,0.0,14.366667,2023/03_2,11.884264
3,1,2023-03-01 00:49:37,2023-03-01 01:01:05,1.0,2.9,1.0,N,140,43,1,...,0.5,4.1,0.0,1.0,24.7,2.5,0.0,11.466667,2023/03_3,11.99772
4,2,2023-03-01 00:08:04,2023-03-01 00:11:06,1.0,1.23,1.0,N,79,137,1,...,0.5,2.44,0.0,1.0,14.64,2.5,0.0,3.033333,2023/03_4,10.234486


In [20]:
# Attach the predictions to the trips; y_pred was computed from df[categorical],
# so it lines up with df's rows positionally.
df['prediction'] = y_pred

In [25]:
# Keep only the ride identifier and its predicted value for export.
df_result = df.loc[:, ['ride_id', 'prediction']]

In [26]:
# Preview the two-column export frame.
df_result.head()

Unnamed: 0,ride_id,prediction
0,2023/03_0,16.245906
1,2023/03_1,26.134796
2,2023/03_2,11.884264
3,2023/03_3,11.99772
4,2023/03_4,10.234486


In [30]:
# Destination for the scored rides (columns: ride_id, prediction).
# The variable was previously misspelled `ouput_file`.
output_file = 'df_result.parquet'

# Write with the pyarrow engine, uncompressed, and without the DataFrame index
# (ride_id already identifies each row).
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

In [40]:
# Export this notebook (starter.ipynb) as a plain Python script (starter.py).
!jupyter nbconvert --to script starter.ipynb 

[NbConvertApp] Converting notebook starter.ipynb to script
[NbConvertApp] Writing 1573 bytes to starter.py


In [32]:
# Stray bare `nbconvert` expression removed: it is not a defined Python name and
# raised NameError, which would stop a Restart & Run All. The conversion itself
# is performed by the `!jupyter nbconvert` shell cell.

In [43]:
# Mean predicted value over all scored rides.
df_result['prediction'].mean()

14.203865642696083