In [42]:
import pickle
import pandas as pd

In [43]:
with open('model.bin','rb') as f_in:
    dv, model = pickle.load(f_in)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [44]:
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename):
    df = pd.read_parquet(filename)

    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = df.duration.dt.total_seconds() / 60

    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')

    return df


In [45]:
#
year = 2023
month = '03'

In [46]:
df = read_data(f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month}.parquet')

In [47]:
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

In [50]:
y_pred

array([16.24590642, 26.1347962 , 11.88426424, ..., 11.59533603,
       13.11317847, 12.89999218])

What's the standard deviation of the predicted duration for this dataset?

In [48]:
round(y_pred.std(),2)

6.25

### Q2. Preparing the output

In [49]:
df['ride_id'] = f'{year}/{month}_' + df.index.astype('str')

In [None]:
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration,ride_id
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.00,1.0,N,238,42,2,...,1.0,0.5,0.00,0.0,1.0,11.10,0.0,0.00,10.000000,2023/03_0
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.40,1.0,N,138,231,1,...,6.0,0.5,12.54,0.0,1.0,76.49,2.5,1.25,31.083333,2023/03_1
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.30,1.0,N,140,186,1,...,3.5,0.5,4.65,0.0,1.0,28.05,2.5,0.00,14.366667,2023/03_2
3,1,2023-03-01 00:49:37,2023-03-01 01:01:05,1.0,2.90,1.0,N,140,43,1,...,3.5,0.5,4.10,0.0,1.0,24.70,2.5,0.00,11.466667,2023/03_3
4,2,2023-03-01 00:08:04,2023-03-01 00:11:06,1.0,1.23,1.0,N,79,137,1,...,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.00,3.033333,2023/03_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3403761,2,2023-03-31 23:24:25,2023-03-31 23:40:54,,3.16,,,163,75,0,...,0.0,0.5,4.23,0.0,1.0,20.36,,,16.483333,2023/03_3403761
3403762,2,2023-03-31 23:24:50,2023-04-01 00:04:12,,6.89,,,125,198,0,...,0.0,0.5,8.98,0.0,1.0,53.90,,,39.366667,2023/03_3403762
3403763,2,2023-03-31 23:26:31,2023-03-31 23:49:39,,4.01,,,50,224,0,...,0.0,0.5,0.00,0.0,1.0,28.02,,,23.133333,2023/03_3403763
3403764,2,2023-03-31 23:07:51,2023-03-31 23:15:56,,1.31,,,113,158,0,...,0.0,0.5,3.50,0.0,1.0,16.01,,,8.083333,2023/03_3403764


# Q2

In [52]:
df_results = pd.DataFrame(y_pred, columns=['predicted_duration'],)

df_results['predicted_duration'] = df_results['predicted_duration'].round(2)

print(df_results.head())

   predicted_duration
0               16.25
1               26.13
2               11.88
3               12.00
4               10.23


In [53]:
df_results['ride_id'] = df['ride_id']

In [54]:
df_results

Unnamed: 0,predicted_duration,ride_id
0,16.25,2023/03_0
1,26.13,2023/03_1
2,11.88,2023/03_2
3,12.00,2023/03_3
4,10.23,2023/03_4
...,...,...
3316211,11.95,2023/03_3316211
3316212,20.05,2023/03_3316212
3316213,11.60,2023/03_3316213
3316214,13.11,2023/03_3316214


In [55]:
df_results['predicted_duration'].head()

Unnamed: 0,predicted_duration
0,16.25
1,26.13
2,11.88
3,12.0
4,10.23


In [56]:
output_file = 'results.parquet'
df_results.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

In [57]:
import os

file_size = os.path.getsize('results.parquet')


file_size_kb = file_size / 1024  # Kilobytes
file_size_mb = file_size / (1024 * 1024)  # Megabytes

print(f"File size: {file_size_kb:.2f} KB")
print(f"File size: {file_size_mb:.2f} MB")

File size: 64543.43 KB
File size: 63.03 MB


In [58]:
data = pd.read_parquet('results.parquet')

In [None]:
data

Unnamed: 0,predicted_duration,ride_id
0,16.25,2023/03_0
1,26.13,2023/03_1
2,11.88,2023/03_2
3,12.00,2023/03_3
4,10.23,2023/03_4
...,...,...
3316211,11.95,2023/03_3316211
3316212,20.05,2023/03_3316212
3316213,11.60,2023/03_3316213
3316214,13.11,2023/03_3316214


## Q3. Creating the scoring script

In [None]:
jupyter nbconvert --to script homework_1.ipynb

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr

In [None]:
!jupyter nbconvert --to script homework-1

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr

In [None]:
j