In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import root_mean_squared_error

In [3]:
# Load the January 2023 yellow-taxi trips and prepare them as the training set.
df = pd.read_parquet('./data/yellow_tripdata_2023-01.parquet')

print(f"Taxi dataset has {len(df.columns)} columns")

# Trip duration in minutes. The vectorised `.dt` accessor replaces a row-wise
# Python lambda, which is needlessly slow on a frame with millions of rows.
df['duration'] = (
    df.tpep_dropoff_datetime - df.tpep_pickup_datetime
).dt.total_seconds() / 60

len1 = len(df)  # row count before the duration filter
print(f"Duration std: {df.duration.std():.2f} minutes")

# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
# `.copy()` makes the filtered result an independent frame, so the column
# assignment at the end of this cell does not write into a slice of the
# unfiltered frame (avoids pandas' SettingWithCopyWarning).
df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

len2 = len(df)  # row count after the duration filter

print(f"Removed {len1 - len2} rows with duration outside [1, 60]")
print(f"Dropped {1 - len2 / len1:.2%} of rows. {len2/len1:.2%} left")

categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']  # defined here; not used in the cells shown

# Cast the location IDs to strings so they are treated as categories
# (one-hot encoded) rather than as numeric values when vectorised.
df[categorical] = df[categorical].astype(str)

Taxi dataset has 19 columns
Duration std: 42.59 minutes
Removed 57593 rows with duration outside [1, 60]
Dropped 1.88% of rows. 98.12% left


In [4]:
# Load the February 2023 yellow-taxi trips and prepare them as the validation set.
df2 = pd.read_parquet('./data/yellow_tripdata_2023-02.parquet')

# Trip duration in minutes. The vectorised `.dt` accessor replaces a row-wise
# Python lambda, which is needlessly slow on a frame with millions of rows.
df2['duration'] = (
    df2.tpep_dropoff_datetime - df2.tpep_pickup_datetime
).dt.total_seconds() / 60

len1 = len(df2)  # row count before the duration filter
print(f"Duration std: {df2.duration.std():.2f} minutes")

# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
# `.copy()` makes the filtered result an independent frame, so the column
# assignment at the end of this cell does not write into a slice of the
# unfiltered frame (avoids pandas' SettingWithCopyWarning).
df2 = df2[(df2.duration >= 1) & (df2.duration <= 60)].copy()

len2 = len(df2)  # row count after the duration filter

print(f"Removed {len1 - len2} rows with duration outside [1, 60]")
print(f"Dropped {1 - len2 / len1:.2%} of rows. {len2/len1:.2%} left")

categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']  # defined here; not used in the cells shown

# Cast the location IDs to strings so they are treated as categories
# (one-hot encoded) rather than as numeric values when vectorised.
df2[categorical] = df2[categorical].astype(str)

Duration std: 42.84 minutes
Removed 58004 rows with duration outside [1, 60]
Dropped 1.99% of rows. 98.01% left


In [5]:
# Eyeball the February frame after the duration filter and string cast.
# The index keeps its original labels (the filter did not reset it), so gaps
# in the displayed index correspond to rows that were dropped.
df2

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
0,1,2023-02-01 00:32:53,2023-02-01 00:34:34,2.0,0.30,1.0,N,142,163,2,4.40,3.50,0.5,0.00,0.0,1.0,9.40,2.5,0.00,1.683333
3,1,2023-02-01 00:29:33,2023-02-01 01:01:38,0.0,18.80,1.0,N,132,26,1,70.90,2.25,0.5,0.00,0.0,1.0,74.65,0.0,1.25,32.083333
4,2,2023-02-01 00:12:28,2023-02-01 00:25:46,1.0,3.22,1.0,N,161,145,1,17.00,1.00,0.5,3.30,0.0,1.0,25.30,2.5,0.00,13.300000
5,1,2023-02-01 00:52:40,2023-02-01 01:07:18,1.0,5.10,1.0,N,148,236,1,21.90,3.50,0.5,5.35,0.0,1.0,32.25,2.5,0.00,14.633333
6,1,2023-02-01 00:12:39,2023-02-01 00:40:36,1.0,8.90,1.0,N,137,244,1,41.50,3.50,0.5,3.50,0.0,1.0,50.00,2.5,0.00,27.950000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2913950,2,2023-02-28 23:46:00,2023-03-01 00:05:00,,4.65,,,249,140,0,20.22,0.00,0.5,4.84,0.0,1.0,29.06,,,19.000000
2913951,2,2023-02-28 23:26:02,2023-02-28 23:37:10,,2.47,,,186,79,0,13.66,0.00,0.5,2.65,0.0,1.0,20.31,,,11.133333
2913952,2,2023-02-28 23:24:00,2023-02-28 23:38:00,,3.49,,,158,143,0,17.64,0.00,0.5,0.00,0.0,1.0,21.64,,,14.000000
2913953,2,2023-02-28 23:03:00,2023-02-28 23:10:00,,2.13,,,79,162,0,13.56,0.00,0.5,2.63,0.0,1.0,20.19,,,7.000000


In [6]:
# Baseline model: predict trip duration from one-hot encoded pickup/drop-off
# location IDs, fitting on `df` and validating on `df2`.
target = 'duration'

# Training matrix: the vectoriser learns its feature vocabulary from `df`.
dv = DictVectorizer()
train_dicts = df[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
y_train = df[target].values

# Validation matrix: reuse the fitted vocabulary (transform only, no refit).
val_dicts = df2[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val = df2[target].values

print(X_train.shape)

# `fit` returns the fitted estimator, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)

# Score on the training data and on the held-out data.
y_pred = lr.predict(X_train)
train_error = root_mean_squared_error(y_train, y_pred)

y_pred2 = lr.predict(X_val)
val_error = root_mean_squared_error(y_val, y_pred2)

print(f"Train RMSE: {train_error:.2f} minutes")
print(f"Val RMSE: {val_error:.2f} minutes")

(3009173, 515)
Train RMSE: 7.65 minutes
Val RMSE: 7.81 minutes
