In [4]:
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet

--2025-05-10 00:16:20--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 18.239.38.181, 18.239.38.163, 18.239.38.147, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|18.239.38.181|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47673370 (45M) [application/x-www-form-urlencoded]
Saving to: ‘yellow_tripdata_2023-01.parquet.1’


2025-05-10 00:16:20 (330 MB/s) - ‘yellow_tripdata_2023-01.parquet.1’ saved [47673370/47673370]

--2025-05-10 00:16:20--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 18.239.38.163, 18.239.38.181, 18.239.38.147, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|18.239.38.163|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47748012 (46M) [ap

In [5]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, mean_squared_error

In [6]:
def preprocess_trip_data(data):
  """Normalize column types of a trip DataFrame in place.

  The pickup/dropoff timestamp columns are passed through ``pd.to_datetime``
  and the pickup/dropoff location ID columns are cast to ``str``.

  Args:
    data: DataFrame containing ``tpep_pickup_datetime``,
      ``tpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``.

  Returns:
    The same DataFrame object that was passed in, with the four columns
    converted.
  """
  for timestamp_col in ('tpep_dropoff_datetime', 'tpep_pickup_datetime'):
    data[timestamp_col] = pd.to_datetime(data[timestamp_col])

  # Location IDs are cast to strings so they are treated as labels, not numbers.
  for location_col in ('PULocationID', 'DOLocationID'):
    data[location_col] = data[location_col].astype(str)

  return data

In [7]:
def add_trip_duration(data):
  """Add a ``Duration`` column holding each trip's length in minutes.

  The value is ``tpep_dropoff_datetime - tpep_pickup_datetime`` expressed as a
  float number of minutes. The column is written onto ``data`` itself.

  Args:
    data: DataFrame whose ``tpep_pickup_datetime`` and
      ``tpep_dropoff_datetime`` columns are already datetime-typed.

  Returns:
    The same DataFrame object, now with the ``Duration`` column.
  """
  elapsed = data['tpep_dropoff_datetime'] - data['tpep_pickup_datetime']
  # Vectorised accessor instead of a per-row Python lambda: same values, but
  # no Python-level call for every one of the (millions of) rows.
  data["Duration"] = elapsed.dt.total_seconds() / 60

  return data

In [8]:
def filter(data, min_minutes=1, max_minutes=60):
  """Keep only trips whose ``Duration`` lies in ``[min_minutes, max_minutes]``.

  Both bounds are inclusive. Rows whose ``Duration`` is missing (NaN) are
  dropped, since they satisfy neither bound.

  Note: the name shadows Python's built-in ``filter``; it is kept so existing
  calls keep working.

  Args:
    data: DataFrame with a numeric ``Duration`` column (minutes).
    min_minutes: Smallest duration to keep. Defaults to 1.
    max_minutes: Largest duration to keep. Defaults to 60.

  Returns:
    A new DataFrame containing only the rows within the bounds; ``data`` is
    not modified.
  """
  within_bounds = data["Duration"].between(min_minutes, max_minutes)
  return data[within_bounds]

In [10]:
# Read the files from the notebook's working directory (where the wget cell
# saves them) instead of a machine-specific absolute path.
data_train = pd.read_parquet('yellow_tripdata_2023-01.parquet')
data_val = pd.read_parquet('yellow_tripdata_2023-02.parquet')

In [11]:
# Normalize column types for both splits (training first, then validation).
data_train, data_val = map(preprocess_trip_data, (data_train, data_val))

In [12]:
# (rows, columns) of the training and validation frames.
tuple(split.shape for split in (data_train, data_val))

((3066766, 19), (2913955, 19))

In [13]:
# Attach the Duration column (minutes) to both splits.
data_train, data_val = map(add_trip_duration, (data_train, data_val))

In [14]:
# Sample standard deviation (ddof=1, the pandas default) of trip duration in
# the training split, computed before any duration filtering.
data_train['Duration'].std(ddof=1)

42.594351241920904

In [15]:
# Drop trips outside the accepted duration range from both splits.
# NOTE: `filter` here is the notebook's own function, not the Python builtin.
data_train, data_val = (filter(split) for split in (data_train, data_val))

In [16]:
# (rows, columns) of both splits after the duration filter.
tuple(split.shape for split in (data_train, data_val))

((3009173, 20), (2855951, 20))

In [18]:
# Preview the first five training rows (head() defaults to n=5).
data_train.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,Duration
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0,8.433333
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0,6.316667
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0,12.75
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25,9.616667
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0,10.833333


In [17]:
# Turn the pickup/dropoff location IDs into a feature matrix. The vectorizer
# is fitted on the training split only and reused as-is for validation.
categorical = ['PULocationID', 'DOLocationID']
dv = DictVectorizer()

# One dict per trip, keyed by column name.
train_dict = data_train[categorical].to_dict(orient="records")
val_dict = data_val[categorical].to_dict(orient="records")

X_train = dv.fit_transform(train_dict)
print(len(dv.feature_names_))  # number of columns in the feature matrix
X_val = dv.transform(val_dict)

# Regression target: trip duration in minutes.
y_train = data_train['Duration'].values
y_val = data_val['Duration'].values

515


In [19]:
# Fit a linear regression of trip duration on the location features.
reg = LinearRegression()
reg = reg.fit(X_train, y_train)  # fit() returns the estimator itself

In [26]:
# Predicted durations (minutes) for the training split.
train_pred = reg.predict(X=X_train)
train_pred

array([11.52724167, 10.89779158, 11.32554809, ..., 11.73765947,
       12.70523459, 11.54227941])

In [20]:
# Predicted durations (minutes) for the validation split.
y_pred = reg.predict(X=X_val)

In [27]:
# Root-mean-squared error on the training split, in minutes.
rmse_train = root_mean_squared_error(y_true=y_train, y_pred=train_pred)
rmse_train

7.649261959465432

In [22]:
# Root-mean-squared error on the validation split, in minutes.
rmse_val = root_mean_squared_error(y_true=y_val, y_pred=y_pred)
rmse_val

7.811818882650574