In [1]:
# Shell escape: print the version of the `python` executable found on PATH.
!python -V

Python 3.12.7


In [11]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [3]:
# Load the training data (the file name points at the January 2023 extract)
# and show how many columns it contains.
df = pd.read_parquet('./data/yellow_tripdata_2023-01.parquet')
df.shape[1]

19

In [7]:
# Trip duration in minutes. Subtracting the two timestamp columns yields a
# timedelta Series, so the vectorized `.dt` accessor converts it in one pass
# instead of calling a Python lambda once per row.
df['duration'] = (
    df.tpep_dropoff_datetime - df.tpep_pickup_datetime
).dt.total_seconds() / 60
df.duration.describe()

count    3.066766e+06
mean     1.566900e+01
std      4.259435e+01
min     -2.920000e+01
25%      7.116667e+00
50%      1.151667e+01
75%      1.830000e+01
max      1.002918e+04
Name: duration, dtype: float64

In [10]:
# Keep only trips whose duration lies in [1, 60] minutes (both ends inclusive)
# and report how much of the data survives.
# NOTE: `df` is overwritten here, so re-running this cell reports the already
# filtered row count as the "before" figure.
num_rows_before = df.shape[0]
df = df[df.duration.between(1, 60)]
num_rows_after = df.shape[0]

print(f"Rows before filtering: {num_rows_before}")
print(f"Rows after filtering: {num_rows_after}")
print(f"Fraction of the records left: {num_rows_after/num_rows_before:.2%}")

Rows before filtering: 3066766
Rows after filtering: 3009173
Fraction of the records left: 98.12%


In [20]:
# Build the training feature matrix from the pickup / drop-off location IDs.
categorical = ['PULocationID', 'DOLocationID']
# Cast the IDs to strings before vectorizing; the resulting matrix has one
# column per distinct ID value (see the shape printed below).
df[categorical] = df[categorical].astype(str)
# One {column: value} dict per row, the input format DictVectorizer consumes.
train_dicts = df[categorical].to_dict(orient='records')
# `dv` is fitted here and reused later to transform the validation data.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_train.shape

(3009173, 515)

In [21]:
# Fit a plain linear regression on the vectorized location features and
# report its RMSE on the very data it was trained on.
target = 'duration'
y_train = df[target].to_numpy()

# `fit` returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_train)

root_mean_squared_error(y_train, y_pred)

7.649262060255514

In [22]:
# Validation set: apply the same preprocessing as the training data, then
# score the already-fitted model on it.
df_val = pd.read_parquet('./data/yellow_tripdata_2023-02.parquet')

# Duration in minutes, computed with the vectorized `.dt` accessor rather than
# a per-row Python lambda.
df_val['duration'] = (
    df_val.tpep_dropoff_datetime - df_val.tpep_pickup_datetime
).dt.total_seconds() / 60

# Same 1-60 minute window (inclusive) that was applied to the training data.
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)]
df_val[categorical] = df_val[categorical].astype(str)

# Reuse the fitted vectorizer (`transform`, not `fit_transform`) so the
# validation matrix has the same columns the model was trained on.
val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val = df_val[target].values

y_pred = lr.predict(X_val)
root_mean_squared_error(y_val, y_pred)

7.811816183354732