In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error

In [2]:
# Read the training data (2023-01) and the validation data (2023-02).
df_train = pd.read_parquet('./data/yellow_tripdata_2023-01.parquet')
df_val = pd.read_parquet('./data/yellow_tripdata_2023-02.parquet')
# Reported before the 'duration' column is added, so it counts raw columns only.
print('Q1 Number of columns in train:', df_train.shape[1])

# Compute the trip duration in minutes; this is the label for the model below.
# The vectorized `.dt` accessor replaces a per-row Python lambda.
df_train['duration'] = (
    df_train.tpep_dropoff_datetime - df_train.tpep_pickup_datetime
).dt.total_seconds() / 60

print('Q2 Std of trip duration:', df_train.duration.std())

# Trips shorter than 1 minute or longer than 60 minutes are treated as outliers.
# Build the mask once and reuse it for both the Q3 ratio and the filter.
within_range = (df_train.duration >= 1) & (df_train.duration <= 60)
print('Q3 Remained records after dropping outliers:', int(within_range.sum()) / df_train.shape[0])
# Filter the data to remove outliers. `.copy()` makes the result an independent
# frame, so the column assignment below cannot raise SettingWithCopyWarning.
df_train = df_train[within_range].copy()
# Take pickup and dropoff locations as training features.
categorical = ['PULocationID', 'DOLocationID']
# Cast the IDs to str so DictVectorizer one-hot encodes them instead of
# treating them as numeric values.
df_train[categorical] = df_train[categorical].astype(str)
# Generate the one-hot encoding for the categorical features.
train_dicts = df_train[categorical].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

print('Q4 Dimension of one-hot encoding:', X_train.shape[1])
# Train the model and calculate the training loss.
target = 'duration'
y_train = df_train[target].values
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)
loss_train = root_mean_squared_error(y_train, y_pred)
print('Q5 Training RMSE:', loss_train)

Q1 Number of columns in train: 19
Q2 Std of trip duration: 42.59435124195458
Q3 Remained records after dropping outliers: 0.9812202822125979
Q4 Dimension of one-hot encoding: 515
Q5 Training RMSE: 7.6492622258678935


In [4]:
# Compute the validation loss.
# The validation frame goes through the same steps as the training frame:
# duration label in minutes, 1-60 minute outlier filter, string-typed IDs.
# The vectorized `.dt` accessor replaces a per-row Python lambda.
df_val['duration'] = (
    df_val.tpep_dropoff_datetime - df_val.tpep_pickup_datetime
).dt.total_seconds() / 60
# `.copy()` makes the filtered result an independent frame, so the column
# assignment below cannot raise SettingWithCopyWarning.
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)].copy()
df_val[categorical] = df_val[categorical].astype(str)
val_dicts = df_val[categorical].to_dict(orient='records')
# Use transform (not fit_transform): the vectorizer fitted on the training data
# defines the feature columns the trained model expects.
X_val = dv.transform(val_dicts)

y_val = df_val[target].values
y_val_pred = lr.predict(X_val)
loss_val = root_mean_squared_error(y_val, y_val_pred)
print('Q6 Validation RMSE:', loss_val)

Q6 Validation RMSE: 7.811813648526847
