In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# Input parquet files: the 2023-01 file is used to fit the model,
# the 2023-02 file is held out for validation.
DATA_TRAIN = "yellow_tripdata_2023-01.parquet"
DATA_VAL = "yellow_tripdata_2023-02.parquet"

In [3]:
# Q1 – how many columns does the raw training (January) file contain?
df_raw = pd.read_parquet(DATA_TRAIN)
n_columns = len(df_raw.columns)
print("Q1 – number of columns (january):", n_columns)

Q1 – number of columns (january): 19


In [4]:
# Q2 – standard deviation of the trip duration before any filtering.
# The duration (drop-off minus pick-up, in minutes) is stored as a new
# column on df_raw because the following cell filters on it.
elapsed = df_raw['tpep_dropoff_datetime'] - df_raw['tpep_pickup_datetime']
df_raw['duration'] = elapsed.dt.total_seconds() / 60

std_duration = df_raw['duration'].std()
print(f"Q2 – unfiltered std duration: {std_duration:.2f} min")

Q2 – unfiltered std duration: 42.59 min


In [5]:
# Q3 – share of records kept when only trips lasting between 1 and 60
# minutes (both bounds inclusive) are retained.
n_total = len(df_raw)
keep_mask = df_raw['duration'].between(1, 60)
df_raw_filt = df_raw[keep_mask]
fraction = len(df_raw_filt) / n_total
print(f"Q3 – fraction of the records after filtering: {fraction:.3f}")

Q3 – fraction of the records after filtering: 0.981


In [6]:
#  reusable functions to read and prepare parquet
def read_dataframe(path):
    """Read a trip parquet file and return a frame ready for modelling.

    Steps, in order:
      1. Load the parquet file at ``path``.
      2. Add a ``duration`` column: drop-off time minus pick-up time,
         expressed in minutes.
      3. Keep only rows whose duration is between 1 and 60 minutes,
         both bounds inclusive.
      4. Cast ``PULocationID`` and ``DOLocationID`` to ``str``.

    Parameters
    ----------
    path : str or path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        A new frame (not a slice of the loaded one) containing the
        filtered rows, the original index labels, the ``duration`` column
        and the two location-ID columns as strings.
    """
    df = pd.read_parquet(path)

    df['duration'] = (df.tpep_dropoff_datetime -
                      df.tpep_pickup_datetime).dt.total_seconds() / 60

    in_range = (df.duration >= 1) & (df.duration <= 60)

    # Cast via DataFrame.astype on the filtered rows instead of assigning
    # columns into the boolean-mask slice: astype returns a fresh frame, so
    # no SettingWithCopyWarning is raised and the result never aliases the
    # unfiltered data.
    categorical = ['PULocationID', 'DOLocationID']
    df = df[in_range].astype({col: str for col in categorical})

    return df

In [7]:
# Prepare the training and validation frames with the shared loader
# (training file first, then validation file).
df_train, df_val = [read_dataframe(source) for source in (DATA_TRAIN, DATA_VAL)]

In [8]:
# One-hot encode the pick-up / drop-off location IDs.
# The width of the fitted training matrix is the answer to Q4.
categorical = ['PULocationID', 'DOLocationID']
dv = DictVectorizer(sparse=True)

# The vectorizer is fitted on the training records only ...
train_dicts = df_train[categorical].to_dict(orient="records")
X_train = dv.fit_transform(train_dicts)
y_train = df_train['duration'].values

# ... and the validation records are encoded with that same fitted vectorizer.
val_dicts = df_val[categorical].to_dict(orient="records")
X_val = dv.transform(val_dicts)
y_val = df_val['duration'].values

print("Q4 – matrix dimension (nº columns):", X_train.shape[1])

Q4 – matrix dimension (nº columns): 515


In [9]:
# Q5 / Q6 – fit a linear regression on the training matrix and report the
# root-mean-squared error on the training and validation sets.
lr = LinearRegression()
lr.fit(X_train, y_train)

pred_train = lr.predict(X_train)
pred_val = lr.predict(X_val)

rmse_train = sqrt(mean_squared_error(y_train, pred_train))
rmse_val = sqrt(mean_squared_error(y_val, pred_val))

print(f"Q5 – RMSE on train:      {rmse_train}")
print(f"Q6 – RMSE on validation: {rmse_val}")

Q5 – RMSE on train:      7.649261822035489
Q6 – RMSE on validation: 7.811821332387183
