In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer

In [2]:
# Load the two monthly trip files (2023-01 and 2023-02) into DataFrames
jan_path = '../../../data/yellow_tripdata_2023-01.parquet'
feb_path = '../../../data/yellow_tripdata_2023-02.parquet'
df_jan, df_feb = pd.read_parquet(jan_path), pd.read_parquet(feb_path)

In [3]:
# Question 1: How many columns are there in the January data?
# The second entry of .shape is the number of columns.
df_jan.shape[1]

19

In [4]:
# Question 2: Standard deviation of the trip duration
# Duration is dropoff minus pickup, expressed in minutes and kept to 3 decimals.
elapsed = df_jan['tpep_dropoff_datetime'] - df_jan['tpep_pickup_datetime']
df_jan['duration'] = elapsed.dt.total_seconds().div(60).round(3)

# .item() turns the NumPy scalar into a plain Python float for display.
duration_spread = np.std(df_jan['duration'])
std_duration = round(duration_spread, 3).item()
std_duration

42.594

In [5]:
# Question 3: Filter the outliers
# Keep only trips whose duration is between 1 and 60 minutes (both bounds inclusive).
df_jan_filtered = df_jan[df_jan['duration'].between(1, 60)]

# Share of rows that survive the filter, as a percentage.
# Scale to percent first and round last: multiplying an already-rounded
# fraction by 100 can reintroduce binary floating-point noise
# (e.g. 0.57 * 100 -> 56.99999999999999).
fraction_after_outliers = round(len(df_jan_filtered) / len(df_jan) * 100, 1)
print(fraction_after_outliers, '%')

98.1 %


In [6]:
# Question 4: Dimensionality of the features matrix (number of columns)
# Cast the pickup / dropoff location IDs to strings so that DictVectorizer
# treats them as categorical values (one column per distinct ID) rather than
# as plain numbers.
df_ids = df_jan_filtered[["PULocationID", "DOLocationID"]].astype(str)

# One dict per trip: {"PULocationID": "...", "DOLocationID": "..."}
records = df_ids.to_dict(orient="records")

# Fit the vectorizer on the January records, then encode those same records.
# (DictVectorizer is already imported in the notebook's import cell.)
dv = DictVectorizer(sparse=True)
X = dv.fit(records).transform(records)

# Dimensionality: rows are trips, columns are the encoded features
n_samples, n_features = X.shape
print("Matrix shape:", X.shape)
print("Number of one-hot columns:", n_features)

Matrix shape: (3009173, 515)
Number of one-hot columns: 515


In [7]:
# Question 5: Model training and RMSE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Target: trip duration in minutes. Read it straight from the filtered frame;
# copying the whole DataFrame just to pull one column is unnecessary.
y = df_jan_filtered["duration"].values

# X was built from the same filtered frame, so rows must line up one-to-one.
assert X.shape[0] == len(y), "feature matrix and target must have the same number of rows"

# Instantiate and fit the model
model = LinearRegression()
model.fit(X, y)

# Predict on the training set
y_pred = model.predict(X)

# Compute RMSE (square root of the mean squared error)
rmse = np.sqrt(mean_squared_error(y, y_pred))
print(f"Training RMSE: {rmse:.3f}")

Training RMSE: 7.649


In [9]:
# Question 6: RMSE on the test set (February data)
# Same preparation as for January: duration in minutes, rounded to 3 decimals.
df_feb['duration'] = round((df_feb['tpep_dropoff_datetime'] - df_feb['tpep_pickup_datetime']).dt.total_seconds() / 60, 3)

# Keep the raw frame intact and bind the filtered rows to a new name, mirroring
# df_jan / df_jan_filtered. This keeps the cell safe to re-run.
df_feb_filtered = df_feb[(df_feb['duration'] >= 1) & (df_feb['duration'] <= 60)]

# Encode with the vectorizer fitted on January (transform only, no refit) so
# the columns match what the model was trained on. Per the scikit-learn docs,
# feature values not seen during fit are silently ignored by transform.
feb_records = df_feb_filtered[["PULocationID", "DOLocationID"]].astype(str).to_dict(orient="records")
X_feb = dv.transform(feb_records)
y_feb = df_feb_filtered["duration"].values
y_pred_feb = model.predict(X_feb)

# Compute RMSE under its own name so the training RMSE is not overwritten,
# and label the output as the test-set metric.
rmse_feb = np.sqrt(mean_squared_error(y_feb, y_pred_feb))
print(f"Test RMSE: {rmse_feb:.3f}")


Training RMSE: 7.812
