In [16]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np



In [5]:

df = pd.read_parquet('data/yellow_tripdata_2023-01.parquet')
print(f"Number of columns: {len(df.columns)}")


Number of columns: 19


In [11]:
df = pd.read_parquet('data/yellow_tripdata_2023-01.parquet')
df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60

# Calculate standard deviation
std_duration = df['duration'].std()
print(f"Standard deviation of trip durations: {std_duration:.2f}")




Standard deviation of trip durations: 42.59


In [12]:
# Total records before filtering
total_records = len(df)

# Filter durations between 1 and 60 minutes inclusive
df_filtered = df[(df['duration'] >= 1) & (df['duration'] <= 60)]

# Records left after filtering
filtered_records = len(df_filtered)

# Fraction remaining
fraction = filtered_records / total_records

print(f"Fraction of records left: {fraction:.2%}")

Fraction of records left: 98.12%


In [15]:
# Use only PULocationID and DOLocationID features, cast to str
df['PULocationID'] = df['PULocationID'].astype(str)
df['DOLocationID'] = df['DOLocationID'].astype(str)

# Create list of dicts for DictVectorizer
dicts = df[['PULocationID', 'DOLocationID']].to_dict(orient='records')

# Fit DictVectorizer
dv = DictVectorizer()
X = dv.fit_transform(dicts)

# Number of columns
print(f"Number of features: {X.shape[1]}")

Number of features: 518


In [20]:
# Load data
df = pd.read_parquet('data/yellow_tripdata_2023-01.parquet')

# Compute duration
df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60

# Filter outliers (1 to 60 minutes)
df = df[(df['duration'] >= 1) & (df['duration'] <= 60)]

# Categorical features
df['PULocationID'] = df['PULocationID'].astype(str)
df['DOLocationID'] = df['DOLocationID'].astype(str)

# Create feature matrix
dicts = df[['PULocationID', 'DOLocationID']].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(dicts)
y_train = df['duration'].values

# Train Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict on training data
y_pred = lr.predict(X_train)
# RMSE without using 'squared=False'
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)

print(f'RMSE on training data: {rmse:.2f}')

RMSE on training data: 7.65


In [21]:
df_val = pd.read_parquet('data/yellow_tripdata_2023-02.parquet')

# Compute duration
df_val['duration'] = (df_val.tpep_dropoff_datetime - df_val.tpep_pickup_datetime).dt.total_seconds() / 60

# Filter durations
df_val = df_val[(df_val['duration'] >= 1) & (df_val['duration'] <= 60)]

# Cast categorical columns to string
df_val['PULocationID'] = df_val['PULocationID'].astype(str)
df_val['DOLocationID'] = df_val['DOLocationID'].astype(str)

val_dicts = df_val[['PULocationID', 'DOLocationID']].to_dict(orient='records')
X_val = dv.transform(val_dicts)  # Reuse the DictVectorizer from training
y_val = df_val['duration'].values

y_pred = lr.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f'RMSE on validation data: {rmse:.2f}')

RMSE on validation data: 7.81
