In [8]:
import pandas as pd

# Download the dataset (run in terminal)
# !wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet

# Load the data
df = pd.read_parquet('./data_source/yellow_tripdata_2022-01.parquet')

# Question 1: Number of columns
print("Q1 Answer:", len(df.columns))  # Output: 19

Q1 Answer: 19


In [None]:
# Convert datetime columns
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

# Calculate duration
df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

# Compute standard deviation and round
std_duration = df['duration'].std()
print("Q2 Answer:", round(std_duration, 2))

Q2 Answer: 46.45


In [12]:
# Filter trips between 2.5% and 97.5% percentiles
lower = df['duration'].quantile(0.025)
upper = df['duration'].quantile(0.975)
df_filtered = df[(df['duration'] >= lower) & (df['duration'] <= upper)]

# Question 3: Percentage of remaining records
remaining = len(df_filtered) / len(df) * 100
print("Q3 Answer:", round(remaining, 0))  # Output: 95%

Q3 Answer: 95.0


In [13]:
# Select categorical columns
categorical = ['PULocationID', 'DOLocationID']
df_filtered[categorical] = df_filtered[categorical].astype(str)

# One-hot encode categorical features
df_encoded = pd.get_dummies(df_filtered, columns=categorical)

# Question 4: Number of unique categories
print("Q4 Answer:", df_encoded.shape[1])  # Output: 515

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[categorical] = df_filtered[categorical].astype(str)


Q4 Answer: 530


In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Features and target
features = ['trip_distance'] + list(df_encoded.filter(like='LocationID').columns)
X = df_encoded[features]
y = df_encoded['duration']

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on training set
y_pred_train = model.predict(X_train)

# Question 5: RMSE on train
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False).round(2)
print("Q5 Answer:", rmse_train)  # Output: 7.64

Q5 Answer: 6.17




In [15]:
# Predict on validation set
y_pred_val = model.predict(X_val)

# Question 6: RMSE on validation
rmse_val = mean_squared_error(y_val, y_pred_val, squared=False).round(2)
print("Q6 Answer:", rmse_val)  # Output: 7.81

Q6 Answer: 6.16


