In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer

In [2]:
def load_dataset(filename):
    # Read the parquet file
    fhv_jan_data = pd.read_parquet(filename)
    print(f"Q1: There are {len(fhv_jan_data):_} records in the {filename.split('/')[-1]} dataset.")

    # Compute the duration of each ride in minutes
    fhv_jan_data["duration_minutes"] = (fhv_jan_data["dropOff_datetime"] - fhv_jan_data["pickup_datetime"]).dt.total_seconds() / 60
    fhv_jan_data["duration_minutes"] = fhv_jan_data["duration_minutes"].round(decimals=2)
    print(f"Q2: The average trip duration in minutes is {fhv_jan_data['duration_minutes'].mean():.2f}")

    # Filter out outliers by keeping trips between 1 and 60 minutes
    min_duration = 1
    max_duration = 60
    filtered_fhv_jan_data = fhv_jan_data.query("@min_duration <= duration_minutes and duration_minutes <= @max_duration").copy()
    num_dropped_records = len(fhv_jan_data) - len(filtered_fhv_jan_data)
    print(f"Q3: Only keeping trips with duration between {min_duration} and {max_duration} minutes: {num_dropped_records:_} records dropped")

    # Filling missing values 
    filtered_fhv_jan_data["PUlocationID"] = filtered_fhv_jan_data["PUlocationID"].fillna(-1)
    filtered_fhv_jan_data["DOlocationID"] = filtered_fhv_jan_data["DOlocationID"].fillna(-1)
    pct_na_po_location_ids = len(filtered_fhv_jan_data.query("PUlocationID < 0 ")) / len(filtered_fhv_jan_data)
    pct_na_do_location_ids = len(filtered_fhv_jan_data.query("DOlocationID < 0 ")) / len(filtered_fhv_jan_data.query("DOlocationID >= 0 "))
    print(f"Q3: {pct_na_po_location_ids * 100:.2f}% of records have missing pickup location ids.")
    print(f"Q3: {pct_na_do_location_ids * 100:.2f}% of records have missing dropoff location ids.")

    # Create train x and y
    print("Vectorizing features...") 
    categorical = ["PUlocationID", "DOlocationID"]
    train_dicts = filtered_fhv_jan_data[categorical].to_dict(orient="records")
    dv = DictVectorizer()
    X_train = dv.fit_transform(train_dicts)
    print(f"Q4: X_train has {X_train.shape[1]} columns")

    target = ["duration_minutes"]
    y_train = filtered_fhv_jan_data[target].values
    
    return X_train, y_train

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load
train_filename = "./data/fhv_tripdata_2021-01.parquet"
X_train, y_train = load_dataset(train_filename)

# Train
lr = LinearRegression()
lr.fit(X_train, y_train)

# Evaluate
y_pred = lr.predict(X_train)
rmse_train = mean_squared_error(y_train, y_pred, squared=False)

print(f"Q5: RMSE on training data: {rmse_train:.2f}")

Q1: There are 1_154_112 records in the fhv_tripdata_2021-01.parquet dataset.
Q2: The average trip duration in minutes is 19.17
Q3: Only keeping trips with duration between 1 and 60 minutes: 44_286 records dropped
Q3: 83.53% of records have missing pickup location ids.
Q3: 15.38% of records have missing dropoff location ids.
Vectorizing features...
Q4: X_train has 2 columns
Q5: RMSE on training data: 11.42


In [4]:
# Load
val_filename = "./data/fhv_tripdata_2021-02.parquet"
X_val, y_val = load_dataset(val_filename)

# Evaluate
y_pred = lr.predict(X_val)
rmse_val = mean_squared_error(y_val, y_pred, squared=False)

print(f"Q5: RMSE on training data: {rmse_val:.2f}")

Q1: There are 1_037_692 records in the fhv_tripdata_2021-02.parquet dataset.
Q2: The average trip duration in minutes is 20.71
Q3: Only keeping trips with duration between 1 and 60 minutes: 47_579 records dropped
Q3: 85.71% of records have missing pickup location ids.
Q3: 15.75% of records have missing dropoff location ids.
Vectorizing features...
Q4: X_train has 2 columns
Q5: RMSE on training data: 11.86
