In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [4]:
df= pd.read_parquet("yellow_tripdata_2022-01.parquet")

In [5]:
print(df.head())

   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         1  2022-01-01 00:35:40   2022-01-01 00:53:29              2.0   
1         1  2022-01-01 00:33:43   2022-01-01 00:42:07              1.0   
2         2  2022-01-01 00:53:21   2022-01-01 01:02:19              1.0   
3         2  2022-01-01 00:25:21   2022-01-01 00:35:23              1.0   
4         2  2022-01-01 00:36:48   2022-01-01 01:14:20              1.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0           3.80         1.0                  N           142           236   
1           2.10         1.0                  N           236            42   
2           0.97         1.0                  N           166           166   
3           1.09         1.0                  N           114            68   
4           4.30         1.0                  N            68           163   

   payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \


In [7]:
df_cleaned = df.dropna()
print(f"Rows before cleaning: {len(df)}")
print(f"Rows after cleaning: {len(df_cleaned)}")

Rows before cleaning: 2463931
Rows after cleaning: 2392428


In [9]:
df["trip_duration"] = (df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]).dt.total_seconds() / 60
print(df[["tpep_pickup_datetime", "tpep_dropoff_datetime", "trip_duration"]].head())

  tpep_pickup_datetime tpep_dropoff_datetime  trip_duration
0  2022-01-01 00:35:40   2022-01-01 00:53:29      17.816667
1  2022-01-01 00:33:43   2022-01-01 00:42:07       8.400000
2  2022-01-01 00:53:21   2022-01-01 01:02:19       8.966667
3  2022-01-01 00:25:21   2022-01-01 00:35:23      10.033333
4  2022-01-01 00:36:48   2022-01-01 01:14:20      37.533333


In [10]:
feature_col = df.columns.tolist()
print(feature_col)

['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'airport_fee', 'trip_duration']


In [12]:
X = df[feature_col]
y = df["total_amount"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape}, {y_train.shape}")
print(f"Test set size: {X_test.shape}, {y_test.shape}")

Training set size: (1971144, 20), (1971144,)
Test set size: (492787, 20), (492787,)


In [13]:
baseline_prediction = y_train.mean()
y_pred_baseline = [baseline_prediction] * len(y_test)
baseline_mae = mean_absolute_error(y_test, y_pred_baseline)
print(f"Baseline MAE for total amount: ${baseline_mae:.2f}")

Baseline MAE for total amount: $9.26


In [14]:
print(df.columns.tolist())

['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'airport_fee', 'trip_duration']


In [15]:
categorical_features = ['VendorID', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type']
continuous_features = ['passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
                       'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'airport_fee', 'trip_duration']


In [17]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), continuous_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

In [18]:
y = df["total_amount"]
X_train, X_test, y_train, y_test = train_test_split(df[categorical_features + continuous_features], y, test_size=0.2, random_state=42)
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print(f"Processed training set shape: {X_train_processed.shape}")
print(f"Processed test set shape: {X_test_processed.shape}")

Processed training set shape: (1971144, 551)
Processed test set shape: (492787, 551)


In [22]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="mean")

In [23]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", imputer),
            ("scaler", StandardScaler())
        ]), continuous_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)


In [24]:
regressor = LinearRegression()
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", regressor)
])

In [25]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE on test set: {mae:.2f}")

MAE on test set: 0.00


In [29]:
numeric_columns = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_columns.corr()
print(correlation_matrix["total_amount"].sort_values(ascending=False))

total_amount             1.000000
fare_amount              0.999875
tolls_amount             0.039112
tip_amount               0.038569
airport_fee              0.034335
trip_duration            0.009962
improvement_surcharge    0.009777
RatecodeID               0.006311
extra                    0.002474
VendorID                 0.001149
passenger_count          0.001123
mta_tax                  0.000960
trip_distance            0.000505
DOLocationID            -0.004904
payment_type            -0.005173
PULocationID            -0.006022
congestion_surcharge    -0.010224
Name: total_amount, dtype: float64


In [30]:
categorical_features = ['VendorID', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type']
continuous_features = ['passenger_count', 'trip_distance', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
                       'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'airport_fee', 'trip_duration']


In [37]:
categorical_imputer = SimpleImputer(strategy="most_frequent")
continuous_imputer = SimpleImputer(strategy="mean")

In [38]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", imputer),
            ("scaler", StandardScaler())
        ]), continuous_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

In [39]:
regressor = LinearRegression()
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", regressor)
])

In [40]:
y = df["total_amount"]
X_train, X_test, y_train, y_test = train_test_split(df[categorical_features + continuous_features], y, test_size=0.2, random_state=42)
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print(f"Processed training set shape: {X_train_processed.shape}")
print(f"Processed test set shape: {X_test_processed.shape}")

Processed training set shape: (1971144, 550)
Processed test set shape: (492787, 550)


In [41]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE on test set: {mae:.2f}")

MAE on test set: 0.00


In [44]:
from sklearn.linear_model import Ridge

In [45]:
pipeline.set_params(regressor=Ridge(alpha=1.0))


In [46]:
pipeline.fit(X_train, y_train)


In [48]:
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE on test set: {mae:.2f}")

MAE on test set: 0.03


In [95]:
X = df[categorical_features + continuous_features]
y = df['total_amount']
categorical_features = ['VendorID', 'RatecodeID', 'store_and_fwd_flag', 'payment_type']
continuous_features = ['passenger_count', 'trip_distance', 'extra', 'mta_tax', 'tip_amount',
                       'tolls_amount', 'improvement_surcharge', 'congestion_surcharge', 'airport_fee', 'trip_duration']


In [85]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [86]:
X_train_sample = X_train.sample(frac=0.1, random_state=42)
y_train_sample = y_train.loc[X_train_sample.index]

In [96]:
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), continuous_features)
])

In [93]:
X_train_sample = X_train.sample(frac=0.1, random_state=42)
y_train_sample = y_train.loc[X_train_sample.index]

In [94]:
pipeline.fit(X_train_sample, y_train_sample)

In [98]:
for col in categorical_features:
    train_unique = set(X_train[col].unique())
    test_unique = set(X_test[col].unique())
    new_categories = test_unique - train_unique

    if new_categories:
        print(f"New categories in {col}: {new_categories}")

New categories in RatecodeID: {nan}


In [99]:
df[categorical_features] = df[categorical_features].fillna("Unknown")

In [100]:
for col in categorical_features:
    df[col] = df[col].astype(str)

In [101]:
df_sample = df.sample(frac=0.1, random_state=42)

In [102]:
y = df_sample["total_amount"]
X_train, X_test, y_train, y_test = train_test_split(df_sample[categorical_features + continuous_features], y, test_size=0.2, random_state=42)

In [103]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [104]:
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE on test set: {mae:.2f}")

MAE on test set: 0.50


In [113]:
param_grid = {
    "regressor__n_estimators": [50, 100],
    "regressor__max_depth": [None, 10],
    "regressor__min_samples_split": [2, 5],
    "regressor__min_samples_leaf": [1, 2]
}

In [122]:
print(param_grid)

{'regressor__n_estimators': [50, 100], 'regressor__max_depth': [None, 10], 'regressor__min_samples_split': [2, 5], 'regressor__min_samples_leaf': [1, 2]}


In [124]:
print(grid_search.best_params_)

{'regressor__max_depth': None, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}


In [119]:
grid_search = GridSearchCV(pipeline, param_grid, cv=2, scoring="neg_mean_absolute_error", verbose=1, n_jobs=-1)

In [120]:
X_train_small, _, y_train_small, _ = train_test_split(X_train, y_train, test_size=0.9, random_state=42)

In [121]:
grid_search.fit(X_train_small, y_train_small)

Fitting 2 folds for each of 16 candidates, totalling 32 fits


In [125]:
best_model = grid_search.best_estimator_

In [128]:
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

Best Hyperparameters: {'regressor__max_depth': None, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}


In [129]:
best_model.fit(X_train, y_train)

In [132]:
y_pred = best_model.predict(X_test)
print(y_pred)

[ 21.95     9.67    18.1207 ... 130.5054  69.599   10.56  ]
