In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Upload the file
from google.colab import files
uploaded = files.upload()

Saving yellow_tripdata_2022-01.parquet to yellow_tripdata_2022-01.parquet


In [None]:
# Load the dataset into a pandas DataFrame (from https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
df = pd.read_parquet('yellow_tripdata_2022-01.parquet')

In [None]:
# Display the first few rows of the dataset
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


In [None]:
# Display the shape of the dataset
df.shape

(2463931, 19)

In [None]:
# Check for missing values
print(df.isnull().sum())

VendorID                     0
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
passenger_count          71503
trip_distance                0
RatecodeID               71503
store_and_fwd_flag       71503
PULocationID                 0
DOLocationID                 0
payment_type                 0
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge     71503
airport_fee              71503
dtype: int64


In [None]:
# Drop rows with missing values.
df = df.dropna()

In [None]:
# Check for missing values
print(df.isnull().sum())

VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
congestion_surcharge     0
airport_fee              0
dtype: int64


In [None]:
# Display the shape of the dataset
df.shape

(2392428, 19)

In [None]:
# Create new feature, 'trip_duration'.
df.loc[:, 'trip_duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds().astype(int)

In [None]:
# Display the shape of the dataset
df.shape

(2392428, 20)

In [None]:
# Preview the result tpep_dropoff_datetime tpep_pickup_datetime  tpep_trip_duration_in_minutes
print(df[['tpep_dropoff_datetime', 'tpep_pickup_datetime', 'trip_duration']].head())

  tpep_dropoff_datetime tpep_pickup_datetime  trip_duration
0   2022-01-01 00:53:29  2022-01-01 00:35:40           1069
1   2022-01-01 00:42:07  2022-01-01 00:33:43            504
2   2022-01-01 01:02:19  2022-01-01 00:53:21            538
3   2022-01-01 00:35:23  2022-01-01 00:25:21            602
4   2022-01-01 01:14:20  2022-01-01 00:36:48           2252


In [None]:
# Drop the tpep_pickup_datetime and tpep_dropoff_datetime columns since they are Timestamps datatype
df = df.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis=1)

In [None]:
# creating new binary columns out of store_and_fwd_flag column
df['store_and_fwd_flag'] = df['store_and_fwd_flag'].map({'N': 0, 'Y': 1}).fillna(0).astype(int)
df = pd.get_dummies(df, columns=['store_and_fwd_flag']).astype(int)

In [None]:
df.head()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,trip_duration,store_and_fwd_flag_0,store_and_fwd_flag_1
0,1,2,3,1,142,236,1,14,3,0,3,0,0,21,2,0,1069,1,0
1,1,1,2,1,236,42,1,8,0,0,4,0,0,13,0,0,504,1,0
2,2,1,0,1,166,166,1,7,0,0,1,0,0,10,0,0,538,1,0
3,2,1,1,1,114,68,2,8,0,0,0,0,0,11,2,0,602,1,0
4,2,1,4,1,68,163,1,23,0,0,3,0,0,30,2,0,2252,1,0


In [None]:
# Split dataset into training and test sets
X = df.drop('fare_amount', axis=1)
y = df['fare_amount']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
df.shape

(2392428, 19)

In [None]:
# Create a baseline for mean absolute error of total amount
y_true = df['fare_amount']               # These are the actual total amounts
baseline_prediction = [y_true.mean()] * len(y_true)   # Predict the average every time
baseline_mae = mean_absolute_error(y_true, baseline_prediction)   # Calculate error
print(f'Baseline MAE: {baseline_mae:.2f}')

Baseline MAE: 7.71


In [None]:
# inspect data types
print(df.dtypes)

VendorID                 int64
passenger_count          int64
trip_distance            int64
RatecodeID               int64
PULocationID             int64
DOLocationID             int64
payment_type             int64
fare_amount              int64
extra                    int64
mta_tax                  int64
tip_amount               int64
tolls_amount             int64
improvement_surcharge    int64
total_amount             int64
congestion_surcharge     int64
airport_fee              int64
trip_duration            int64
store_and_fwd_flag_0     int64
store_and_fwd_flag_1     int64
dtype: object


In [None]:
# Create a list called feature_col to store column names
feature_col = df.drop('fare_amount', axis=1).columns.tolist() ## exclude the target column
print(feature_col)

['VendorID', 'passenger_count', 'trip_distance', 'RatecodeID', 'PULocationID', 'DOLocationID', 'payment_type', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'airport_fee', 'trip_duration', 'store_and_fwd_flag_0', 'store_and_fwd_flag_1']


In [None]:
# filter the data types into categorical and continuous features

# Identify categorical columns
categorical_cols = ['store_and_fwd_flag_0', 'store_and_fwd_flag_1']
print(categorical_cols)

numerical_cols = [col for col in feature_col if col not in ['store_and_fwd_flag_0', 'store_and_fwd_flag_1']]
print(numerical_cols)

['store_and_fwd_flag_0', 'store_and_fwd_flag_1']
['VendorID', 'passenger_count', 'trip_distance', 'RatecodeID', 'PULocationID', 'DOLocationID', 'payment_type', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'airport_fee', 'trip_duration']


In [None]:
df.head()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,trip_duration,store_and_fwd_flag_0,store_and_fwd_flag_1
0,1,2,3,1,142,236,1,14,3,0,3,0,0,21,2,0,1069,1,0
1,1,1,2,1,236,42,1,8,0,0,4,0,0,13,0,0,504,1,0
2,2,1,0,1,166,166,1,7,0,0,1,0,0,10,0,0,538,1,0
3,2,1,1,1,114,68,2,8,0,0,0,0,0,11,2,0,602,1,0
4,2,1,4,1,68,163,1,23,0,0,3,0,0,30,2,0,2252,1,0


In [None]:
# Use Scikit-Learn's ColumnTransformer to preprocess the categorical and
# continuous features independently.

# Preprocessing pipelines
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

# Combine into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [None]:
df.shape

(2392428, 19)

In [None]:
# Create a pipeline object containing the column transformations and regression
# model.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [None]:
# Fit the pipeline on the training data.
pipeline.fit(X_train, y_train)

In [None]:
# Make predictions on the test data
y_pred = pipeline.predict(X_test)

In [None]:
# Evaluate the model using mean absolute error as a metric on the test data.
# Does the model beat the baseline?
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae:.2f}')

Mean Absolute Error: 0.26


In [None]:
# Build random forest regressor model
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1, n_estimators=200)

In [None]:
# Fit the pipeline on the training data.
rf_model.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Make predictions on the test data
y_pred = rf_model.predict(X_test)

In [None]:
# Evaluate the model using mean absolute error as a metric on the test data.
# Does the model beat the baseline?
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae:.2f}')

Mean Absolute Error: 0.26


In [None]:
# Define the hyperparameters to tune.
param_grid = {
    'n_estimators': [100, 200, 300, 500],            # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],            # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],            # Minimum samples required to split an internal node
}


In [None]:
# Perform grid search to find the best hyperparameters. This could take a while.
# Step 3: Create the GridSearchCV object
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',  # You can change this to 'r2', 'neg_mean_squared_error', etc.
    n_jobs=-1,  # Use all processors
    return_train_score=True,
    refit=True
)

In [73]:
# Fit the best classifier on the training data.
grid_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Get the best model and its parameters.
best_model = grid_search.estimator.estimator_
print("Best Model:", best_model)
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Model: DecisionTreeRegressor()


AttributeError: 'dict' object has no attribute 'best_params_'

In [None]:
# Make predictions on the test data
y_pred = best_model.predict(X_test)

In [None]:
# Evaluate the model from the previous step using the test data. How does your model perform?
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae:.2f}')