## Importing Datasets

In [None]:
import os
import pandas as pd

# Taxi Data
# https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf
folder_path = "data/"
# os.listdir() returns entries in arbitrary order; sorting keeps the
# concatenated row order (and so any seeded split of the rows) reproducible.
files = sorted(f for f in os.listdir(folder_path) if f.endswith('.parquet'))
if not files:
    # pd.concat([]) would fail with a much less helpful message.
    raise FileNotFoundError(f"No .parquet files found in {folder_path!r}")

# Read every trip file and stack them into one frame with a fresh index.
dataframes = [
    pd.read_parquet(os.path.join(folder_path, f), engine='pyarrow')
    for f in files
]

df = pd.concat(dataframes, ignore_index=True)
# Discard every row that has a missing value in any column.
df = df.dropna()

# Borough Data
# Load the zone lookup table; missing cells in any column become 'Others'.
zones = pd.read_csv('taxi_zone_lookup.csv').fillna('Others')

# One boolean indicator per distinct borough, named is_<lowercase, no spaces>.
zone_columns = [
    (zones["Borough"] == borough).rename('is_' + str(borough).lower().replace(' ', ''))
    for borough in zones['Borough'].unique()
]

# Append the indicators, drop the unused service_zone column, and rename the
# key so it matches the pickup-location column used for the later merge.
zones = (
    pd.concat([zones, *zone_columns], axis=1)
    .drop(columns=['service_zone'])
    .rename(columns={'LocationID': 'PULocationID'})
)

## Data Engineering

In [None]:
# Data Engineering
from sklearn import preprocessing

# Travel Time (whole minutes, floored)
# Floor-dividing by a one-minute Timedelta works for any datetime resolution.
# Casting the difference to int64 and dividing by 60e9 is only correct when the
# timestamps are stored at nanosecond precision, which is not guaranteed for
# parquet-loaded columns.
df['travel_time'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']) // pd.Timedelta(minutes=1)

# DateTime to Numeric: split the pickup timestamp into integer components.
pickup_ts = df['tpep_pickup_datetime']
df['pickup_month'] = pickup_ts.dt.month
df['pickup_day'] = pickup_ts.dt.day
df['pickup_hour'] = pickup_ts.dt.hour
df['pickup_minute'] = pickup_ts.dt.minute
df['pickup_second'] = pickup_ts.dt.second

# Month Factors
# Iterate over the month values actually present. Months are 1-based, so
# counting 0..n-1 would create an always-False month_0 and leave the highest
# month without an indicator column.
for m in sorted(df['pickup_month'].unique()):
    df['month_' + str(m)] = (df['pickup_month'] == m)

# Time Factors
# Same approach for hours, so a dataset with missing hours still gets one
# indicator per observed hour rather than per position.
for h in sorted(df['pickup_hour'].unique()):
    df['hour_' + str(h)] = (df['pickup_hour'] == h)

# Location Factors
# Attach borough/zone information to each trip via its pickup location ID.
df = df.merge(zones, on='PULocationID', how='left')

# Hourly Pickup Rates
# Count trips per (borough, calendar date, hour) and attach that count to every
# trip in the group. It is merged in last, so it becomes the final column.
pickup_keys = ['Borough', 'tpep_pickup_date', 'tpep_pickup_hour']
pickup_ts = df['tpep_pickup_datetime']
df['tpep_pickup_date'] = pickup_ts.dt.date
df['tpep_pickup_hour'] = pickup_ts.dt.hour
hourly_pickup_rates = (
    df.groupby(pickup_keys)
    .size()
    .rename('hourly_pickup_count')
    .reset_index()
)
df = pd.merge(df, hourly_pickup_rates, on=pickup_keys, how='left')

# Drop Non-Numerics
# Remove timestamp, text and helper columns; names that are absent are skipped.
non_numeric_columns = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'store_and_fwd_flag', 'Borough', 'Zone', 'tpep_pickup_date', 'tpep_pickup_hour']
df = df.drop(columns=non_numeric_columns, errors='ignore')

# Normalizing Data
# Convert straight to float64: `.values` on a frame mixing boolean indicator
# columns with numeric ones yields an object-dtype array, which is far larger
# in memory and has to be converted to float by the scaler anyway.
x = df.to_numpy(dtype='float64')
# NOTE(review): the scaler is fitted on all rows (including the target column)
# before any train/test split, so test-set min/max leak into training. Consider
# fitting on the training rows only - TODO confirm whether this is intended.
min_max_scaler = preprocessing.MinMaxScaler()
# From here on `df` is a NumPy array scaled to [0, 1] per column, not a DataFrame.
df = min_max_scaler.fit_transform(x)

## Test-Train Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Every column except the last is a feature; the last column is the target.
train_x, test_x = train_df[:, :-1], test_df[:, :-1]
train_y, test_y = train_df[:, -1], test_df[:, -1]

## Linear Regression

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import time 

# Fit an unpenalized linear model with SGD and report the wall-clock fit time.
start = time.perf_counter()
regr = SGDRegressor(max_iter=500, tol=0.0001, penalty=None, learning_rate='invscaling', eta0=0.005, n_iter_no_change=5)
regr.fit(train_x, train_y)
end = time.perf_counter()
print("Train Time:", end - start)

# Score both splits; each split is predicted once and reused for both metrics.
start = time.perf_counter()
train_pred = regr.predict(train_x)
test_pred = regr.predict(test_x)

# R^2 (train, test)
# scikit-learn metrics take (y_true, y_pred). R^2 is not symmetric, so passing
# the predictions first reports the wrong value.
print("R^2:", r2_score(train_y, train_pred),
     r2_score(test_y, test_pred))

# MSE (train, test)
print("MSE:", mean_squared_error(train_y, train_pred),
     mean_squared_error(test_y, test_pred))
end = time.perf_counter()
print("Test Time:", end - start)

## L2 Regularization

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline

# Fit an L2-penalized (ridge-style) linear model with SGD and report the fit time.
start = time.perf_counter()
regr = SGDRegressor(alpha=0.0001, max_iter=500, tol=0.0001, penalty='l2', learning_rate='invscaling', eta0=0.005, n_iter_no_change=5)
regr.fit(train_x, train_y)
end = time.perf_counter()
print("Train Time:", end - start)

# Score both splits; each split is predicted once and reused for both metrics.
start = time.perf_counter()
train_pred = regr.predict(train_x)
test_pred = regr.predict(test_x)

# R^2 (train, test)
# scikit-learn metrics take (y_true, y_pred). R^2 is not symmetric, so passing
# the predictions first reports the wrong value.
print("R^2:", r2_score(train_y, train_pred),
     r2_score(test_y, test_pred))

# MSE (train, test)
print("MSE:", mean_squared_error(train_y, train_pred),
     mean_squared_error(test_y, test_pred))
end = time.perf_counter()
print("Test Time:", end - start)

## L1 Regularization

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline

# Fit an L1-penalized (lasso-style) linear model with SGD and report the fit time.
start = time.perf_counter()
regr = SGDRegressor(alpha=0.0001, max_iter=500, tol=0.0001, penalty='l1', learning_rate='invscaling', eta0=0.005, n_iter_no_change=5)
regr.fit(train_x, train_y)
end = time.perf_counter()
print("Train Time:", end - start)

# Score both splits; each split is predicted once and reused for both metrics.
start = time.perf_counter()
train_pred = regr.predict(train_x)
test_pred = regr.predict(test_x)

# R^2 (train, test)
# scikit-learn metrics take (y_true, y_pred). R^2 is not symmetric, so passing
# the predictions first reports the wrong value.
print("R^2:", r2_score(train_y, train_pred),
     r2_score(test_y, test_pred))

# MSE (train, test)
print("MSE:", mean_squared_error(train_y, train_pred),
     mean_squared_error(test_y, test_pred))
end = time.perf_counter()
print("Test Time:", end - start)

## ElasticNet (L1 + L2)

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline

# Fit an elastic-net (equal L1/L2 mix) linear model with SGD and report the fit time.
start = time.perf_counter()
regr = SGDRegressor(alpha=0.0001, max_iter=500, tol=0.0001, penalty='elasticnet', l1_ratio=0.5, learning_rate='invscaling', eta0=0.005, n_iter_no_change=5)
regr.fit(train_x, train_y)
end = time.perf_counter()
print("Train Time:", end - start)

# Score both splits; each split is predicted once and reused for both metrics.
start = time.perf_counter()
train_pred = regr.predict(train_x)
test_pred = regr.predict(test_x)

# R^2 (train, test)
# scikit-learn metrics take (y_true, y_pred). R^2 is not symmetric, so passing
# the predictions first reports the wrong value.
print("R^2:", r2_score(train_y, train_pred),
     r2_score(test_y, test_pred))

# MSE (train, test)
print("MSE:", mean_squared_error(train_y, train_pred),
     mean_squared_error(test_y, test_pred))
end = time.perf_counter()
print("Test Time:", end - start)