### Big data course project
<strong>T7: Forecast Uber waiting time</strong>

Jovana Videnovic & Haris Kupinic

In [None]:
# IPython shell escape: run `hostnamectl` so the cell output shows which host executed the notebook.
!hostnamectl

In [None]:
from xgboost import XGBRegressor
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd
from pathlib import Path
import pandas as pd
import os
import numpy as np
from sklearn.neighbors import BallTree
import pyarrow.dataset as ds
import pyarrow as pa
import pyarrow.compute as pc
from datetime import timedelta
from dask_ml.linear_model import LinearRegression
from dask_ml.ensemble import BlockwiseVotingRegressor
import dask.array as da
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
from xgboost import dask as dxgb
import lightgbm as lgb
from sklearn.linear_model import SGDRegressor

In [None]:
from dask_ml.model_selection import train_test_split
from dask_ml.linear_model import LinearRegression
from dask_ml.preprocessing import Categorizer
from dask_ml.wrappers import Incremental

In [None]:
def bootstrapping(errors, n_iterations=1000):
    """Estimate a 95% bootstrap confidence interval for the mean of ``errors``.

    Each iteration draws ``len(errors)`` values from ``errors`` with
    replacement and records their mean; the interval is bounded by the 2.5th
    and 97.5th percentiles of those means. Sampling uses NumPy's global
    random state (``np.random``), so seed it beforehand for reproducibility.

    Args:
        errors: 1-D collection of per-sample errors (list, NumPy array,
            pandas Series, ...).
        n_iterations: Number of bootstrap resamples.

    Returns:
        Tuple ``(lower, upper)`` with the bounds of the interval.

    Raises:
        ValueError: If ``errors`` is empty.
    """
    # Work on a plain array so indexing is positional: a list cannot be
    # fancy-indexed at all and a pandas Series would be indexed by label.
    values = np.asarray(errors)
    n_size = len(values)
    if n_size == 0:
        raise ValueError("errors must contain at least one value")

    # Draw one resample at a time: peak memory stays proportional to n_size
    # instead of n_iterations * n_size, which matters for large test sets.
    means = np.empty(n_iterations, dtype=float)
    for i in range(n_iterations):
        indices = np.random.randint(0, n_size, n_size)
        means[i] = values[indices].mean()

    lower = np.percentile(means, 2.5)
    upper = np.percentile(means, 97.5)
    return lower, upper

In [None]:
# Start a local Dask cluster with 2 workers x 2 threads each and connect a client to it.
# NOTE(review): memory_limit is presumably applied per worker (see the dask.distributed
# LocalCluster docs), i.e. up to ~64GB in total — confirm the node has that much RAM.
cluster = LocalCluster(n_workers=2, threads_per_worker=2, memory_limit='32GB')
client = Client(cluster)

#### Loading and processing

In [None]:
# Root directory of the partitioned Parquet dataset (the path suggests 2022 FHVHV trip records).
data_path = Path("/d/hpc/projects/FRI/bigdata/students/jv8043/partitioned_data/fhvhv/2022")

In [None]:
# Open the dataset as a Dask DataFrame.
# NOTE(review): assume_missing=True presumably makes Dask treat integer columns
# as floats so they can hold NaN — confirm against the dask read_parquet docs.
ddf = dd.read_parquet(data_path, engine="pyarrow", assume_missing=True)

In [None]:
ddf.columns

In [None]:
# Keep only trips of the operator with license number HV0003
# (presumably Uber, given the project title — TODO confirm against the TLC data dictionary).
ddf = ddf[ddf["hvfhs_license_num"] == "HV0003"]

In [None]:
# Columns needed to derive the target (wait time) and the model features.
important_columns = [
    "request_datetime",
    "on_scene_datetime",
    "pulocationid"
]

In [None]:
# Drop every other column.
ddf = ddf[important_columns]

In [None]:
ddf["request_datetime"] = dd.to_datetime(ddf["request_datetime"])
ddf["on_scene_datetime"] = dd.to_datetime(ddf["on_scene_datetime"])
ddf["pulocationid"] = ddf["pulocationid"].astype("int64")

In [None]:
# drop nan
ddf = ddf.dropna(subset=["request_datetime", "on_scene_datetime", "pulocationid"])

In [None]:
ddf['wait_time'] = (ddf['on_scene_datetime'] - ddf['request_datetime']).dt.total_seconds() / 60
ddf['wait_time'] = ddf['wait_time'].astype('int64')
ddf = ddf[ddf['wait_time'] >= 0]
ddf = ddf[ddf['wait_time'] <= 30]  # Assuming wait time is capped at 30 minutes

In [None]:
del ddf['on_scene_datetime']

In [None]:
ddf

#### Feature engineering

In [None]:
def get_season(dt):
    """Return the season name for a date-like object.

    Only ``dt.month`` is read. Months 12, 1, 2 give ``'winter'``; 3, 4, 5
    give ``'spring'``; 6, 7, 8 give ``'summer'``; any other month value
    gives ``'autumn'``.
    """
    season_by_month = {
        12: 'winter', 1: 'winter', 2: 'winter',
        3: 'spring', 4: 'spring', 5: 'spring',
        6: 'summer', 7: 'summer', 8: 'summer',
    }
    # Everything not listed above falls back to autumn.
    return season_by_month.get(dt.month, 'autumn')

In [None]:

def one_hot_encode(train_df, test_df, categorical_cols):
    """One-hot encode ``categorical_cols`` consistently for train and test.

    The set of categories of every column is learned from ``train_df`` only
    and then imposed on ``test_df``. Both frames therefore produce the same
    dummy columns, and ``drop_first=True`` removes the same baseline category
    in both. Test values that never occur in the training data become missing
    and are encoded as all zeros for that feature.

    Args:
        train_df: Dask DataFrame with the training features.
        test_df: Dask DataFrame with the test features (same columns).
        categorical_cols: List of column names to one-hot encode.

    Returns:
        Tuple ``(train_encoded, test_encoded)`` of Dask DataFrames with
        identical columns in identical order.
    """
    # Learn the categories from the training data.
    train_df = train_df.categorize(columns=categorical_cols)

    # Reuse the training dtypes instead of categorizing the test set on its
    # own: independently discovered categories may differ in content or
    # order, so drop_first could drop a different baseline category in each
    # frame and silently mis-encode test rows.
    train_dtypes = {col: train_df[col].dtype for col in categorical_cols}
    test_df = test_df.astype(train_dtypes)

    # One-hot encode using Dask
    train_encoded = dd.get_dummies(train_df, columns=categorical_cols, drop_first=True)
    test_encoded = dd.get_dummies(test_df, columns=categorical_cols, drop_first=True)

    # Shared dtypes already yield the same columns; select them explicitly so
    # the column order is guaranteed to match the training frame.
    test_encoded = test_encoded[list(train_encoded.columns)]

    return train_encoded, test_encoded

In [None]:
ddf['request_dayofweek'] = ddf['request_datetime'].dt.dayofweek
# Hour and minute are only needed for the cyclical encodings below.
ddf['request_hour'] = ddf['request_datetime'].dt.hour
ddf['request_minute'] = ddf['request_datetime'].dt.minute

# Sin/cos encodings keep the end of a cycle close to its start (e.g. 23h and 0h).
ddf['request_sin_hour'] = np.sin(2 * np.pi * ddf['request_hour'] / 24)
ddf['request_cos_hour'] = np.cos(2 * np.pi * ddf['request_hour'] / 24)

ddf['request_sin_minute'] = np.sin(2 * np.pi * ddf['request_minute'] / 60)
ddf['request_cos_minute'] = np.cos(2 * np.pi * ddf['request_minute'] / 60)

ddf['request_day_of_week_sin'] = np.sin(2 * np.pi * ddf['request_dayofweek'] / 7)
ddf['request_day_of_week_cos'] = np.cos(2 * np.pi * ddf['request_dayofweek'] / 7)

# categorical features
# Season via a vectorised month -> season lookup (same mapping as get_season)
# instead of a row-by-row Python apply; the explicit meta tells Dask the
# output name/dtype so it does not have to guess it.
season_by_month = {
    12: 'winter', 1: 'winter', 2: 'winter',
    3: 'spring', 4: 'spring', 5: 'spring',
    6: 'summer', 7: 'summer', 8: 'summer',
    9: 'autumn', 10: 'autumn', 11: 'autumn',
}
ddf['request_season'] = ddf['request_datetime'].dt.month.map(
    season_by_month, meta=('request_season', 'object')
)
ddf["weekend"] = ddf["request_dayofweek"].isin([5, 6]).astype(int)

# Remove the raw/intermediate columns that are not model features.
del ddf['request_dayofweek']
del ddf['request_datetime']
del ddf['request_hour']
del ddf['request_minute']

#### Modeling

In [None]:
ddf.columns

In [None]:
# Select the feature columns (X) and the target (y); the categorical columns
# are still raw here — one_hot_encode is applied after the train/test split.

features = ['pulocationid', 'request_sin_hour', 'request_cos_hour',
       'request_sin_minute', 'request_cos_minute', 'request_day_of_week_sin',
       'request_day_of_week_cos', 'request_season', 'weekend']
X = ddf[features]
y = ddf['wait_time']  

In [None]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)
# ignore indexs, just have matrices / arrays
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [None]:
# one hot encode categorical features
categorical_cols = ['pulocationid', 'request_season']
X_train, X_test = one_hot_encode(X_train, X_test, categorical_cols)

In [None]:
X_train = X_train.persist()
for col in X_test.columns:
    try:
        X_test[col] = X_test[col].persist()
    except Exception as e:
        print(f"Error persisting column {col}: {e}")
y_train = y_train.persist()
y_test = y_test.persist()

In [None]:
# Use incremental training for large data
model = Incremental(estimator=SGDRegressor())
model.fit(X_train, y_train)

In [None]:
X_train

In [None]:
X_test

In [None]:
# Predict
y_pred = model.predict(X_test).compute()

# Evaluate
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test.compute(), y_pred)
print(f'MSE: {mse}')

In [None]:
# Preview the first rows only; head() avoids materialising the entire test
# set in local memory just to display five rows.
X_test.head()