### Big data course project
<strong>T7: Forecasting demand -- daily aggregation of the data</strong>

Jovana Videnovic & Haris Kupinic

In [None]:
# IPython shell escape: run `hostnamectl` to show which host (and OS) this notebook is running on.
!hostnamectl

In [None]:
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd
from pathlib import Path
import pandas as pd
import os
import numpy as np
from sklearn.neighbors import BallTree
import pyarrow.dataset as ds
import pyarrow as pa
import pyarrow.compute as pc
from datetime import timedelta
from dask_ml.linear_model import LinearRegression
from dask_ml.ensemble import BlockwiseVotingRegressor
import dask.array as da
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [None]:
# Start a local Dask cluster and attach a client to it so that subsequent
# Dask computations are scheduled on these workers.
cluster = LocalCluster(
    n_workers=2,
    threads_per_worker=2,
    memory_limit="64GB",  # limit applies to each worker, not to the cluster as a whole
)
client = Client(cluster)

In [None]:
# Root directory of the partitioned trip data; one sub-directory per taxi service is read from it.
part_data_path = Path("/d/hpc/projects/FRI/bigdata/students/jv8043") / "partitioned_data"

In [None]:
# Load every service's parquet dataset lazily, keeping only the pickup timestamp,
# which is the sole column needed for counting rides per day.
y_ddf, g_ddf, fhv_ddf, fhvhv_ddf = (
    dd.read_parquet(
        part_data_path / service,
        engine="pyarrow",
        assume_missing=True,
    )[["pickup_datetime"]]
    for service in ("yellow", "green", "fhv", "fhvhv")
)

In [None]:
# Parse the pickup column of each service into datetime values.
# Column assignment updates each Dask DataFrame object in place, so the four
# names keep pointing at the converted frames.
for ddf in (y_ddf, g_ddf, fhv_ddf, fhvhv_ddf):
    ddf["pickup_datetime"] = dd.to_datetime(ddf["pickup_datetime"])

In [None]:
def _within_study_period(ddf):
    """Keep rows picked up in [2019-03-01, 2021-03-01) or on/after 2021-06-01.

    Rows with a pickup time from 2021-03-01 up to (not including) 2021-06-01,
    and rows before 2019-03-01, are dropped.
    """
    pickup = ddf["pickup_datetime"]
    first_window = (pickup >= "2019-03-01") & (pickup < "2021-03-01")
    second_window = pickup >= "2021-06-01"
    return ddf[first_window | second_window]


# Apply the same date filter to every service.
y_ddf = _within_study_period(y_ddf)
g_ddf = _within_study_period(g_ddf)
fhv_ddf = _within_study_period(fhv_ddf)
fhvhv_ddf = _within_study_period(fhvhv_ddf)


In [None]:
# Truncate every pickup timestamp to midnight of its day, so that all rides of
# one calendar day share the same value.
for ddf in (y_ddf, g_ddf, fhv_ddf, fhvhv_ddf):
    ddf["pickup_datetime"] = ddf["pickup_datetime"].dt.floor("D")

In [None]:
# Use the (floored) pickup day as the index of each frame.
# sorted=True tells Dask the column is already in order, so no shuffle is done.
# NOTE(review): this relies on the partitioned data being ordered by pickup time - confirm.
y_ddf, g_ddf, fhv_ddf, fhvhv_ddf = (
    ddf.set_index('pickup_datetime', sorted=True)
    for ddf in (y_ddf, g_ddf, fhv_ddf, fhvhv_ddf)
)

In [None]:
# Re-split each frame along its datetime index using a one-day frequency,
# ahead of the daily resampling step.
y_ddf, g_ddf, fhv_ddf, fhvhv_ddf = (
    ddf.repartition(freq='1D')
    for ddf in (y_ddf, g_ddf, fhv_ddf, fhvhv_ddf)
)

In [None]:
def _daily_ride_counts(ddf):
    """Return a lazy frame with one row per day and the number of rides in `ride_count`."""
    rides_per_day = ddf.resample('1D').size()
    # After reset_index the count column is labelled 0; give it a descriptive name.
    return rides_per_day.reset_index().rename(columns={0: 'ride_count'})


dc_y = _daily_ride_counts(y_ddf)
dc_g = _daily_ride_counts(g_ddf)
dc_fhv = _daily_ride_counts(fhv_ddf)
dc_fhvhv = _daily_ride_counts(fhvhv_ddf)

In [None]:
# Preview the yellow-taxi daily counts. dc_y has not been computed yet at this
# point (compute() is called in a later cell), so this shows the lazy Dask frame.
display(dc_y)

In [None]:
# Materialise the lazy per-service daily counts, one service after another;
# each compute() call is expected to return an in-memory pandas DataFrame.
dc_y, dc_g, dc_fhv, dc_fhvhv = (
    lazy_counts.compute()
    for lazy_counts in (dc_y, dc_g, dc_fhv, dc_fhvhv)
)

In [None]:
# Write each service's daily counts to its own CSV file in the working
# directory, leaving out the row index.
for csv_name, service_counts in (
    ("dc_y.csv", dc_y),
    ("dc_g.csv", dc_g),
    ("dc_fhv.csv", dc_fhv),
    ("dc_fhvhv.csv", dc_fhvhv),
):
    service_counts.to_csv(csv_name, index=False)

In [None]:
# Give every table a service-specific count column so the tables can later be
# joined on the date without their count columns clashing.
# pop() removes the generic column and hands back its values, which are then
# stored under the new name (the frames are modified in place).
for service_counts, service_column in (
    (dc_y, "ride_count_y"),
    (dc_g, "ride_count_g"),
    (dc_fhv, "ride_count_fhv"),
    (dc_fhvhv, "ride_count_fhvhv"),
):
    service_counts[service_column] = service_counts.pop("ride_count")

In [None]:
# Join the per-service tables on the date and sum their counts into 'ride_count'.
# An outer join is used on purpose: with the default inner join, any day that is
# absent from even one service's table would be dropped from the total entirely.
service_columns = ['ride_count_y', 'ride_count_g', 'ride_count_fhv', 'ride_count_fhvhv']
df = dc_y
for service_counts in (dc_g, dc_fhv, dc_fhvhv):
    df = df.merge(service_counts, on='pickup_datetime', how='outer')

# A service with no row for a given day contributed no rides that day; the outer
# join marks those cells as NaN (turning the column into floats), so fill with 0
# and restore the integer dtype.
df[service_columns] = df[service_columns].fillna(0).astype('int64')

# Keep the rows in chronological order regardless of how the join ordered them.
df = df.sort_values('pickup_datetime').reset_index(drop=True)

df['ride_count'] = df[service_columns].sum(axis=1)
daily_counts = df[['pickup_datetime', 'ride_count']]

In [None]:
# Save the combined daily totals (columns pickup_datetime and ride_count) as CSV,
# without writing the row index.
daily_counts.to_csv("daily_counts.csv", index=False)