### Big data course project
<strong>T5: External data</strong>

Jovana Videnovic & Haris Kupinic

In [115]:
!hostnamectl

 Static hostname: wn110.arnes.si
       Icon name: computer-server
         Chassis: server 🖳
      Machine ID: 228908d85e1f46e68968a5a0c9a3c7d5
         Boot ID: 8f247b33f9404ea59b84f5257f0997d2
Operating System: ]8;;https://almalinux.org/AlmaLinux 9.6 (Sage Margay)]8;;         
     CPE OS Name: cpe:/o:almalinux:almalinux:9::baseos
          Kernel: Linux 5.14.0-570.24.1.el9_6.x86_64
    Architecture: x86-64
Firmware Version: CFE118M


In [116]:
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd
from pathlib import Path
import pandas as pd
import os
import numpy as np
from sklearn.neighbors import BallTree
import pyarrow.dataset as ds
import pyarrow as pa
import pyarrow.compute as pc
from datetime import timedelta
from dask_ml.linear_model import LinearRegression
from dask_ml.ensemble import BlockwiseVotingRegressor
import dask.array as da
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [117]:
cluster = LocalCluster(n_workers=2, threads_per_worker=2, memory_limit='32GB')
client = Client(cluster)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 40619 instead




In [118]:
data_path = "/d/hpc/projects/FRI/bigdata/students/jv8043/augmented_data_new/green/weather_scbah"

In [119]:
ddf = dd.read_parquet(data_path, engine="pyarrow", assume_missing=True)

In [120]:
ddf.columns

Index(['ratecodeid', 'pulocationid', 'dolocationid', 'passenger_count',
       'trip_distance', 'fare_amount', 'tip_amount', 'total_amount',
       'payment_type', 'trip_type', 'pickup_datetime', 'dropoff_datetime',
       'pickup_lat', 'pickup_lon', 'dropoff_lat', 'dropoff_lon',
       '__index_level_0__', 'awnd', 'prcp', 'snow', 'snwd', 'tmax', 'tmin',
       'closest_school_college_pickup', 'closest_ba_pickup',
       'closest_hotel_pickup', 'closest_school_college_dropoff',
       'closest_ba_dropoff', 'closest_hotel_dropoff'],
      dtype='object')

In [None]:
# take data up to 2019
# make ddf 'pickup_datetime' and 'dropoff_datetime' columns as datetime
ddf['pickup_datetime'] = dd.to_datetime(ddf['pickup_datetime'])
ddf['dropoff_datetime'] = dd.to_datetime(ddf['dropoff_datetime'])
# filter data for the year 2019
ddf = ddf[(ddf['pickup_datetime'] < '2020-01-01') & (ddf['pickup_datetime'] >= '2018-01-01')]

### Minimal setting

In [122]:
min_ddf = ddf[['pickup_datetime']]

In [123]:
# Set datetime index
min_ddf = min_ddf.set_index('pickup_datetime')

In [124]:
# Resample to get daily counts
daily_counts = min_ddf.resample('1D').size().reset_index()

In [125]:
daily_counts = daily_counts.rename(columns={0: 'ride_count'})

In [126]:
daily_counts['day_of_week'] = daily_counts['pickup_datetime'].dt.dayofweek
# add season column
def get_season(dt):
    month = dt.month
    if month in [12, 1, 2]:
        return 'winter'
    elif month in [3, 4, 5]:
        return 'spring'
    elif month in [6, 7, 8]:
        return 'summer'
    else:
        return 'autumn'
    
daily_counts['season'] = daily_counts['pickup_datetime'].apply(get_season)
daily_counts['day'] = daily_counts['pickup_datetime'].dt.day
daily_counts['week_in_month'] = (daily_counts['pickup_datetime'].dt.day - 1) // 7 + 1
daily_counts['day_of_year'] = daily_counts['pickup_datetime'].dt.dayofyear

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('pickup_datetime', 'object'))



In [127]:
daily_counts = daily_counts.set_index('pickup_datetime').compute().sort_index()

In [128]:
train = daily_counts.iloc[:-30]
test = daily_counts.iloc[-30:]

In [129]:
test.columns

Index(['ride_count', 'day_of_week', 'season', 'day', 'week_in_month',
       'day_of_year'],
      dtype='object')

In [130]:
def one_hot_encode(train_df, test_df, categorical_cols):
    train_encoded = pd.get_dummies(train_df, columns=categorical_cols, drop_first=True)
    test_encoded = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)
    test_encoded = test_encoded.reindex(columns=train_encoded.columns, fill_value=0)
    return train_encoded, test_encoded

In [131]:
train, test = one_hot_encode(train, test, ['day_of_week', 'season'])

In [132]:
train.columns, test.columns

(Index(['ride_count', 'day', 'week_in_month', 'day_of_year', 'day_of_week_1',
        'day_of_week_2', 'day_of_week_3', 'day_of_week_4', 'day_of_week_5',
        'day_of_week_6', 'season_spring', 'season_summer', 'season_winter'],
       dtype='object'),
 Index(['ride_count', 'day', 'week_in_month', 'day_of_year', 'day_of_week_1',
        'day_of_week_2', 'day_of_week_3', 'day_of_week_4', 'day_of_week_5',
        'day_of_week_6', 'season_spring', 'season_summer', 'season_winter'],
       dtype='object'))

In [133]:
cols = train.columns.difference(['ride_count', 'pickup_datetime'])

In [134]:
cols

Index(['day', 'day_of_week_1', 'day_of_week_2', 'day_of_week_3',
       'day_of_week_4', 'day_of_week_5', 'day_of_week_6', 'day_of_year',
       'season_spring', 'season_summer', 'season_winter', 'week_in_month'],
      dtype='object')

In [135]:
for c in cols:
    train[c] = train[c].astype('int')
    test[c] = test[c].astype('int')

In [136]:
# Convert to Dask arrays
# select all but 'ride_count' and 'pickup_datetime'
X_train = da.from_array(train[cols].values)
y_train = da.from_array(train['ride_count'].values)

X_test = da.from_array(test[cols].values)
y_test = da.from_array(test['ride_count'].values)

In [137]:
lr = BlockwiseVotingRegressor(
    estimator= LinearRegression(),
)
# Fit the model
lr.fit(X_train, y_train)

# Predict
y_pred = lr.predict(X_test)

In [138]:
rmse = np.sqrt(mean_squared_error(y_test.compute(), y_pred.compute()))
print(f"RMSE: {rmse:.2f}")

mae = mean_absolute_error(y_test.compute(), y_pred.compute())
print(f"MAE: {mae:.2f}")

RMSE: 9005.95
MAE: 8373.87
