# Goal of the ML project 
- Dataset: gira bike sharing system data

- Time series and  Forecasting
- Apply techniques:
    - LSVM
    - ARIMA/ Linear Regression

# Data Selection

## Import libraries

In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

## Import Functions

### dslabs_functions

In [None]:
%run "scripts/dslabs_functions.py"

dslabs_functions lodaded


### data functions

In [None]:
%run "scripts/data_functions.py"


dslabs_functions lodaded
data_functions lodaded


## Load Files 

### Holidays Calendar in Lisbon 2022

In [69]:
filepath_holidays_lx_2022 = r'data/holidays-lisbon-2022.xlsx'
#filepath_weather_ny_d = r'data/open-meteo-new-york-daily-2024.csv'
df_holidays_lx = pd.read_excel(filepath_holidays_lx_2022, na_values="")
display(df_holidays_lx.head())

Unnamed: 0,date,holiday_lisbon
0,2022-01-01,1
1,2022-03-01,1
2,2022-04-15,1
3,2022-04-17,1
4,2022-04-25,1


### Weather file - Lisbon
- Weather info, hourly, from Lisbon for the year 2022
    - info: rain, precipitation, temperature,...
- This weather info was extracted from OpenMeteo
    - Link here - https://open-meteo.com/en/docs/historical-weather-api#latitude=38.7167&longitude=-9.1333&start_date=2019-01-01&end_date=2019-12-31&hourly=&daily=weather_code,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,sunrise,sunset,daylight_duration,sunshine_duration,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration&timezone=Europe%2FLondon&models=

    

In [77]:
filepath_weather_lx_h = r'data/open-meteo-lisbon-hourly-2022.csv'
#filepath_weather_ny_d = r'data/open-meteo-new-york-daily-2024.csv'

In [78]:
df_weather_lx_h = pd.read_csv(filepath_weather_lx_h,sep=';', na_values="")
display(df_weather_lx_h.head())

Unnamed: 0,time,temperature_2m (°C),relative_humidity_2m (%),dew_point_2m (°C),apparent_temperature (°C),precipitation (mm),rain (mm),snowfall (cm),snow_depth (m),weather_code (wmo code),...,wind_direction_100m (°),wind_gusts_10m (km/h),soil_temperature_0_to_7cm (°C),soil_temperature_7_to_28cm (°C),soil_temperature_28_to_100cm (°C),soil_temperature_100_to_255cm (°C),soil_moisture_0_to_7cm (m³/m³),soil_moisture_7_to_28cm (m³/m³),soil_moisture_28_to_100cm (m³/m³),soil_moisture_100_to_255cm (m³/m³)
0,2019-01-01T00:00,9.9,85,7.5,8.1,0.0,0.0,0.0,0.0,1,...,49,13.3,10.0,11.9,13.6,17.1,0.339,0.35,0.36,0.3
1,2019-01-01T01:00,9.2,87,7.1,7.3,0.0,0.0,0.0,0.0,0,...,46,15.1,9.7,11.8,13.6,17.1,0.339,0.35,0.36,0.3
2,2019-01-01T02:00,8.9,88,6.9,6.9,0.0,0.0,0.0,0.0,0,...,42,15.8,9.5,11.7,13.6,17.1,0.339,0.35,0.36,0.3
3,2019-01-01T03:00,8.6,88,6.7,6.5,0.0,0.0,0.0,0.0,0,...,40,16.9,9.3,11.6,13.6,17.1,0.339,0.35,0.36,0.3
4,2019-01-01T04:00,8.4,88,6.5,6.0,0.0,0.0,0.0,0.0,0,...,44,19.1,9.1,11.4,13.6,17.1,0.339,0.35,0.36,0.3


### Main file - Gira bike sharing system data
- Data on each gira stations in Lisbon, regarding available capacity by minute, between the period of 1/1/22 and 31/6/22.
    - Contains info on:
        - desig_comercial - Name of the gira bike station in Lisbon
        - num_bicicletas - Number of docked bikes in the station (station load)
        - num_docas - number of docks in the station (station capacity)
        - position - longitude and latitute points of the gira bike stations
        - entity_ts - time stamp of each record regarding the station load at each minute
        - estado - statu of the gira bike station, can be active or in repair
    - Data provided on dados.cm-lisboa website
        - link here - https://dados.cm-lisboa.pt/dataset/gira-bicicletas-de-lisboa-historico

In [None]:
filepath = 'data/estacoes-gira-1-semestre-2022.csv'

test_data = False
# Load the data

if test_data:
    df = pd.read_csv(filepath, na_values="")
    df: Dataframe = df.sample(frac = 0.1,sep = ',', replace = False)
else:
    df = pd.read_csv(filepath, sep =',', na_values="")
display(df)

Unnamed: 0,desigcomercial,numbicicletas,numdocas,position,entity_ts,estado
0,417 - Av. Duque de Ávila / Jardim Arco Do Cego,3,23,"{""coordinates"":[-9.142703,38.735352],""type"":""P...",2022-01-01T13:38:03.130Z,active
1,446 - Av. República / Interface Entrecampos,25,40,"{""coordinates"":[-9.14773,38.74456],""type"":""Poi...",2022-01-01T13:38:04.625Z,active
2,416 - Av. República / Avenida Duque de Ávila,6,16,"{""coordinates"":[-9.14553,38.735304],""type"":""Po...",2022-01-01T13:38:03.751Z,active
3,103 - Jardim da Água,16,20,"{""coordinates"":[-9.095019,38.761218],""type"":""P...",2022-01-01T13:38:04.204Z,active
4,415 - Av. Duque de Ávila / Av. Conde Valbom,9,20,"{""coordinates"":[-9.148443,38.735188],""type"":""P...",2022-01-01T13:38:04.179Z,active
...,...,...,...,...,...,...
1555391,486 - Rua Hermano Neves / Rua José Escada,2,14,"{""coordinates"":[-9.16985,38.75954],""type"":""Poi...",2022-06-30T10:33:28.035Z,active
1555392,417 - Av. Duque de Ávila / Jardim Arco Do Cego,9,23,"{""coordinates"":[-9.142703,38.735352],""type"":""P...",2022-06-30T10:33:25.612Z,active
1555393,449 - Av. 5 de Outubro / Rua da Cruz Vermelha,0,14,"{""coordinates"":[-9.150085,38.746836],""type"":""P...",2022-06-30T10:13:36.074Z,active
1555394,479 - Rua Professor Oliveira Marques,13,14,"{""coordinates"":[-9.15724,38.75491],""type"":""Poi...",2022-06-30T10:13:37.004Z,active


In [68]:
import pandas as pd

# File paths
filepaths = ['data/estacoes-gira-1-semestre-2022.csv', 'data/estacoes-gira-2-semestre-2022.csv']

test_data = False

# Load the data
df_list = [pd.read_csv(file, sep=',', na_values="") for file in filepaths]

df = pd.concat(df_list, ignore_index=True)

# Sample if test_data is True
if test_data:
    df = df.sample(frac=0.1, replace=False)

display(df)


Unnamed: 0,desigcomercial,numbicicletas,numdocas,position,entity_ts,estado
0,417 - Av. Duque de Ávila / Jardim Arco Do Cego,3,23,"{""coordinates"":[-9.142703,38.735352],""type"":""P...",2022-01-01T13:38:03.130Z,active
1,446 - Av. República / Interface Entrecampos,25,40,"{""coordinates"":[-9.14773,38.74456],""type"":""Poi...",2022-01-01T13:38:04.625Z,active
2,416 - Av. República / Avenida Duque de Ávila,6,16,"{""coordinates"":[-9.14553,38.735304],""type"":""Po...",2022-01-01T13:38:03.751Z,active
3,103 - Jardim da Água,16,20,"{""coordinates"":[-9.095019,38.761218],""type"":""P...",2022-01-01T13:38:04.204Z,active
4,415 - Av. Duque de Ávila / Av. Conde Valbom,9,20,"{""coordinates"":[-9.148443,38.735188],""type"":""P...",2022-01-01T13:38:04.179Z,active
...,...,...,...,...,...,...
3938287,479 - Rua Professor Oliveira Marques,10,13,"{""coordinates"":[-9.15724,38.75491],""type"":""Poi...",2023-02-16T11:47:16.000Z,active
3938288,513 - Rua Guiomar Torresão/Metro de Carnide,0,20,"{""coordinates"":[-9.19216,38.758691],""type"":""Po...",2023-02-16T11:47:16.000Z,repair
3938289,410 - Rua da Mesquita / Universidade Nova de L...,0,40,"{""coordinates"":[-9.16015,38.73492],""type"":""Poi...",2023-02-16T12:27:31.000Z,repair
3938290,410 - Rua da Mesquita / Universidade Nova de L...,0,40,"{""coordinates"":[-9.16015,38.73492],""type"":""Poi...",2023-02-16T12:07:30.000Z,repair


In [56]:
df.shape

(3938292, 6)

# Data Cleaning/Transformation

- Feature Engineering steps
    - In this project, we have the main dataset (df) with capacity data for each gira bike station. In order to enrich the analysis, we decided to cross join this data with hourly meteorological conditions (from openMeteo) in Lisbon, to understand how the weather may influence the gira bike stations load over time. Lisbon's Holiday data was also included.

- Remove data from 2023 in the df main
- Add new features
    - Add holiday col
        - Add column is_holiday(0 or 1) that indicates if the day corresponds to a holiday date in 2022.
    -   Spacial:
        - longitude of the gira bike station
        - latitude of the gira bike station
    - Weather
        - weather_desc: The weather conditions during the ride (e.g., Overcast, Clear sky, Rainy).
        rain: The amount of rain (in mm) during the ride.
        - temperature: The temperature (in °C) during the ride.
    - Create col station load target:
        - station_target_load hourly (numbicicletas / numdocas) * 100
    - station_A_load hourly
    - station_B_load hourly
    - station_C_load hourly
    - station_D_load hourly    

- Agreggation:
    - Aggregate the data hourly to have less rows
- Rename original cols from main df - citi bike
- Remove id columns: make sure to remove id columns from the df
- Create dataset for data exploration:
    - normal dataset with
- Create dataset for time forecasting (1):
    - trasnform the dataset
    - create taget columns (station load for target station)
    - keep external variables (station_a, station_b, station_c, weather_info, is_holiday)
- Create dataset for time forecasting (2):
    - trasnform the dataset
    - create taget columns (station load for target station)
    - remove external variables

## Remove data from 2023 in the main df

In [57]:
df.head()

Unnamed: 0,desigcomercial,numbicicletas,numdocas,position,entity_ts,estado
0,417 - Av. Duque de Ávila / Jardim Arco Do Cego,3,23,"{""coordinates"":[-9.142703,38.735352],""type"":""P...",2022-01-01T13:38:03.130Z,active
1,446 - Av. República / Interface Entrecampos,25,40,"{""coordinates"":[-9.14773,38.74456],""type"":""Poi...",2022-01-01T13:38:04.625Z,active
2,416 - Av. República / Avenida Duque de Ávila,6,16,"{""coordinates"":[-9.14553,38.735304],""type"":""Po...",2022-01-01T13:38:03.751Z,active
3,103 - Jardim da Água,16,20,"{""coordinates"":[-9.095019,38.761218],""type"":""P...",2022-01-01T13:38:04.204Z,active
4,415 - Av. Duque de Ávila / Av. Conde Valbom,9,20,"{""coordinates"":[-9.148443,38.735188],""type"":""P...",2022-01-01T13:38:04.179Z,active


In [72]:
# Convert 'entity_ts' column in df to datetime
df['entity_ts'] = pd.to_datetime(df['entity_ts'].str[:-5])  # Remove os últimos 5 caracteres se necessário
df = df[df['entity_ts'].dt.year != 2023]

In [73]:
df.shape

(3348370, 6)

## Add Holiday column to main df

|

In [74]:
# Convert 'date' column in df_holidays_lx to datetime
df_holidays_lx['date'] = pd.to_datetime(df_holidays_lx['date'])


# Extract hour and day from 'entity_ts'
df['entity_ts_hour'] = df['entity_ts'].dt.floor('H')  # Extrai a hora (arredondada para baixo)
df['entity_ts_day'] = df['entity_ts'].dt.normalize()  # Extrai a data (sem horário)

# Realize o merge
df = df.merge(df_holidays_lx, left_on='entity_ts_day', right_on='date', how='left')
df['holiday_lisbon'] = df['holiday_lisbon'].fillna(0)

df.drop(columns=['date'], inplace=True)

# Exiba o DataFrame resultante
display(df.head())

  df['entity_ts_hour'] = df['entity_ts'].dt.floor('H')  # Extrai a hora (arredondada para baixo)


Unnamed: 0,desigcomercial,numbicicletas,numdocas,position,entity_ts,estado,entity_ts_hour,entity_ts_day,holiday_lisbon
0,417 - Av. Duque de Ávila / Jardim Arco Do Cego,3,23,"{""coordinates"":[-9.142703,38.735352],""type"":""P...",2022-01-01 13:38:03,active,2022-01-01 13:00:00,2022-01-01,1.0
1,446 - Av. República / Interface Entrecampos,25,40,"{""coordinates"":[-9.14773,38.74456],""type"":""Poi...",2022-01-01 13:38:04,active,2022-01-01 13:00:00,2022-01-01,1.0
2,416 - Av. República / Avenida Duque de Ávila,6,16,"{""coordinates"":[-9.14553,38.735304],""type"":""Po...",2022-01-01 13:38:03,active,2022-01-01 13:00:00,2022-01-01,1.0
3,103 - Jardim da Água,16,20,"{""coordinates"":[-9.095019,38.761218],""type"":""P...",2022-01-01 13:38:04,active,2022-01-01 13:00:00,2022-01-01,1.0
4,415 - Av. Duque de Ávila / Av. Conde Valbom,9,20,"{""coordinates"":[-9.148443,38.735188],""type"":""P...",2022-01-01 13:38:04,active,2022-01-01 13:00:00,2022-01-01,1.0


In [75]:
df['holiday_lisbon'].value_counts()

holiday_lisbon
0.0    3207409
1.0     140961
Name: count, dtype: int64

## Add longituded and latitude of the station column

In [63]:
df.head()

Unnamed: 0,desigcomercial,numbicicletas,numdocas,position,entity_ts,estado,entity_ts_hour,entity_ts_day,date,holiday_lisbon
0,417 - Av. Duque de Ávila / Jardim Arco Do Cego,3,23,"{""coordinates"":[-9.142703,38.735352],""type"":""P...",2022-01-01 13:38:03,active,2022-01-01 13:00:00,2022-01-01,2022-01-01,1.0
1,446 - Av. República / Interface Entrecampos,25,40,"{""coordinates"":[-9.14773,38.74456],""type"":""Poi...",2022-01-01 13:38:04,active,2022-01-01 13:00:00,2022-01-01,2022-01-01,1.0
2,416 - Av. República / Avenida Duque de Ávila,6,16,"{""coordinates"":[-9.14553,38.735304],""type"":""Po...",2022-01-01 13:38:03,active,2022-01-01 13:00:00,2022-01-01,2022-01-01,1.0
3,103 - Jardim da Água,16,20,"{""coordinates"":[-9.095019,38.761218],""type"":""P...",2022-01-01 13:38:04,active,2022-01-01 13:00:00,2022-01-01,2022-01-01,1.0
4,415 - Av. Duque de Ávila / Av. Conde Valbom,9,20,"{""coordinates"":[-9.148443,38.735188],""type"":""P...",2022-01-01 13:38:04,active,2022-01-01 13:00:00,2022-01-01,2022-01-01,1.0


In [None]:
import json
# Convert the position column to JSON format and extract lat and long
df['position'] = df['position'].apply(json.loads)

df['long_station'] = df['position'].apply(lambda x: x['coordinates'][0])
df['lat_station'] = df['position'].apply(lambda x: x['coordinates'][1])
df.head()


Unnamed: 0,desigcomercial,numbicicletas,numdocas,position,entity_ts,estado,entity_ts_hour,entity_ts_day,holiday_lisbon,long_station,lat_station
0,417 - Av. Duque de Ávila / Jardim Arco Do Cego,3,23,"{'coordinates': [-9.142703, 38.735352], 'type'...",2022-01-01 13:38:03,active,2022-01-01 13:00:00,2022-01-01,1.0,-9.142703,38.735352
1,446 - Av. República / Interface Entrecampos,25,40,"{'coordinates': [-9.14773, 38.74456], 'type': ...",2022-01-01 13:38:04,active,2022-01-01 13:00:00,2022-01-01,1.0,-9.14773,38.74456
2,416 - Av. República / Avenida Duque de Ávila,6,16,"{'coordinates': [-9.14553, 38.735304], 'type':...",2022-01-01 13:38:03,active,2022-01-01 13:00:00,2022-01-01,1.0,-9.14553,38.735304
3,103 - Jardim da Água,16,20,"{'coordinates': [-9.095019, 38.761218], 'type'...",2022-01-01 13:38:04,active,2022-01-01 13:00:00,2022-01-01,1.0,-9.095019,38.761218
4,415 - Av. Duque de Ávila / Av. Conde Valbom,9,20,"{'coordinates': [-9.148443, 38.735188], 'type'...",2022-01-01 13:38:04,active,2022-01-01 13:00:00,2022-01-01,1.0,-9.148443,38.735188


In [None]:
df_weather_lx_h.head()

Unnamed: 0,time,temperature_2m (°C),relative_humidity_2m (%),dew_point_2m (°C),apparent_temperature (°C),precipitation (mm),rain (mm),snowfall (cm),snow_depth (m),weather_code (wmo code),...,wind_direction_100m (°),wind_gusts_10m (km/h),soil_temperature_0_to_7cm (°C),soil_temperature_7_to_28cm (°C),soil_temperature_28_to_100cm (°C),soil_temperature_100_to_255cm (°C),soil_moisture_0_to_7cm (m³/m³),soil_moisture_7_to_28cm (m³/m³),soil_moisture_28_to_100cm (m³/m³),soil_moisture_100_to_255cm (m³/m³)
0,2019-01-01T00:00,9.9,85,7.5,8.1,0.0,0.0,0.0,0.0,1,...,49,13.3,10.0,11.9,13.6,17.1,0.339,0.35,0.36,0.3
1,2019-01-01T01:00,9.2,87,7.1,7.3,0.0,0.0,0.0,0.0,0,...,46,15.1,9.7,11.8,13.6,17.1,0.339,0.35,0.36,0.3
2,2019-01-01T02:00,8.9,88,6.9,6.9,0.0,0.0,0.0,0.0,0,...,42,15.8,9.5,11.7,13.6,17.1,0.339,0.35,0.36,0.3
3,2019-01-01T03:00,8.6,88,6.7,6.5,0.0,0.0,0.0,0.0,0,...,40,16.9,9.3,11.6,13.6,17.1,0.339,0.35,0.36,0.3
4,2019-01-01T04:00,8.4,88,6.5,6.0,0.0,0.0,0.0,0.0,0,...,44,19.1,9.1,11.4,13.6,17.1,0.339,0.35,0.36,0.3
5,2019-01-01T05:00,8.1,89,6.3,5.6,0.0,0.0,0.0,0.0,0,...,46,19.4,9.0,11.3,13.6,17.1,0.339,0.35,0.36,0.3
6,2019-01-01T06:00,7.8,89,6.1,5.2,0.0,0.0,0.0,0.0,0,...,48,20.9,8.9,11.2,13.6,17.1,0.339,0.35,0.36,0.3
7,2019-01-01T07:00,7.5,88,5.7,4.9,0.0,0.0,0.0,0.0,0,...,47,21.6,8.7,11.1,13.5,17.1,0.339,0.35,0.36,0.3
8,2019-01-01T08:00,7.4,87,5.4,4.9,0.0,0.0,0.0,0.0,0,...,46,20.5,8.5,11.0,13.5,17.1,0.339,0.35,0.36,0.3
9,2019-01-01T09:00,8.6,80,5.4,5.8,0.0,0.0,0.0,0.0,2,...,46,22.0,8.7,10.8,13.5,17.1,0.339,0.35,0.36,0.3


## Add weather info cols in main df

- Weather info, hourly, from Lisbon for the year 2022
    - info: rain, precipitation, temperature,...
- This weather info was extracted from OpenMeteo
    - Link here -https://open-meteo.com/en/docs/historical-weather-api#latitude=38.7167&longitude=-9.1333&start_date=2019-01-01&end_date=2019-12-31&hourly=&daily=weather_code,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,sunrise,sunset,daylight_duration,sunshine_duration,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration&timezone=Europe%2FLondon&models=

Info on dataset columns:
- snowfall = 5 cm (new snow that felt in a specific hour)
- snow_depth = 15 cm (total snow accumulated in the floor in a specific hour)
- rain_sum (mm) = Total sum of liquid rain (excludes snow, hail, etc.).
- precipitation_sum (mm) = Total sum of all precipitation (includes rain, snow, hail, etc.)
- cloud_cover (%): Represents the total percentage of the sky covered by clouds at all altitudes, without distinguishing between low, middle, or high
- cloud_cover_low (%): Represents the percentage of cloud cover specifically at low altitudes, typically below 2 km.- cloud_cover_mid (%) = Percentage of cloud cover at mid altitudes (typically between 2 km and 6 km).
- cloud_cover_high (%) = Percentage of cloud cover at high altitudes (typically above 6 km).
- wind_speed_10m (km/h)= Represents the wind speed measured at 10 meters above ground level, in kilometers per hour (km/h). This is typically used to assess the strength of the wind near the surface.
- weather_code (wmo code)	- World meteorological code for that hour 
    - cloudly, sunny, mist

    

In [82]:
df_weather_lx_h.columns

Index(['time', 'temperature_2m (°C)', 'relative_humidity_2m (%)',
       'dew_point_2m (°C)', 'apparent_temperature (°C)', 'precipitation (mm)',
       'rain (mm)', 'snowfall (cm)', 'snow_depth (m)',
       'weather_code (wmo code)', 'pressure_msl (hPa)',
       'surface_pressure (hPa)', 'cloud_cover (%)', 'cloud_cover_low (%)',
       'cloud_cover_mid (%)', 'cloud_cover_high (%)',
       'et0_fao_evapotranspiration (mm)', 'vapour_pressure_deficit (kPa)',
       'wind_speed_10m (km/h)', 'wind_speed_100m (km/h)',
       'wind_direction_10m (°)', 'wind_direction_100m (°)',
       'wind_gusts_10m (km/h)', 'soil_temperature_0_to_7cm (°C)',
       'soil_temperature_7_to_28cm (°C)', 'soil_temperature_28_to_100cm (°C)',
       'soil_temperature_100_to_255cm (°C)', 'soil_moisture_0_to_7cm (m³/m³)',
       'soil_moisture_7_to_28cm (m³/m³)', 'soil_moisture_28_to_100cm (m³/m³)',
       'soil_moisture_100_to_255cm (m³/m³)'],
      dtype='object')

In [80]:
df_weather_lx_h.head()

Unnamed: 0,time,temperature_2m (°C),relative_humidity_2m (%),dew_point_2m (°C),apparent_temperature (°C),precipitation (mm),rain (mm),snowfall (cm),snow_depth (m),weather_code (wmo code),...,wind_direction_100m (°),wind_gusts_10m (km/h),soil_temperature_0_to_7cm (°C),soil_temperature_7_to_28cm (°C),soil_temperature_28_to_100cm (°C),soil_temperature_100_to_255cm (°C),soil_moisture_0_to_7cm (m³/m³),soil_moisture_7_to_28cm (m³/m³),soil_moisture_28_to_100cm (m³/m³),soil_moisture_100_to_255cm (m³/m³)
0,2019-01-01T00:00,9.9,85,7.5,8.1,0.0,0.0,0.0,0.0,1,...,49,13.3,10.0,11.9,13.6,17.1,0.339,0.35,0.36,0.3
1,2019-01-01T01:00,9.2,87,7.1,7.3,0.0,0.0,0.0,0.0,0,...,46,15.1,9.7,11.8,13.6,17.1,0.339,0.35,0.36,0.3
2,2019-01-01T02:00,8.9,88,6.9,6.9,0.0,0.0,0.0,0.0,0,...,42,15.8,9.5,11.7,13.6,17.1,0.339,0.35,0.36,0.3
3,2019-01-01T03:00,8.6,88,6.7,6.5,0.0,0.0,0.0,0.0,0,...,40,16.9,9.3,11.6,13.6,17.1,0.339,0.35,0.36,0.3
4,2019-01-01T04:00,8.4,88,6.5,6.0,0.0,0.0,0.0,0.0,0,...,44,19.1,9.1,11.4,13.6,17.1,0.339,0.35,0.36,0.3


In [None]:
#Let's use the hourly dataset to have detailed weather info

#convert time to datetime format
df_weather_lx_h['time'] = pd.to_datetime(df_weather_lx_h['time'])

#drop meteo columns that are not relevant for the analysis
df_weather_lx_h.drop(columns = [
    'apparent_temperature (°C)',
    'relative_humidity_2m (%)',
    'dew_point_2m (°C)',
    'et0_fao_evapotranspiration (mm)',
    'precipitation (mm)',
    'vapour_pressure_deficit (kPa)',
    'wind_speed_100m (km/h)',
    'snow_depth (m)',
    'cloud_cover (%)',
    'cloud_cover_mid (%)',
    'cloud_cover_high (%)',
    'pressure_msl (hPa)',
    'surface_pressure (hPa)',
    'wind_direction_10m (°)',
    'wind_direction_100m (°)',
    'wind_gusts_10m (km/h)'],
inplace=True)

display(df_weather_lx_h.head())

Unnamed: 0,time,temperature_2m (°C),rain (mm),snowfall (cm),weather_code (wmo code),cloud_cover_low (%),wind_speed_10m (km/h)
0,2024-01-01 00:00:00,1.6,0.0,0.0,3,94,8.9
1,2024-01-01 01:00:00,2.6,0.0,0.0,3,100,11.4
2,2024-01-01 02:00:00,2.7,0.0,0.0,3,100,9.7
3,2024-01-01 03:00:00,2.5,0.0,0.0,2,27,8.1
4,2024-01-01 04:00:00,0.5,0.0,0.0,3,5,7.5
...,...,...,...,...,...,...,...
8779,2024-12-31 19:00:00,8.0,0.0,0.0,3,0,10.7
8780,2024-12-31 20:00:00,7.9,0.0,0.0,3,6,10.8
8781,2024-12-31 21:00:00,7.2,3.1,0.0,63,18,9.7
8782,2024-12-31 22:00:00,7.2,2.8,0.0,63,36,12.2


##### Get list/dictionairy with the description of WMO Weather codes
- source: WMO- world meteorological organization

In [None]:
# Dictionary with WMO weather codes and descriptions
wmo_weather_codes = {
    0: "Clear sky",
    1: "Mainly clear",
    2: "Partly cloudy",
    3: "Overcast",
    45: "Fog",
    48: "Depositing rime fog",
    51: "Drizzle: Light",
    53: "Drizzle: Moderate",
    55: "Drizzle: Dense",
    56: "Freezing drizzle: Light",
    57: "Freezing drizzle: Dense",
    61: "Rain: Slight",
    63: "Rain: Moderate",
    65: "Rain: Heavy",
    66: "Freezing rain: Light",
    67: "Freezing rain: Heavy",
    71: "Snowfall: Slight",
    73: "Snowfall: Moderate",
    75: "Snowfall: Heavy",
    77: "Snow grains",
    80: "Rain showers: Slight",
    81: "Rain showers: Moderate",
    82: "Rain showers: Violent",
    85: "Snow showers: Slight",
    86: "Snow showers: Heavy",
    95: "Thunderstorm: Slight or moderate",
    96: "Thunderstorm with slight hail",
    99: "Thunderstorm with heavy hail"
}

df_wmo_weather_codes =pd.DataFrame(list(wmo_weather_codes.items()), columns = ['wmo_code', 'wmo_weather_desc'])
display(df_wmo_weather_codes)

Unnamed: 0,wmo_code,wmo_weather_desc
0,0,Clear sky
1,1,Mainly clear
2,2,Partly cloudy
3,3,Overcast
4,45,Fog
5,48,Depositing rime fog
6,51,Drizzle: Light
7,53,Drizzle: Moderate
8,55,Drizzle: Dense
9,56,Freezing drizzle: Light


#### Add weather description to the weather dataset
- Cloud Cover
- Overcast → The sky is completely covered with clouds (100% cloud cover).
- Partly cloudy → A mix of clouds and clear sky, typically 30-70% cloud cover.
- Mainly clear → Mostly clear with a few scattered clouds (10-30% cloud cover).
- Clear sky → No significant clouds, nearly 0% cloud cover.
- ❄️ Snowfall
- Snowfall: Slight → Light snowflakes falling, minimal accumulation.
- Snowfall: Moderate → Steady snowfall with noticeable accumulation.
- Snowfall: Heavy → Intense snowfall with rapid accumulation, possibly reducing visibility.
- 🌧 Drizzle (Light, fine rain with small droplets)
- Drizzle: Light → A few small droplets falling intermittently, barely wetting the ground.
- Drizzle: Moderate → Continuous fine rain, making surfaces damp.
- Drizzle: Dense → Heavy drizzle, creating persistent wet conditions, but not forming puddles.
- 🌦 Rain (Heavier precipitation than drizzle)
- Rain: Slight → Light rain with small raindrops and little accumulation.
- Rain: Moderate → Steady rain that wets the ground and can form small puddles.
- Rain: Heavy → Intense rainfall, quickly accumulating, possibly causing water runoff.

In [None]:
df_weather_ny_h = df_weather_ny_h.merge(
    df_wmo_weather_codes,
    left_on='weather_code (wmo code)',
    right_on= 'wmo_code', 
    how = 'left')

df_weather_ny_h.rename(columns={
    'temperature_2m (°C)': 'temperature_2m',
    'rain (mm)': 'rain_mm',
    'snowfall (cm)': 'snowfall_cm',
    'weather_code (wmo code)': 'wmo_weather_code',
    'cloud_cover_low (%)': 'cloud_cover_low_pct',
    'wind_speed_10m (km/h)': 'wind_speed_10m',
}, inplace=True)

df_weather_ny_h.drop(columns = ['wmo_code'], inplace=True)

display(df_weather_ny_h.head(10))

Unnamed: 0,time,temperature_2m,rain_mm,snowfall_cm,wmo_weather_code,cloud_cover_low_pct,wind_speed_10m,wmo_weather_desc
0,2024-01-01 00:00:00,1.6,0.0,0.0,3,94,8.9,Overcast
1,2024-01-01 01:00:00,2.6,0.0,0.0,3,100,11.4,Overcast
2,2024-01-01 02:00:00,2.7,0.0,0.0,3,100,9.7,Overcast
3,2024-01-01 03:00:00,2.5,0.0,0.0,2,27,8.1,Partly cloudy
4,2024-01-01 04:00:00,0.5,0.0,0.0,3,5,7.5,Overcast
5,2024-01-01 05:00:00,-0.0,0.0,0.0,3,99,8.6,Overcast
6,2024-01-01 06:00:00,1.2,0.0,0.0,3,100,3.7,Overcast
7,2024-01-01 07:00:00,1.7,0.0,0.0,3,100,5.0,Overcast
8,2024-01-01 08:00:00,3.0,0.0,0.0,3,100,4.7,Overcast
9,2024-01-01 09:00:00,3.6,0.0,0.0,3,100,5.0,Overcast


#### Merge weather data with main df city_bike

In [None]:
#create a new column in the df dataframe to have date until hour

df['started_at_hour'] = df['started_at'].dt.floor('H')  # Extract the hour part (rounded down)

display(df.head())

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,start_borough,end_borough,day_of_month,hour,day_of_week,is_weekend,time_of_day,ride_duration_sec,ride_duration_min,started_at_hour
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,...,Manhattan,Manhattan,27,13,Saturday,1,afternoon,549.689,9.161483,2024-04-27 13:00:00
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,...,Manhattan,Manhattan,25,15,Thursday,0,afternoon,278.366,4.639433,2024-04-25 15:00:00
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.73579,-73.981693,...,Manhattan,Manhattan,6,11,Saturday,1,morning,411.949,6.865817,2024-04-06 11:00:00
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,...,Brooklyn,Brooklyn,6,16,Saturday,1,afternoon,137.349,2.28915,2024-04-06 16:00:00
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,...,Manhattan,Manhattan,10,17,Wednesday,0,evening,476.871,7.94785,2024-04-10 17:00:00


In [None]:
#merge the df main with the weather data df on time column
df = df.merge(
    df_weather_ny_h,
    left_on='started_at_hour',
    right_on='time',
    how = 'left'
)

#drop time col from the weather df as is not relevant anymore
df = df.drop(columns=['time','started_at_hour'])

display(df.head())
display(df.shape)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,time_of_day,ride_duration_sec,ride_duration_min,temperature_2m,rain_mm,snowfall_cm,wmo_weather_code,cloud_cover_low_pct,wind_speed_10m,wmo_weather_desc
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,...,afternoon,549.689,9.161483,14.3,0.0,0.0,3,16,25.3,Overcast
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,...,afternoon,278.366,4.639433,11.2,0.0,0.0,0,0,13.7,Clear sky
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.73579,-73.981693,...,morning,411.949,6.865817,9.6,0.0,0.0,3,95,27.9,Overcast
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,...,afternoon,137.349,2.28915,10.7,0.0,0.0,3,11,25.4,Overcast
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,...,evening,476.871,7.94785,15.0,0.0,0.0,2,44,10.7,Partly cloudy


(3193597, 29)

- Check if we have any value in df with null value in weather description

In [None]:
df_null_weather_desc = df['wmo_weather_desc'].isnull().sum()
display(df_null_weather_desc)


0

## Add col target - station load rate

## Agreggation hour level

## Rename original cols from main df 

## Remove id columns: make sure to remove id columns from the df

# Create dataset for data exploration:

# Create dataset for time forecasting (1)

# Create dataset for time forecasting (2) - excluding external variables