# Goal of the ML project 
- Dataset: gira bike sharing system data

- Time series and Forecasting
- Apply techniques:
    - LSVM
    - ARIMA/ Linear Regression

# Data Selection

## Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

## Import Functions

### dslabs_functions

In [None]:
# Execute the dslabs helper script; IPython's %run leaves the script's top-level
# names available in this notebook's namespace.
%run "scripts/dslabs_functions.py"

dslabs_functions lodaded


### data functions

In [None]:
# Execute the project's data helper script so its top-level names are available here.
# NOTE(review): the cell output shows both the dslabs_functions and data_functions
# load messages, so this script presumably loads dslabs_functions itself -- confirm.
%run "scripts/data_functions.py"


dslabs_functions lodaded
data_functions lodaded


## Load Files 

### Holidays Calendar in Lisbon 2022

In [34]:
# Excel workbook holding the 2022 holiday calendar for Lisbon.
filepath_holidays_lx_2022 = r'data/holidays-lisbon-2022.xlsx'

# Read the workbook, treating empty strings as missing values.
df_holidays_lx = pd.read_excel(
    filepath_holidays_lx_2022,
    na_values="",
)

# Preview the first rows.
display(df_holidays_lx.head())

Unnamed: 0,date,holiday_lisbon
0,2022-01-01,1
1,2022-03-01,1
2,2022-04-15,1
3,2022-04-17,1
4,2022-04-25,1


### Weather file - Lisbon
- Weather info, hourly, from Lisbon for the year 2022
    - info: rain, precipitation, temperature,...
- This weather info was extracted from OpenMeteo
    - Link here - https://open-meteo.com/en/docs/historical-weather-api#latitude=38.7167&longitude=-9.1333&start_date=2019-01-01&end_date=2019-12-31&hourly=&daily=weather_code,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,sunrise,sunset,daylight_duration,sunshine_duration,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration&timezone=Europe%2FLondon&models=

    

In [15]:
# CSV export of the hourly Open-Meteo weather data for Lisbon.
filepath_weather_lx_h = r'data/open-meteo-lisbon-hourly-2022.csv'

In [16]:
# Hourly weather observations for Lisbon. The file is ';'-separated and empty
# strings are treated as missing values.
# NOTE(review): the preview printed below this cell starts at 2019-01-01 although
# the file name says 2022 -- confirm the export covers the same period as the
# bike data before joining on time.
df_weather_lx_h = pd.read_csv(filepath_weather_lx_h, sep=';', na_values="")

# Backward-compatible alias: the frame was originally named after New York even
# though it holds the Lisbon data.
df_weather_ny_h = df_weather_lx_h

display(df_weather_lx_h.head())

Unnamed: 0,time,temperature_2m (°C),relative_humidity_2m (%),dew_point_2m (°C),apparent_temperature (°C),precipitation (mm),rain (mm),snowfall (cm),snow_depth (m),weather_code (wmo code),...,wind_direction_100m (°),wind_gusts_10m (km/h),soil_temperature_0_to_7cm (°C),soil_temperature_7_to_28cm (°C),soil_temperature_28_to_100cm (°C),soil_temperature_100_to_255cm (°C),soil_moisture_0_to_7cm (m³/m³),soil_moisture_7_to_28cm (m³/m³),soil_moisture_28_to_100cm (m³/m³),soil_moisture_100_to_255cm (m³/m³)
0,2019-01-01T00:00,9.9,85,7.5,8.1,0.0,0.0,0.0,0.0,1,...,49,13.3,10.0,11.9,13.6,17.1,0.339,0.35,0.36,0.3
1,2019-01-01T01:00,9.2,87,7.1,7.3,0.0,0.0,0.0,0.0,0,...,46,15.1,9.7,11.8,13.6,17.1,0.339,0.35,0.36,0.3
2,2019-01-01T02:00,8.9,88,6.9,6.9,0.0,0.0,0.0,0.0,0,...,42,15.8,9.5,11.7,13.6,17.1,0.339,0.35,0.36,0.3
3,2019-01-01T03:00,8.6,88,6.7,6.5,0.0,0.0,0.0,0.0,0,...,40,16.9,9.3,11.6,13.6,17.1,0.339,0.35,0.36,0.3
4,2019-01-01T04:00,8.4,88,6.5,6.0,0.0,0.0,0.0,0.0,0,...,44,19.1,9.1,11.4,13.6,17.1,0.339,0.35,0.36,0.3


### Main file - Gira bike sharing system data
- Data on each Gira station in Lisbon, regarding available capacity by minute, for the period between 1/1/22 and 30/6/22.
    - Contains info on:
        - desigcomercial - Name of the Gira bike station in Lisbon
        - numbicicletas - Number of docked bikes in the station (station load)
        - numdocas - Number of docks in the station (station capacity)
        - position - Longitude and latitude of the Gira bike station
        - entity_ts - Timestamp of each record of the station load at each minute
        - estado - Status of the Gira bike station, e.g. active or repair
    - Data provided on dados.cm-lisboa website
        - link here - https://dados.cm-lisboa.pt/dataset/gira-bicicletas-de-lisboa-historico

In [None]:
# First-semester 2022 export of the Gira station data.
filepath = 'data/estacoes-gira-1-semestre-2022.csv'

# Set to True to work on a 10% random subset instead of the full file.
test_data = False

# Load the data once; empty strings are treated as missing values.
df = pd.read_csv(filepath, sep=',', na_values="")

if test_data:
    # `sep` belongs to read_csv, not to DataFrame.sample (passing it raised a
    # TypeError). The former `Dataframe` annotation, which is not defined in the
    # visible notebook and would raise a NameError, is dropped as well.
    df = df.sample(frac=0.1, replace=False)

display(df)

Unnamed: 0,desigcomercial,numbicicletas,numdocas,position,entity_ts,estado
0,417 - Av. Duque de Ávila / Jardim Arco Do Cego,3,23,"{""coordinates"":[-9.142703,38.735352],""type"":""P...",2022-01-01T13:38:03.130Z,active
1,446 - Av. República / Interface Entrecampos,25,40,"{""coordinates"":[-9.14773,38.74456],""type"":""Poi...",2022-01-01T13:38:04.625Z,active
2,416 - Av. República / Avenida Duque de Ávila,6,16,"{""coordinates"":[-9.14553,38.735304],""type"":""Po...",2022-01-01T13:38:03.751Z,active
3,103 - Jardim da Água,16,20,"{""coordinates"":[-9.095019,38.761218],""type"":""P...",2022-01-01T13:38:04.204Z,active
4,415 - Av. Duque de Ávila / Av. Conde Valbom,9,20,"{""coordinates"":[-9.148443,38.735188],""type"":""P...",2022-01-01T13:38:04.179Z,active
...,...,...,...,...,...,...
1555391,486 - Rua Hermano Neves / Rua José Escada,2,14,"{""coordinates"":[-9.16985,38.75954],""type"":""Poi...",2022-06-30T10:33:28.035Z,active
1555392,417 - Av. Duque de Ávila / Jardim Arco Do Cego,9,23,"{""coordinates"":[-9.142703,38.735352],""type"":""P...",2022-06-30T10:33:25.612Z,active
1555393,449 - Av. 5 de Outubro / Rua da Cruz Vermelha,0,14,"{""coordinates"":[-9.150085,38.746836],""type"":""P...",2022-06-30T10:13:36.074Z,active
1555394,479 - Rua Professor Oliveira Marques,13,14,"{""coordinates"":[-9.15724,38.75491],""type"":""Poi...",2022-06-30T10:13:37.004Z,active


In [55]:
import pandas as pd

# Both 2022 semester exports of the Gira station data.
filepaths = ['data/estacoes-gira-1-semestre-2022.csv', 'data/estacoes-gira-2-semestre-2022.csv']

# Set to True to keep only a 10% random subset of the combined data.
test_data = False

# Read every export (empty strings become missing values) ...
df_list = [
    pd.read_csv(csv_path, sep=',', na_values="")
    for csv_path in filepaths
]

# ... and stack them into one frame with a fresh consecutive index.
df = pd.concat(df_list, ignore_index=True)

# NOTE(review): the sample is unseeded, so test-mode subsets differ between runs.
if test_data:
    df = df.sample(frac=0.1, replace=False)

display(df)


Unnamed: 0,desigcomercial,numbicicletas,numdocas,position,entity_ts,estado
0,417 - Av. Duque de Ávila / Jardim Arco Do Cego,3,23,"{""coordinates"":[-9.142703,38.735352],""type"":""P...",2022-01-01T13:38:03.130Z,active
1,446 - Av. República / Interface Entrecampos,25,40,"{""coordinates"":[-9.14773,38.74456],""type"":""Poi...",2022-01-01T13:38:04.625Z,active
2,416 - Av. República / Avenida Duque de Ávila,6,16,"{""coordinates"":[-9.14553,38.735304],""type"":""Po...",2022-01-01T13:38:03.751Z,active
3,103 - Jardim da Água,16,20,"{""coordinates"":[-9.095019,38.761218],""type"":""P...",2022-01-01T13:38:04.204Z,active
4,415 - Av. Duque de Ávila / Av. Conde Valbom,9,20,"{""coordinates"":[-9.148443,38.735188],""type"":""P...",2022-01-01T13:38:04.179Z,active
...,...,...,...,...,...,...
3938287,479 - Rua Professor Oliveira Marques,10,13,"{""coordinates"":[-9.15724,38.75491],""type"":""Poi...",2023-02-16T11:47:16.000Z,active
3938288,513 - Rua Guiomar Torresão/Metro de Carnide,0,20,"{""coordinates"":[-9.19216,38.758691],""type"":""Po...",2023-02-16T11:47:16.000Z,repair
3938289,410 - Rua da Mesquita / Universidade Nova de L...,0,40,"{""coordinates"":[-9.16015,38.73492],""type"":""Poi...",2023-02-16T12:27:31.000Z,repair
3938290,410 - Rua da Mesquita / Universidade Nova de L...,0,40,"{""coordinates"":[-9.16015,38.73492],""type"":""Poi...",2023-02-16T12:07:30.000Z,repair


In [56]:
# (rows, columns) of the combined frame built from both semester files.
df.shape

(3938292, 6)

# Data Cleaning/Transformation

- Feature Engineering steps
    - In this project, we have the main dataset (df) with capacity data for each Gira bike station. To enrich the analysis, we decided to join this data with hourly meteorological conditions in Lisbon (from Open-Meteo), to understand how the weather may influence the load of the Gira bike stations over time. Lisbon's holiday data was also included.

- Remove data from 2023 in the main df
- Add new features
    - Add holiday col
        - Add column is_holiday(0 or 1) that indicates if the day corresponds to a holiday date in 2022.
    - Spatial:
        - longitude of the gira bike station
        - latitude of the gira bike station
    - Weather
        - weather_desc: The weather conditions at the time of the record (e.g., Overcast, Clear sky, Rainy).
        - rain: The amount of rain (in mm) at the time of the record.
        - temperature: The temperature (in °C) at the time of the record.
    - Create col station load target:
        - station_target_load hourly (numbicicletas / numdocas) * 100
    - station_A_load hourly
    - station_B_load hourly
    - station_C_load hourly
    - station_D_load hourly    

- Aggregation:
    - Aggregate the data hourly to have less rows
- Rename original cols from main df
- Remove id columns: make sure to remove id columns from the df
- Create dataset for data exploration:
    - normal dataset with
- Create dataset for time forecasting (1):
    - transform the dataset
    - create target columns (station load for target station)
    - keep external variables (station_a, station_b, station_c, weather_info, is_holiday)
- Create dataset for time forecasting (2):
    - transform the dataset
    - create target columns (station load for target station)
    - remove external variables

## Remove data from 2023 in the main df

In [57]:
# Preview the first rows of the main frame before filtering.
df.head()

Unnamed: 0,desigcomercial,numbicicletas,numdocas,position,entity_ts,estado
0,417 - Av. Duque de Ávila / Jardim Arco Do Cego,3,23,"{""coordinates"":[-9.142703,38.735352],""type"":""P...",2022-01-01T13:38:03.130Z,active
1,446 - Av. República / Interface Entrecampos,25,40,"{""coordinates"":[-9.14773,38.74456],""type"":""Poi...",2022-01-01T13:38:04.625Z,active
2,416 - Av. República / Avenida Duque de Ávila,6,16,"{""coordinates"":[-9.14553,38.735304],""type"":""Po...",2022-01-01T13:38:03.751Z,active
3,103 - Jardim da Água,16,20,"{""coordinates"":[-9.095019,38.761218],""type"":""P...",2022-01-01T13:38:04.204Z,active
4,415 - Av. Duque de Ávila / Av. Conde Valbom,9,20,"{""coordinates"":[-9.148443,38.735188],""type"":""P...",2022-01-01T13:38:04.179Z,active


In [58]:
# Convert 'entity_ts' column in df to datetime
df['entity_ts'] = pd.to_datetime(df['entity_ts'].str[:-5])  # Remove os últimos 5 caracteres se necessário
df = df[df['entity_ts'].dt.year != 2023]

In [59]:
# (rows, columns) after removing the 2023 records.
df.shape

(3348370, 6)

## Add Holiday column to main df

Flag each record with whether its day is a holiday in Lisbon (1) or not (0).

In [None]:
# Make sure the holiday dates are datetimes so they can be matched against the
# day-level key derived from 'entity_ts'.
df_holidays_lx['date'] = pd.to_datetime(df_holidays_lx['date'])

# Derive the hour and day buckets of every record.
# The lower-case 'h' alias replaces 'H', which is deprecated in recent pandas
# and emits a FutureWarning.
df['entity_ts_hour'] = df['entity_ts'].dt.floor('h')   # timestamp rounded down to the hour
df['entity_ts_day'] = df['entity_ts'].dt.normalize()   # midnight of the same day

# Drop the merge outputs of a previous run so re-executing this cell does not
# produce '_x'/'_y' suffixed duplicates followed by a KeyError on 'holiday_lisbon'.
df = df.drop(columns=['date', 'holiday_lisbon'], errors='ignore')

# Left-join the holiday flag on the day. validate= makes the merge fail loudly
# if a holiday date appears more than once, which would otherwise silently
# duplicate bike records.
df = df.merge(
    df_holidays_lx,
    left_on='entity_ts_day',
    right_on='date',
    how='left',
    validate='many_to_one',
)

# Days absent from the holiday calendar are not holidays.
df['holiday_lisbon'] = df['holiday_lisbon'].fillna(0)

# Show the resulting DataFrame.
display(df.head())

  df['entity_ts_hour'] = df['entity_ts'].dt.floor('H')  # Extrai a hora (arredondada para baixo)


Unnamed: 0,desigcomercial,numbicicletas,numdocas,position,entity_ts,estado,entity_ts_hour,entity_ts_day,date,holiday_lisbon
0,417 - Av. Duque de Ávila / Jardim Arco Do Cego,3,23,"{""coordinates"":[-9.142703,38.735352],""type"":""P...",2022-01-01 13:38:03,active,2022-01-01 13:00:00,2022-01-01,2022-01-01,1.0
1,446 - Av. República / Interface Entrecampos,25,40,"{""coordinates"":[-9.14773,38.74456],""type"":""Poi...",2022-01-01 13:38:04,active,2022-01-01 13:00:00,2022-01-01,2022-01-01,1.0
2,416 - Av. República / Avenida Duque de Ávila,6,16,"{""coordinates"":[-9.14553,38.735304],""type"":""Po...",2022-01-01 13:38:03,active,2022-01-01 13:00:00,2022-01-01,2022-01-01,1.0
3,103 - Jardim da Água,16,20,"{""coordinates"":[-9.095019,38.761218],""type"":""P...",2022-01-01 13:38:04,active,2022-01-01 13:00:00,2022-01-01,2022-01-01,1.0
4,415 - Av. Duque de Ávila / Av. Conde Valbom,9,20,"{""coordinates"":[-9.148443,38.735188],""type"":""P...",2022-01-01 13:38:04,active,2022-01-01 13:00:00,2022-01-01,2022-01-01,1.0


In [61]:
# Number of records per holiday flag (missing flags were filled with 0 after the merge).
df['holiday_lisbon'].value_counts()

holiday_lisbon
0.0    3207409
1.0     140961
Name: count, dtype: int64

## Add longitude and latitude columns of the station

## Add weather info cols in main df

## Add col target - station load rate

## Aggregation at hour level

In [None]:
# TODO - remaining preparation steps (kept as comments so this code cell runs
# instead of raising a SyntaxError under "Run All"):
# - Rename original cols from main df
# - Remove id columns: make sure to remove id columns from the df
# - Create dataset for data exploration:
#     - normal dataset with
# - Create dataset for time forecasting (1):
#     - transform the dataset
#     - create target columns (station load for target station)
#     - keep external variables (station_a, station_b, station_c, weather_info, is_holiday)
# - Create dataset for time forecasting (2):
#     - transform the dataset
#     - create target columns (station load for target station)
#     - remove external variables

## Rename original cols from main df 

## Remove id columns: make sure to remove id columns from the df

# Create dataset for data exploration:

# Create dataset for time forecasting (1)

# Create dataset for time forecasting (2) - excluding external variables