In [12]:
import pandas as pd

In [13]:
# Read the three parquet inputs in one pass, in this order: inhaler events,
# patient records, and the processed air-quality readings (pyarrow engine).
df_inhaler, df_patients, df_air_quality = (
    pd.read_parquet(parquet_path, engine='pyarrow')
    for parquet_path in (
        "../data/raw/iot_inhaler/inhaler_events.parquet",
        "../data/raw/iot_inhaler/patients.parquet",
        "../data/processed/air_quality/air_quality.parquet",
    )
)

In [14]:
# Preview the first five rows of the inhaler-event frame.
df_inhaler.head(n=5)

Unnamed: 0,patient_id,timestamp,puffs,latitude,longitude,district,NO2,O3,PM2.5,hospitalization_flag,date
0,PAT-0001,2024-01-01 00:00:00,0.0,41.360588,2.130338,Sants-Montjuic,42.0,49.0,8.1,0,2024-01-01
1,PAT-0001,2024-01-01 01:00:00,0.0,41.359435,2.131716,Sants-Montjuic,42.0,47.0,8.0,0,2024-01-01
2,PAT-0001,2024-01-01 02:00:00,0.0,41.360395,2.132364,Sants-Montjuic,53.0,46.0,8.0,0,2024-01-01
3,PAT-0001,2024-01-01 03:00:00,0.0,41.358985,2.132746,Sants-Montjuic,47.0,46.0,7.73,0,2024-01-01
4,PAT-0001,2024-01-01 04:00:00,0.0,41.358632,2.133087,Sants-Montjuic,26.0,45.0,7.45,0,2024-01-01


In [15]:
# Preview the first five rows of the patient frame.
df_patients.head(n=5)

Unnamed: 0,patient_id,gender,age,home_district,gema_severity,work_district,symbicort_adherence
0,PAT-0000,F,76,Sant Marti,Moderate Persistent,Horta-Guinardo,0.410187
1,PAT-0001,M,50,Sants-Montjuic,Intermittent,Gracia,0.698253
2,PAT-0002,M,45,Sants-Montjuic,Mild Persistent,Sants-Montjuic,0.917523
3,PAT-0003,F,51,Sant Marti,Intermittent,Sant Marti,0.795053
4,PAT-0004,M,56,Les Corts,Mild Persistent,Les Corts,0.4


In [16]:
# Preview the first five rows of the air-quality frame.
df_air_quality.head(n=5)

Unnamed: 0,timestamp,Estacio,Nom_districte,Longitud,Latitud,NO2 (µg/m³),O3 (µg/m³),PM2.5 (µg/m³)
0,2019-04-02 01:00:00,4.0,Sant Marti,2.2045,41.4039,22.0,47.0,8.0
1,2019-04-02 01:00:00,42.0,Sants-Montjuic,2.1331,41.3788,29.0,47.0,8.0
2,2019-04-02 01:00:00,43.0,Eixample,2.1538,41.3853,27.0,75.0,10.0
3,2019-04-02 01:00:00,44.0,Gracia,2.1534,41.3987,56.0,44.0,8.0
4,2019-04-02 01:00:00,50.0,Ciutat Vella,2.1874,41.3864,32.0,64.0,8.0


In [17]:
# List the distinct district names present in the air-quality frame.
df_air_quality.loc[:, 'Nom_districte'].unique()

array(['Sant Marti', 'Sants-Montjuic', 'Eixample', 'Gracia',
       'Ciutat Vella', 'Horta-Guinardo', 'Les Corts'], dtype=object)

In [18]:
# Rename the air-quality location columns so they use the same names as the
# matching df_inhaler columns (`district`, `longitude`, `latitude`), plus
# `station` for the station identifier.
# The renamed frame is assigned back instead of using inplace=True, which
# pandas discourages. Labels missing from the frame are ignored by default,
# so re-running this cell is a no-op.
df_air_quality = df_air_quality.rename(
    columns={
        'Nom_districte': 'district',
        'Estacio': 'station',
        'Longitud': 'longitude',
        'Latitud': 'latitude',
    }
)

In [None]:
# Columns kept from each source frame when building the merged dataset.
df_inhaler_columns = ['patient_id', 'timestamp', 'puffs', 'district','longitude', 'latitude']
df_air_quality_columns = ['timestamp', 'station', 'district', 'PM2.5 (µg/m³)','NO2 (µg/m³)', 'O3 (µg/m³)']
# NOTE(review): not used by the merge in this cell; kept in case other cells rely on it.
df_patients_columns = ['patient_id', 'gender', 'age', 'gema_severity', 'symbicort_adherence']

# Convert timestamps to datetime and floor to hourly frequency so both frames
# share the same hourly join key.
df_inhaler['timestamp'] = pd.to_datetime(df_inhaler['timestamp']).dt.floor('h')
df_air_quality['timestamp'] = pd.to_datetime(df_air_quality['timestamp']).dt.floor('h')

# Merge air quality data onto the inhaler events. how='left' keeps every
# inhaler row even when no air-quality reading matches.
# validate='many_to_one' makes pandas raise MergeError if (district, timestamp)
# is not unique in the air-quality frame, instead of silently duplicating
# inhaler rows (and therefore puff counts) in the merged output.
merged_df = pd.merge(
    df_inhaler[df_inhaler_columns],
    df_air_quality[df_air_quality_columns],
    on=['district', 'timestamp'],
    how='left',
    validate='many_to_one',
)

In [20]:
# Preview the first five rows of the merged frame.
merged_df.head(n=5)

Unnamed: 0,patient_id,timestamp,puffs,district,longitude,latitude,station,PM2.5 (µg/m³),NO2 (µg/m³),O3 (µg/m³)
0,PAT-0001,2024-01-01 00:00:00,0.0,Sants-Montjuic,2.130338,41.360588,42.0,8.1,42.0,49.0
1,PAT-0001,2024-01-01 01:00:00,0.0,Sants-Montjuic,2.131716,41.359435,42.0,8.0,42.0,47.0
2,PAT-0001,2024-01-01 02:00:00,0.0,Sants-Montjuic,2.132364,41.360395,42.0,8.0,53.0,46.0
3,PAT-0001,2024-01-01 03:00:00,0.0,Sants-Montjuic,2.132746,41.358985,42.0,7.73,47.0,46.0
4,PAT-0001,2024-01-01 04:00:00,0.0,Sants-Montjuic,2.133087,41.358632,42.0,7.45,26.0,45.0


In [21]:
# Write the merged frame as a parquet dataset partitioned by patient_id
# (one sub-directory per patient under the output directory).
# existing_data_behavior='delete_matching' is forwarded by pandas to pyarrow's
# dataset writer: each partition directory being written is cleared first, so
# re-running this cell replaces the previous output instead of adding new,
# uniquely named files next to it (which would duplicate rows on read).
# NOTE(review): requires a pyarrow version whose write_to_dataset accepts
# `existing_data_behavior` — confirm against the project's pinned version.
merged_df.to_parquet(
    "../data/processed/inhaler_air_merged/",
    engine='pyarrow',
    partition_cols=['patient_id'],
    existing_data_behavior='delete_matching',
)