In [2]:
import pandas as pd

In [3]:
# Load the three parquet inputs (inhaler events, patient table, air-quality
# readings) with the pyarrow engine, in that order.
df_inhaler, df_patients, df_air_quality = (
    pd.read_parquet(parquet_path, engine='pyarrow')
    for parquet_path in (
        "../data/raw/iot_inhaler/inhaler_events.parquet",
        "../data/raw/iot_inhaler/patients.parquet",
        "../data/processed/air_quality/air_quality.parquet",
    )
)

In [4]:
# Preview the first rows of the inhaler events table.
df_inhaler.head()

Unnamed: 0,patient_id,timestamp,puffs,latitude,longitude,device_type,district,date
0,PAT-0000,2024-01-01 00:00:00,0,41.402396,2.155284,Symbicort Turbuhaler,Gracia,2024-01-01
1,PAT-0000,2024-01-01 01:00:00,0,41.402397,2.15532,Symbicort Turbuhaler,Gracia,2024-01-01
2,PAT-0000,2024-01-01 02:00:00,0,41.402397,2.155301,Symbicort Turbuhaler,Gracia,2024-01-01
3,PAT-0000,2024-01-01 03:00:00,0,41.40239,2.155292,Symbicort Turbuhaler,Gracia,2024-01-01
4,PAT-0000,2024-01-01 04:00:00,0,41.4024,2.155304,Symbicort Turbuhaler,Gracia,2024-01-01


In [5]:
# Preview the first rows of the patient table.
df_patients.head()

Unnamed: 0,patient_id,gender,age,home_district,gema_severity,work_district,symbicort_adherence,base_exacerbation_risk,has_allergic_rhinitis,has_COPD
0,PAT-0000,F,53,Gracia,Intermittent,Gracia,0.629974,0.062403,1,0
1,PAT-0001,F,38,Sant Marti,Mild Persistent,Ciutat Vella,0.411125,0.199496,1,0
2,PAT-0002,F,30,Eixample,Mild Persistent,Eixample,0.670514,0.129461,0,0
3,PAT-0003,M,36,Les Corts,Mild Persistent,Les Corts,0.375026,0.209243,0,0
4,PAT-0004,M,47,Sants-Montjuic,Intermittent,Sants-Montjuic,0.321727,0.099393,0,0


In [6]:
# Preview the first rows of the air-quality readings.
df_air_quality.head()

Unnamed: 0,timestamp,Estacio,Nom_districte,Longitud,Latitud,NO2 (µg/m³),O3 (µg/m³),PM2.5 (µg/m³)
0,2019-04-02 01:00:00,4.0,Sant Marti,2.2045,41.4039,22.0,47.0,8.0
1,2019-04-02 01:00:00,42.0,Sants-Montjuic,2.1331,41.3788,29.0,47.0,8.0
2,2019-04-02 01:00:00,43.0,Eixample,2.1538,41.3853,27.0,75.0,10.0
3,2019-04-02 01:00:00,44.0,Gracia,2.1534,41.3987,56.0,44.0,8.0
4,2019-04-02 01:00:00,50.0,Ciutat Vella,2.1874,41.3864,32.0,64.0,8.0


In [7]:
# List the distinct district names present in the air-quality data.
df_air_quality['Nom_districte'].unique()

array(['Sant Marti', 'Sants-Montjuic', 'Eixample', 'Gracia',
       'Ciutat Vella', 'Horta-Guinardo', 'Les Corts'], dtype=object)

In [8]:
# Rename the air-quality columns to the English names used in the rest of the
# notebook; 'district' is the key the later merge with the inhaler data joins on.
# The result is reassigned instead of using inplace=True, so the cell reads as a
# plain transformation and can be chained.
df_air_quality = df_air_quality.rename(
    columns={
        'Nom_districte': 'district',
        'Estacio': 'station',
        'Longitud': 'longitude',
        'Latitud': 'latitude',
    }
)

In [9]:
# Columns to keep from each source.
# NOTE(review): both names are assigned again in the following cell, where
# df_inhaler_columns additionally contains 'district' — this cell looks like a
# leftover draft; confirm before relying on it.
df_inhaler_columns = [
    'patient_id',
    'timestamp',
    'puffs',
    'longitude',
    'latitude',
]
df_air_quality_columns = [
    'timestamp',
    'station',
    'district',
    'PM2.5 (µg/m³)',
    'NO2 (µg/m³)',
    'O3 (µg/m³)',
]

In [10]:
# Columns carried over from each source table.
df_inhaler_columns = [
    'patient_id', 'timestamp', 'puffs',
    'district', 'longitude', 'latitude',
]
df_air_quality_columns = [
    'timestamp', 'station', 'district',
    'PM2.5 (µg/m³)', 'NO2 (µg/m³)', 'O3 (µg/m³)',
]
df_patients_columns = [
    'patient_id', 'gender', 'age', 'gema_severity',
    'symbicort_adherence', 'base_exacerbation_risk',
    'has_allergic_rhinitis', 'has_COPD',
]

# Parse both timestamp columns and truncate them to the hour so the two
# tables share the same join key granularity.
for _frame in (df_inhaler, df_air_quality):
    _frame['timestamp'] = pd.to_datetime(_frame['timestamp']).dt.floor('h')

# Left join: every inhaler row is kept and gains the pollutant readings
# recorded for the same district and hour (NaN where none exist).
merged_df = df_inhaler[df_inhaler_columns].merge(
    df_air_quality[df_air_quality_columns],
    how='left',
    on=['district', 'timestamp'],
)

In [11]:
merged_df.head()

Unnamed: 0,patient_id,timestamp,puffs,district,longitude,latitude,station,PM2.5 (µg/m³),NO2 (µg/m³),O3 (µg/m³)
0,PAT-0000,2024-01-01 00:00:00,0,Gracia,2.155284,41.402396,44.0,8.1,42.0,13.0
1,PAT-0000,2024-01-01 01:00:00,0,Gracia,2.15532,41.402397,44.0,8.0,36.0,20.0
2,PAT-0000,2024-01-01 02:00:00,0,Gracia,2.155301,41.402397,44.0,8.0,32.0,25.0
3,PAT-0000,2024-01-01 03:00:00,0,Gracia,2.155292,41.40239,44.0,7.73,53.0,7.0
4,PAT-0000,2024-01-01 04:00:00,0,Gracia,2.155304,41.4024,44.0,7.45,42.0,16.0


In [12]:
# Persist the merged inhaler/air-quality table as a parquet dataset with one
# sub-directory per patient_id.
# existing_data_behavior is forwarded by pandas to pyarrow's write_to_dataset:
# 'delete_matching' clears each patient partition before writing, so re-running
# the notebook replaces the previous export instead of adding a second copy of
# every row under freshly generated file names.
merged_df.to_parquet(
    "../data/processed/inhaler_air_merged/",
    engine='pyarrow',
    partition_cols=['patient_id'],
    existing_data_behavior='delete_matching',
)