In [1]:
import pandas as pd

In [2]:
# Load the three input tables from parquet with the pyarrow engine:
# processed air-quality readings, raw patient records and raw inhaler events.
df_air_quality = pd.read_parquet(
    "../data/processed/air_quality/air_quality.parquet",
    engine='pyarrow',
)
df_patients = pd.read_parquet(
    "../data/raw/iot_inhaler/patients.parquet",
    engine='pyarrow',
)
df_inhaler = pd.read_parquet(
    "../data/raw/iot_inhaler/inhaler_events.parquet",
    engine='pyarrow',
)

In [3]:
# Preview the first five rows of the inhaler events table.
df_inhaler.head(n=5)

Unnamed: 0,patient_id,timestamp,puffs,latitude,longitude,device_type,district,date
0,PAT-0000,2024-01-01 00:00:00,0,41.360109,2.133112,Symbicort Turbuhaler,Sants-Montjuic,2024-01-01
1,PAT-0000,2024-01-01 01:00:00,1,41.360069,2.133109,Symbicort Turbuhaler,Sants-Montjuic,2024-01-01
2,PAT-0000,2024-01-01 02:00:00,0,41.360095,2.133113,Symbicort Turbuhaler,Sants-Montjuic,2024-01-01
3,PAT-0000,2024-01-01 03:00:00,0,41.360104,2.133089,Symbicort Turbuhaler,Sants-Montjuic,2024-01-01
4,PAT-0000,2024-01-01 04:00:00,0,41.360092,2.133057,Symbicort Turbuhaler,Sants-Montjuic,2024-01-01


In [4]:
# Preview the first five rows of the patients table.
df_patients.head(n=5)

Unnamed: 0,patient_id,gender,age,home_district,gema_severity,work_district,symbicort_adherence,base_exacerbation_risk,has_allergic_rhinitis,has_COPD
0,PAT-0000,F,62,Sants-Montjuic,Intermittent,Sants-Montjuic,0.540804,0.073104,0,1
1,PAT-0001,M,45,Les Corts,Mild Persistent,Sant Marti,0.721616,0.115664,1,0
2,PAT-0002,F,39,Sant Marti,Moderate Persistent,Sant Marti,0.587797,0.241747,0,0
3,PAT-0003,M,41,Sant Marti,Intermittent,Sant Marti,0.581592,0.068209,1,0
4,PAT-0004,F,43,Ciutat Vella,Intermittent,Ciutat Vella,0.417218,0.087934,1,0


In [5]:
# Preview the first five rows of the air-quality table.
df_air_quality.head(n=5)

Unnamed: 0,timestamp,Estacio,Nom_districte,Longitud,Latitud,NO2 (µg/m³),O3 (µg/m³),PM2.5 (µg/m³)
0,2019-04-02 01:00:00,4.0,Sant Marti,2.2045,41.4039,22.0,47.0,8.0
1,2019-04-02 01:00:00,42.0,Sants-Montjuic,2.1331,41.3788,29.0,47.0,8.0
2,2019-04-02 01:00:00,43.0,Eixample,2.1538,41.3853,27.0,75.0,10.0
3,2019-04-02 01:00:00,44.0,Gracia,2.1534,41.3987,56.0,44.0,8.0
4,2019-04-02 01:00:00,50.0,Ciutat Vella,2.1874,41.3864,32.0,64.0,8.0


In [6]:
# List the distinct values of the 'Nom_districte' column in the air-quality table.
df_air_quality.loc[:, 'Nom_districte'].unique()

array(['Sant Marti', 'Sants-Montjuic', 'Eixample', 'Gracia',
       'Ciutat Vella', 'Horta-Guinardo', 'Les Corts'], dtype=object)

In [7]:
# Rename the air-quality columns to the lowercase English names
# ('district', 'longitude', 'latitude') that the inhaler table already uses,
# plus 'station' for the station identifier.
# The result is rebound rather than renamed with inplace=True: rename()
# returns a new frame, so the object shown by earlier cells is not mutated
# behind the reader's back.
df_air_quality = df_air_quality.rename(
    columns={
        'Nom_districte': 'district',
        'Estacio': 'station',
        'Longitud': 'longitude',
        'Latitud': 'latitude',
    }
)

In [8]:
# Column subsets to keep from the inhaler and air-quality tables.
# 'district' is included for the inhaler events because the merge with the
# air-quality readings joins on ['district', 'timestamp']; without it the
# selected inhaler frame would lack one of the join keys.
df_inhaler_columns = ['patient_id', 'timestamp', 'puffs', 'district', 'longitude', 'latitude']
df_air_quality_columns = ['timestamp', 'station', 'district', 'PM2.5 (µg/m³)', 'NO2 (µg/m³)', 'O3 (µg/m³)']

In [12]:
# Columns carried forward from each source table.
df_inhaler_columns = [
    'patient_id', 'timestamp', 'puffs',
    'district', 'longitude', 'latitude',
]
df_air_quality_columns = [
    'timestamp', 'station', 'district',
    'PM2.5 (µg/m³)', 'NO2 (µg/m³)', 'O3 (µg/m³)',
]
# NOTE(review): df_patients_columns is defined here but not used in the
# cells visible in this notebook - confirm whether a patient merge is pending.
df_patients_columns = [
    'patient_id', 'gender', 'age', 'gema_severity',
    'symbicort_adherence', 'base_exacerbation_risk',
    'has_allergic_rhinitis', 'has_COPD',
]

# Parse both timestamp columns as datetimes truncated to the hour, so the two
# tables share an hourly join key.
df_air_quality['timestamp'] = pd.to_datetime(df_air_quality['timestamp']).dt.floor('h')
df_inhaler['timestamp'] = pd.to_datetime(df_inhaler['timestamp']).dt.floor('h')

# Left join: every inhaler event is kept and air-quality readings are attached
# where district and hour match (unmatched events get NaN readings).
# NOTE(review): this assumes at most one air-quality row per
# (district, timestamp); several stations in one district would duplicate
# inhaler events in the result - confirm against the processed data.
merged_df = df_inhaler[df_inhaler_columns].merge(
    df_air_quality[df_air_quality_columns],
    how='left',
    on=['district', 'timestamp'],
)

In [14]:
# Preview the first five rows of the merged inhaler / air-quality table.
merged_df.head(n=5)

Unnamed: 0,patient_id,timestamp,puffs,district,longitude,latitude,station,PM2.5 (µg/m³),NO2 (µg/m³),O3 (µg/m³)
0,PAT-0000,2024-01-01 00:00:00,0,Sants-Montjuic,2.133112,41.360109,42.0,8.1,42.0,49.0
1,PAT-0000,2024-01-01 01:00:00,1,Sants-Montjuic,2.133109,41.360069,42.0,8.0,42.0,47.0
2,PAT-0000,2024-01-01 02:00:00,0,Sants-Montjuic,2.133113,41.360095,42.0,8.0,53.0,46.0
3,PAT-0000,2024-01-01 03:00:00,0,Sants-Montjuic,2.133089,41.360104,42.0,7.73,47.0,46.0
4,PAT-0000,2024-01-01 04:00:00,0,Sants-Montjuic,2.133057,41.360092,42.0,7.45,26.0,45.0


In [16]:
# Persist the merged table as a parquet dataset partitioned by patient_id
# (one sub-directory per patient).
# existing_data_behavior='delete_matching' is forwarded by pandas to pyarrow's
# write_to_dataset so each partition directory is cleared before it is
# rewritten. With pyarrow's default ('overwrite_or_ignore') every re-run of
# this cell adds new, uniquely named files beside the old ones, so reading the
# dataset back would return duplicated rows.
merged_df.to_parquet(
    "../data/processed/inhaler_air_merged/",
    engine='pyarrow',
    partition_cols=['patient_id'],
    existing_data_behavior='delete_matching',
)