In [2]:
from shapely.geometry import Point
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import os

### Load air_quality datasets

In [3]:
df_contaminants = pd.read_csv("../data/raw/qualitat-aire-contaminants/qualitat_aire_contaminants.csv")
df_contaminants['Contaminant_Units'] = df_contaminants['Desc_Contaminant'] + ' (' + df_contaminants['Unitats'] + ')'
df_contaminants.drop(columns=['Desc_Contaminant', 'Unitats'], inplace=True)
df_contaminants.head(50)

Unnamed: 0,Codi_Contaminant,Contaminant_Units
0,1,SO2 (µg/m³)
1,6,CO (mg/m³)
2,7,NO (µg/m³)
3,8,NO2 (µg/m³)
4,9,PM2.5 (µg/m³)
5,10,PM10 (µg/m³)
6,12,NOx (µg/m³)
7,14,O3 (µg/m³)
8,22,Black Carbon (µg/m³)
9,101,SO2* (µg/m³)


In [4]:
df_contaminants['Contaminant_Units'].unique()

array(['SO2 (µg/m³)', 'CO (mg/m³)', 'NO (µg/m³)', 'NO2 (µg/m³)',
       'PM2.5 (µg/m³)', 'PM10 (µg/m³)', 'NOx (µg/m³)', 'O3 (µg/m³)',
       'Black Carbon (µg/m³)', 'SO2* (µg/m³)', 'CO* (mg/m³)',
       'NO* (µg/m³)', 'NO2* (µg/m³)', 'PM2.5* (µg/m³)', 'PM10* (µg/m³)',
       'Nox* (µg/m³)', 'O3* (µg/m³)', nan,
       'Flow C (Mesura interna equip)  ( )', 'Biomassa Black Carbon (%)'],
      dtype=object)

In [5]:
df_stations = pd.read_csv("../data/raw/qualitat-aire-estacions-bcn/2019_Qualitat_Aire_Estacions.csv")
print(df_stations.columns)
df_stations = df_stations[['Estacio', 'Nom_districte', 'Longitud', 'Latitud']].sort_values(by='Estacio').drop_duplicates()
df_stations.head()

Index(['Estacio', 'nom_cabina', 'codi_dtes', 'zqa', 'codi_eoi', 'Longitud',
       'Latitud', 'ubicacio', 'Codi_districte', 'Nom_districte', 'Codi_barri',
       'Nom_barri', 'Clas_1', 'Clas_2', 'Codi_Contaminant'],
      dtype='object')


Unnamed: 0,Estacio,Nom_districte,Longitud,Latitud
27,4,Sant Marti,2.2045,41.4039
29,42,Sants-Montjuic,2.1331,41.3788
4,43,Eixample,2.1538,41.3853
17,44,Gracia,2.1534,41.3987
3,50,Ciutat Vella,2.1874,41.3864


In [6]:
def load_and_concat_csvs(folder_path):
    # Get all CSV files in the folder
    csv_files = []
    for file in os.listdir(folder_path):
        full_path = os.path.join(folder_path, file)
        if os.path.isfile(full_path) and file.endswith(".csv"):
            csv_files.append(full_path)
    
    # Read files (skip headers after the first file)
    df_list = []
    for i, file in enumerate(csv_files):
        df = pd.read_csv(file)
        df_list.append(df)
    
    combined_df = pd.concat(df_list, ignore_index=True)
    return combined_df

# Usage
folder_path = "../data/raw/qualitat-aire-detall-bcn"
df_air = load_and_concat_csvs(folder_path)
df_air.head()

Unnamed: 0,CODI_PROVINCIA,PROVINCIA,CODI_MUNICIPI,MUNICIPI,ESTACIO,CODI_CONTAMINANT,ANY,MES,DIA,H01,...,H20,V20,H21,V21,H22,V22,H23,V23,H24,V24
0,8,Barcelona,19,Barcelona,4,7,2019,4,2,1.0,...,8.0,V,3.0,V,1.0,V,1.0,V,1.0,V
1,8,Barcelona,19,Barcelona,4,7,2019,4,3,1.0,...,3.0,V,2.0,V,2.0,V,2.0,V,2.0,V
2,8,Barcelona,19,Barcelona,4,7,2019,4,4,1.0,...,2.0,V,2.0,V,1.0,V,2.0,V,2.0,V
3,8,Barcelona,19,Barcelona,4,7,2019,4,5,2.0,...,3.0,V,2.0,V,2.0,V,1.0,V,2.0,V
4,8,Barcelona,19,Barcelona,4,7,2019,4,6,1.0,...,6.0,V,2.0,V,2.0,V,1.0,V,3.0,V


In [7]:
valid_values = [8, 9, 14, 108, 109, 114]
duplicated_values = [108, 109, 114]

df_air = df_air[df_air['CODI_CONTAMINANT'].isin(valid_values)]
df_air['CODI_CONTAMINANT'] = df_air['CODI_CONTAMINANT'].apply(lambda x: x-100 if x in duplicated_values else x)
df_air.head()

Unnamed: 0,CODI_PROVINCIA,PROVINCIA,CODI_MUNICIPI,MUNICIPI,ESTACIO,CODI_CONTAMINANT,ANY,MES,DIA,H01,...,H20,V20,H21,V21,H22,V22,H23,V23,H24,V24
29,8,Barcelona,19,Barcelona,4,8,2019,4,2,22.0,...,72.0,V,44.0,V,21.0,V,12.0,V,9.0,V
30,8,Barcelona,19,Barcelona,4,8,2019,4,3,6.0,...,31.0,V,27.0,V,28.0,V,38.0,V,40.0,V
31,8,Barcelona,19,Barcelona,4,8,2019,4,4,5.0,...,27.0,V,24.0,V,26.0,V,60.0,V,66.0,V
32,8,Barcelona,19,Barcelona,4,8,2019,4,5,67.0,...,26.0,V,18.0,V,18.0,V,15.0,V,16.0,V
33,8,Barcelona,19,Barcelona,4,8,2019,4,6,29.0,...,28.0,V,32.0,V,44.0,V,60.0,V,74.0,V


### Merge dataframes

In [8]:
# Melt the hourly columns into rows
df_melted = df_air.melt(
    id_vars=['ESTACIO', 'CODI_CONTAMINANT', 'ANY', 'MES', 'DIA'],
    value_vars=[f'H{i:02d}' for i in range(1, 25)],
    var_name='HourColumn',
    value_name='Value'
)

# Convert hour code to integer (H01=1 -> 01:00, H24=24 -> next day's 00:00)
df_melted['hour'] = df_melted['HourColumn'].str[1:].astype(int)

# Create base date from components
df_melted['base_date'] = pd.to_datetime(
    df_melted['ANY'].astype(str) + '-' +
    df_melted['MES'].astype(str).str.zfill(2) + '-' +
    df_melted['DIA'].astype(str).str.zfill(2)
)

# Add hours as timedelta (handles H24 automatically)
df_melted['timestamp'] = df_melted['base_date'] + pd.to_timedelta(df_melted['hour'], unit='h')

# Select final columns
result = df_melted[['timestamp', 'ESTACIO', 'CODI_CONTAMINANT', 'Value']]\
           .sort_values('timestamp')\
           .reset_index(drop=True)

df_air_clean = result
df_air_clean.head()

Unnamed: 0,timestamp,ESTACIO,CODI_CONTAMINANT,Value
0,2019-04-02 01:00:00,4,8,22.0
1,2019-04-02 01:00:00,57,14,44.0
2,2019-04-02 01:00:00,54,8,98.0
3,2019-04-02 01:00:00,50,14,64.0
4,2019-04-02 01:00:00,50,8,32.0


In [9]:
df_merged = df_air_clean.merge(df_stations, left_on='ESTACIO', right_on='Estacio', how='left')
df_merged = df_merged.merge(df_contaminants, left_on='CODI_CONTAMINANT', right_on='Codi_Contaminant', how='left')
df_merged.drop(columns=['ESTACIO', 'CODI_CONTAMINANT'], inplace=True)
df_merged.sort_values(by=['timestamp', 'Estacio', 'Nom_districte'])
df_merged.head(10)

Unnamed: 0,timestamp,Value,Estacio,Nom_districte,Longitud,Latitud,Codi_Contaminant,Contaminant_Units
0,2019-04-02 01:00:00,22.0,4.0,Sant Marti,2.2045,41.4039,8,NO2 (µg/m³)
1,2019-04-02 01:00:00,44.0,57.0,Les Corts,2.1151,41.3875,14,O3 (µg/m³)
2,2019-04-02 01:00:00,98.0,54.0,Horta-Guinardo,2.148,41.4261,8,NO2 (µg/m³)
3,2019-04-02 01:00:00,64.0,50.0,Ciutat Vella,2.1874,41.3864,14,O3 (µg/m³)
4,2019-04-02 01:00:00,32.0,50.0,Ciutat Vella,2.1874,41.3864,8,NO2 (µg/m³)
5,2019-04-02 01:00:00,44.0,44.0,Gracia,2.1534,41.3987,14,O3 (µg/m³)
6,2019-04-02 01:00:00,28.0,57.0,Les Corts,2.1151,41.3875,8,NO2 (µg/m³)
7,2019-04-02 01:00:00,56.0,44.0,Gracia,2.1534,41.3987,8,NO2 (µg/m³)
8,2019-04-02 01:00:00,75.0,43.0,Eixample,2.1538,41.3853,14,O3 (µg/m³)
9,2019-04-02 01:00:00,27.0,43.0,Eixample,2.1538,41.3853,8,NO2 (µg/m³)


In [10]:
df_merged['Contaminant_Units'].unique()

array(['NO2 (µg/m³)', 'O3 (µg/m³)', 'PM2.5 (µg/m³)'], dtype=object)

In [11]:
# Pivot the table to create columns for each Contaminant_Units
df_pivot = df_merged.pivot_table(
    index=['timestamp', 'Estacio', 'Nom_districte', 'Longitud', 'Latitud'],
    columns='Contaminant_Units',
    values='Value',
    aggfunc='first'  
).reset_index()

# Clean up column names
df_pivot.columns.name = None  
df_pivot = df_pivot.rename_axis(columns=None)
df_pivot.head()

Unnamed: 0,timestamp,Estacio,Nom_districte,Longitud,Latitud,NO2 (µg/m³),O3 (µg/m³),PM2.5 (µg/m³)
0,2019-04-02 01:00:00,4.0,Sant Marti,2.2045,41.4039,22.0,,
1,2019-04-02 01:00:00,42.0,Sants-Montjuic,2.1331,41.3788,29.0,,
2,2019-04-02 01:00:00,43.0,Eixample,2.1538,41.3853,27.0,75.0,
3,2019-04-02 01:00:00,44.0,Gracia,2.1534,41.3987,56.0,44.0,
4,2019-04-02 01:00:00,50.0,Ciutat Vella,2.1874,41.3864,32.0,64.0,


In [12]:
# District-Hour Median Imputation (Spatial-Temporal)
df_pivot['hour'] = df_pivot['timestamp'].dt.hour

for poll in ['NO2 (µg/m³)', 'O3 (µg/m³)', 'PM2.5 (µg/m³)']:
    # 1. District-hour median
    df_pivot[poll] = df_pivot.groupby(['Nom_districte', 'hour'])[poll].transform(
        lambda x: x.fillna(x.median()))
    
    # 2. District-day median (same district, all hours)
    df_pivot[poll] = df_pivot.groupby('Nom_districte')[poll].transform(
        lambda x: x.fillna(x.median()))
    
    # 3. City-hour median (all districts, same hour)
    df_pivot[poll] = df_pivot.groupby('hour')[poll].transform(
        lambda x: x.fillna(x.median()))
    
    # 4. Global median (last resort)
    df_pivot[poll] = df_pivot[poll].fillna(df_pivot[poll].median())

# Final cleanup
df_pivot = df_pivot.drop(columns=['hour'])
df_pivot = df_pivot.sort_values(['timestamp', 'Estacio'], ascending=True)  

df_pivot.head()

Unnamed: 0,timestamp,Estacio,Nom_districte,Longitud,Latitud,NO2 (µg/m³),O3 (µg/m³),PM2.5 (µg/m³)
0,2019-04-02 01:00:00,4.0,Sant Marti,2.2045,41.4039,22.0,47.0,8.0
1,2019-04-02 01:00:00,42.0,Sants-Montjuic,2.1331,41.3788,29.0,47.0,8.0
2,2019-04-02 01:00:00,43.0,Eixample,2.1538,41.3853,27.0,75.0,10.0
3,2019-04-02 01:00:00,44.0,Gracia,2.1534,41.3987,56.0,44.0,8.0
4,2019-04-02 01:00:00,50.0,Ciutat Vella,2.1874,41.3864,32.0,64.0,8.0


In [13]:
# Save processed table
df_pivot.to_parquet("../data/processed/air_quality/air_quality.parquet")