In [3]:
import os
# The target city is taken from the CITY environment variable; "tromso" is
# used when the variable is not set.
city = os.environ.get("CITY", "tromso")
print("Running preprocessing for:", city)

Running preprocessing for: tromso


In [4]:
from pathlib import Path
import pandas as pd

# Work out the project root from the current working directory: if the
# notebook was started inside an 'airquality' and/or 'notebooks' directory,
# climb out of it (checked in that order) so paths resolve from the root.
root_dir = Path().absolute()
for subdir in ('airquality', 'notebooks'):
    if root_dir.name == subdir:
        root_dir = root_dir.parent
root_dir = str(root_dir)
print("Local environment")

# Location of the per-city CSV, built from the root and the selected city.
csv_file = f"{root_dir}/data/{city}.csv"

Local environment


In [5]:
# Read the city's CSV with 'date' parsed as datetimes and whitespace after
# delimiters skipped, then tag every row with the city name.
df = pd.read_csv(
    csv_file,
    skipinitialspace=True,
    parse_dates=['date'],
).assign(city=city)
df

Unnamed: 0,date,pm25,pm10,no2,city
0,2025-11-01,19.0,7.0,,tromso
1,2025-11-02,19.0,7.0,,tromso
2,2025-11-03,20.0,5.0,,tromso
3,2025-11-04,15.0,5.0,,tromso
4,2025-11-05,16.0,5.0,,tromso
...,...,...,...,...,...
4143,2015-10-25,,,22.0,tromso
4144,2015-10-26,,,19.0,tromso
4145,2015-10-27,,,29.0,tromso
4146,2015-10-28,,,31.0,tromso


In [8]:
# Work on an independent copy holding only date, pm25, pm10 and city, with
# both particulate columns stored as 32-bit floats.
df_aq = df.loc[:, ['date', 'pm25', 'pm10', 'city']].copy()
for pollutant in ('pm25', 'pm10'):
    df_aq[pollutant] = df_aq[pollutant].astype('float32')
df_aq

Unnamed: 0,date,pm25,pm10,city
0,2025-11-01,19.0,7.0,tromso
1,2025-11-02,19.0,7.0,tromso
2,2025-11-03,20.0,5.0,tromso
3,2025-11-04,15.0,5.0,tromso
4,2025-11-05,16.0,5.0,tromso
...,...,...,...,...
4143,2015-10-25,,,tromso
4144,2015-10-26,,,tromso
4145,2015-10-27,,,tromso
4146,2015-10-28,,,tromso


In [13]:
# Flag rows where pm25 is above 100 while pm10 is below twice the pm10 median.
# Rows with a missing pm25 or pm10 compare as False and are therefore not flagged.
pm10_ceiling = df_aq["pm10"].median() * 2
pm25_is_extreme = df_aq["pm25"].gt(100)
pm10_is_ordinary = df_aq["pm10"].lt(pm10_ceiling)
df_aq["suspect_spike"] = pm25_is_extreme & pm10_is_ordinary

In [14]:
# Show only the rows that were flagged as suspect spikes.
flagged_rows = df_aq["suspect_spike"]
df_suspect_spikes = df_aq.loc[flagged_rows]
df_suspect_spikes

Unnamed: 0,date,pm25,pm10,city,suspect_spike
445,2024-01-19,102.0,7.0,tromso,True
446,2024-01-20,221.0,8.0,tromso,True
447,2024-01-21,310.0,9.0,tromso,True
448,2024-01-22,224.0,7.0,tromso,True
449,2024-01-23,152.0,6.0,tromso,True
1054,2022-04-26,169.0,6.0,tromso,True
1130,2022-01-20,104.0,11.0,tromso,True
3647,2014-07-15,107.0,14.0,tromso,True


These observations seem physically impossible and were probably caused by faults in the sensor. These entries are therefore removed below.

In [15]:
# Recompute the spike condition (pm25 above 100 with pm10 below twice the
# pm10 median) and keep an independent copy of every row that does not match.
pm10_limit = df_aq["pm10"].median() * 2
suspect_spikes_mask = df_aq["pm25"].gt(100) & df_aq["pm10"].lt(pm10_limit)

df_clean = df_aq.loc[~suspect_spikes_mask].copy()
df_clean

Unnamed: 0,date,pm25,pm10,city,suspect_spike
0,2025-11-01,19.0,7.0,tromso,False
1,2025-11-02,19.0,7.0,tromso,False
2,2025-11-03,20.0,5.0,tromso,False
3,2025-11-04,15.0,5.0,tromso,False
4,2025-11-05,16.0,5.0,tromso,False
...,...,...,...,...,...
4143,2015-10-25,,,tromso,False
4144,2015-10-26,,,tromso,False
4145,2015-10-27,,,tromso,False
4146,2015-10-28,,,tromso,False


In [16]:
# Report the row counts on either side of the spike filter. The second line
# prints the size of the cleaned frame, so it is labelled "after".
print(f'CSV length before cleaning: {len(df_aq)}')
print(f'CSV length after cleaning: {len(df_clean)}')

CSV length before cleaning: 4148
CSV length before cleaning: 4140


In [17]:
# Write the cleaned frame to csv_file without the index column.
# NOTE(review): csv_file is the same path the data was loaded from, so this
# replaces the original file contents (including columns not kept in df_clean).
# NOTE(review): df_clean still carries the 'suspect_spike' helper column at this
# point, so it is written too; confirm downstream readers expect it.
df_clean.to_csv(path_or_buf=csv_file, index=False)