# Creating a CSV file holding all appropriate data for Comune di Milano
#### Initialising the dataframe

In [161]:
import pandas as pd
import numpy as np

date_range = pd.date_range(start="2017-01-01", end="2024-12-01", freq="MS") 

df = pd.DataFrame({
    "row_id": range(len(date_range)),
    "date": date_range.strftime("%Y-%m")
})

df.head()

Unnamed: 0,row_id,date
0,0,2017-01
1,1,2017-02
2,2,2017-03
3,3,2017-04
4,4,2017-05


#### Adding data for the pollutants

In [162]:
pollutants = ["C6H6", "CO_8h", "NO2", "O3", "PM10", "PM25", "SO2"]

for pol in pollutants:
    df[pol] = pd.NA

for i in range(1, 9):
    filename = f"../original-datasets/ds5-{i}.csv"
    temp = pd.read_csv(filename, sep=";")

    temp["inquinante"] = temp["inquinante"].str.strip()
    temp["data"] = pd.to_datetime(temp["data"], format="%Y-%m-%d")
    temp["month"] = temp["data"].dt.strftime("%Y-%m")

    monthly_means = temp.groupby(["month", "inquinante"])["valore"].mean().reset_index()
    pivoted = monthly_means.pivot(index="month", columns="inquinante", values="valore").reset_index()

    df_merged = df.merge(pivoted, how="left", left_on="date", right_on="month", suffixes=('', '_new'))

    for pol in pollutants:
        new_col = f"{pol}_new"
        if new_col in df_merged.columns:
            df_merged[pol] = df_merged[pol].combine_first(df_merged[new_col])

    df = df_merged.drop(columns=[col for col in df_merged.columns if col.endswith('_new') or col == "month"])

df.head()


  df_merged[pol] = df_merged[pol].combine_first(df_merged[new_col])


Unnamed: 0,row_id,date,C6H6,CO_8h,NO2,O3,PM10,PM25,SO2
0,0,2017-01,3.746429,1.930882,108.92437,30.117647,68.744681,53.870968,3.96875
1,1,2017-02,3.363889,1.644737,83.210884,26.035714,70.789474,52.131579,5.9
2,2,2017-03,1.93012,1.231765,95.21875,74.424242,46.333333,30.973684,6.6
3,3,2017-04,0.85098,0.885294,71.869231,94.098039,28.604167,22.0,5.0
4,4,2017-05,0.872727,0.992208,69.894737,97.846154,20.238095,12.404762,4.416667


#### Low emission bus data

In [163]:
ds6 = pd.read_csv("../original-datasets/ds6.csv", sep=";")
ds6.columns = ds6.columns.str.strip()

ds6 = ds6.map(lambda x: x.strip() if isinstance(x, str) else x)

indicator_col = "Autobus a basse emissioni - indicatore per 100 autobus"
absolute_col = "Autobus a basse emissioni - valore assoluto"

df[indicator_col] = np.nan
df[absolute_col] = np.nan

for _, row in ds6.iterrows():
    year_str = str(int(row["Anno"]))
    match = df["date"].str.startswith(year_str)

    df.loc[match, "Autobus a basse emissioni - indicatore per 100 autobus"] = row["Autobus a basse emissioni - indicatore per 100 autobus"]
    df.loc[match, "Autobus a basse emissioni - valore assoluto"] = row["Autobus a basse emissioni - valore assoluto"]

df[df["date"].str.startswith("2017")][["date", indicator_col, absolute_col]].dropna()



Unnamed: 0,date,Autobus a basse emissioni - indicatore per 100 autobus,Autobus a basse emissioni - valore assoluto
0,2017-01,0.4,5.0
1,2017-02,0.4,5.0
2,2017-03,0.4,5.0
3,2017-04,0.4,5.0
4,2017-05,0.4,5.0
5,2017-06,0.4,5.0
6,2017-07,0.4,5.0
7,2017-08,0.4,5.0
8,2017-09,0.4,5.0
9,2017-10,0.4,5.0


#### Public bus emission classes

In [164]:
ds7 = pd.read_csv("../original-datasets/ds7.csv", sep=";")
ds7.columns = ds7.columns.str.strip()

emission_classes = ["Euro 4 o inferiore", "Euro 5", "Euro 6"]
for cls in emission_classes:
    df[f"Bus {cls}"] = np.nan

for idx, row in ds7.iterrows():
    year = str(row["Anno"])
    emission_class = f"Bus {row['Classe di emissione'].strip()}"
    value = row["Percentuale autobus utilizzati"]
    mask = df["date"].str.startswith(year)
    df.loc[mask, emission_class] = value

df.head()

Unnamed: 0,row_id,date,C6H6,CO_8h,NO2,O3,PM10,PM25,SO2,Autobus a basse emissioni - indicatore per 100 autobus,Autobus a basse emissioni - valore assoluto,Bus Euro 4 o inferiore,Bus Euro 5,Bus Euro 6
0,0,2017-01,3.746429,1.930882,108.92437,30.117647,68.744681,53.870968,3.96875,0.4,5.0,50.2,31.8,18.0
1,1,2017-02,3.363889,1.644737,83.210884,26.035714,70.789474,52.131579,5.9,0.4,5.0,50.2,31.8,18.0
2,2,2017-03,1.93012,1.231765,95.21875,74.424242,46.333333,30.973684,6.6,0.4,5.0,50.2,31.8,18.0
3,3,2017-04,0.85098,0.885294,71.869231,94.098039,28.604167,22.0,5.0,0.4,5.0,50.2,31.8,18.0
4,4,2017-05,0.872727,0.992208,69.894737,97.846154,20.238095,12.404762,4.416667,0.4,5.0,50.2,31.8,18.0


#### Annual public transport passengers

In [165]:
# Load ds8.csv
ds8 = pd.read_csv("../original-datasets/ds8.csv", sep=";")
ds8.columns = ds8.columns.str.strip()

# Initialize column
df["Domanda di TPL - Passeggeri annui per abitante"] = np.nan

# Assign values
for idx, row in ds8.iterrows():
    year = str(row["Anno"])
    value = row["Domanda di TPL - Passeggeri annui per abitante"]
    mask = df["date"].str.startswith(year)
    df.loc[mask, "Domanda di TPL - Passeggeri annui per abitante"] = value / 12

df.head()


Unnamed: 0,row_id,date,C6H6,CO_8h,NO2,O3,PM10,PM25,SO2,Autobus a basse emissioni - indicatore per 100 autobus,Autobus a basse emissioni - valore assoluto,Bus Euro 4 o inferiore,Bus Euro 5,Bus Euro 6,Domanda di TPL - Passeggeri annui per abitante
0,0,2017-01,3.746429,1.930882,108.92437,30.117647,68.744681,53.870968,3.96875,0.4,5.0,50.2,31.8,18.0,
1,1,2017-02,3.363889,1.644737,83.210884,26.035714,70.789474,52.131579,5.9,0.4,5.0,50.2,31.8,18.0,
2,2,2017-03,1.93012,1.231765,95.21875,74.424242,46.333333,30.973684,6.6,0.4,5.0,50.2,31.8,18.0,
3,3,2017-04,0.85098,0.885294,71.869231,94.098039,28.604167,22.0,5.0,0.4,5.0,50.2,31.8,18.0,
4,4,2017-05,0.872727,0.992208,69.894737,97.846154,20.238095,12.404762,4.416667,0.4,5.0,50.2,31.8,18.0,


#### Number of vehicles in the city per type

In [166]:
ds9 = pd.read_csv("../original-datasets/ds9.csv", sep=";")
ds9.columns = ds9.columns.str.strip()

vehicle_columns = [col for col in ds9.columns if col != "Anno"]

for col in vehicle_columns:
    df[col] = pd.NA

months_per_year = 12

for idx, row in ds9.iterrows():
    year = row["Anno"]
    start_idx = (year - 2017) * months_per_year
    end_idx = start_idx + months_per_year

    if end_idx > len(df):
        end_idx = len(df)

    for col in vehicle_columns:
        df.loc[start_idx:end_idx - 1, col] = row[col]

df.head()


Unnamed: 0,row_id,date,C6H6,CO_8h,NO2,O3,PM10,PM25,SO2,Autobus a basse emissioni - indicatore per 100 autobus,...,AUTOCARRI TRASPORTO MERCI,AUTOVEICOLI SPECIALI - SPECIFICI,AUTOVETTURE,MOTOCARRI E QUADRICICLI TRASPORTO MERCI,MOTOCICLI,MOTOVEICOLI E QUADRICICLI SPECIALI - SPECIFICI,RIMORCHI E SEMIRIMORCHI SPECIALI - SPECIFICI,RIMORCHI E SEMIRIMORCHI TRASPORTO MERCI,TRATTORI STRADALI O MOTRICI,ALTRI VEICOLI
0,0,2017-01,3.746429,1.930882,108.92437,30.117647,68.744681,53.870968,3.96875,0.4,...,63212,11111,700723,1111,166029,1464,2536,4433,2872,0
1,1,2017-02,3.363889,1.644737,83.210884,26.035714,70.789474,52.131579,5.9,0.4,...,63212,11111,700723,1111,166029,1464,2536,4433,2872,0
2,2,2017-03,1.93012,1.231765,95.21875,74.424242,46.333333,30.973684,6.6,0.4,...,63212,11111,700723,1111,166029,1464,2536,4433,2872,0
3,3,2017-04,0.85098,0.885294,71.869231,94.098039,28.604167,22.0,5.0,0.4,...,63212,11111,700723,1111,166029,1464,2536,4433,2872,0
4,4,2017-05,0.872727,0.992208,69.894737,97.846154,20.238095,12.404762,4.416667,0.4,...,63212,11111,700723,1111,166029,1464,2536,4433,2872,0


#### Copying missing data for air quality (2023-05 to 2023-09 into 2024-05 to 2024-09)

In [167]:
import calendar

air_cols = ["C6H6", "CO_8h", "NO2", "O3", "PM10", "PM25", "SO2"]

for col in air_cols:
    for month in range(5, 10):
        target_date = f"2024-{month:02d}"
        source_date = f"2023-{month:02d}"
        df.loc[df["date"] == target_date, col] = df.loc[df["date"] == source_date, col].values


#### Extrapolating trends for vehicles and transported passengers based on previous data

In [168]:
non_air_cols = [col for col in df.columns if col not in air_cols + ["row_id", "date"]]

for col in non_air_cols:
    annual_means = df.groupby(df["date"].str[:4])[col].mean().dropna()

    valid_years = [int(y) for y in annual_means.index if y.isdigit()]
    
    if len(valid_years) < 2:
        continue

    y_start, y_end = valid_years[0], valid_years[-1]
    v_start, v_end = annual_means[str(y_start)], annual_means[str(y_end)]
    avg_delta = (v_end - v_start) / (y_end - y_start)

    for target_year in range(y_end + 1, 2025):
        extrapolated_value = v_end + avg_delta * (target_year - y_end)
        year_mask = df["date"].str.startswith(str(target_year))
        df.loc[year_mask, col] = extrapolated_value


df[non_air_cols] = df[non_air_cols].apply(lambda col: col.round(2) if col.dtype != "object" else col)

#### Dropping and renaming columns for data alignment

In [169]:
df = df.drop(columns=["row_id"])
df = df.rename(columns={
    "date": "DATE",
    "C6H6": "C6H6 (BENZENE)",
    "CO_8h": "CO (MONOSSIDO DI CARBONIO)",
    "NO2": "NO2 (DIOSSIDO DI AZOTO)",
    "O3": "O3 (OZONO)",
    "SO2": "SO2 (DIOSSIDO DI SULFURIO)",
    "Domanda di TPL - Passeggeri annui per abitante": "URBAN PASSENGERS MILAN"
})

#### Display all data for checking the correctness

In [170]:
import numpy as np
df = df.map(lambda x: np.nan if pd.isna(x) else x)
df.head(96)

Unnamed: 0,DATE,C6H6 (BENZENE),CO (MONOSSIDO DI CARBONIO),NO2 (DIOSSIDO DI AZOTO),O3 (OZONO),PM10,PM25,SO2 (DIOSSIDO DI SULFURIO),Autobus a basse emissioni - indicatore per 100 autobus,Autobus a basse emissioni - valore assoluto,...,AUTOCARRI TRASPORTO MERCI,AUTOVEICOLI SPECIALI - SPECIFICI,AUTOVETTURE,MOTOCARRI E QUADRICICLI TRASPORTO MERCI,MOTOCICLI,MOTOVEICOLI E QUADRICICLI SPECIALI - SPECIFICI,RIMORCHI E SEMIRIMORCHI SPECIALI - SPECIFICI,RIMORCHI E SEMIRIMORCHI TRASPORTO MERCI,TRATTORI STRADALI O MOTRICI,ALTRI VEICOLI
0,2017-01,3.746429,1.930882,108.924370,30.117647,68.744681,53.870968,3.968750,0.4,5.0,...,63212.0,11111.0,700723.0,1111.0,166029.0,1464.0,2536.0,4433.0,2872.0,0.0
1,2017-02,3.363889,1.644737,83.210884,26.035714,70.789474,52.131579,5.900000,0.4,5.0,...,63212.0,11111.0,700723.0,1111.0,166029.0,1464.0,2536.0,4433.0,2872.0,0.0
2,2017-03,1.930120,1.231765,95.218750,74.424242,46.333333,30.973684,6.600000,0.4,5.0,...,63212.0,11111.0,700723.0,1111.0,166029.0,1464.0,2536.0,4433.0,2872.0,0.0
3,2017-04,0.850980,0.885294,71.869231,94.098039,28.604167,22.000000,5.000000,0.4,5.0,...,63212.0,11111.0,700723.0,1111.0,166029.0,1464.0,2536.0,4433.0,2872.0,0.0
4,2017-05,0.872727,0.992208,69.894737,97.846154,20.238095,12.404762,4.416667,0.4,5.0,...,63212.0,11111.0,700723.0,1111.0,166029.0,1464.0,2536.0,4433.0,2872.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,2024-08,0.539024,0.421429,42.000000,126.730769,18.714286,10.702128,2.367647,5.3,75.0,...,64614.8,11889.4,678169.0,1092.8,196157.0,1753.8,3733.0,6292.2,4357.4,0.0
92,2024-09,0.901587,0.890909,64.182692,111.833333,24.589744,13.800000,4.809524,5.3,75.0,...,64614.8,11889.4,678169.0,1092.8,196157.0,1753.8,3733.0,6292.2,4357.4,0.0
93,2024-10,1.206061,0.855682,48.172727,46.568182,25.486111,15.174603,3.954545,5.3,75.0,...,64614.8,11889.4,678169.0,1092.8,196157.0,1753.8,3733.0,6292.2,4357.4,0.0
94,2024-11,2.166667,1.408333,61.533333,27.416667,44.424242,30.196078,6.055556,5.3,75.0,...,64614.8,11889.4,678169.0,1092.8,196157.0,1753.8,3733.0,6292.2,4357.4,0.0


#### Export to CSV

In [171]:
df.to_csv("../mashup-datasets/MD2.csv", index=False)