## Filtering DS1

In [25]:
import pandas as pd

# Load the raw DS1 export: semicolon-separated, every field requested as text,
# with the two interval columns handed to the date parser.
ds1 = pd.read_csv(
    "../original-datasets/ds1.csv",
    sep=';',
    parse_dates=['DATA_INIZIO', 'DATA_FINE'],
    dtype='str',
)


In [26]:
# Clean DS1 and derive the yearly DATE key used by the aggregation below.
# The cell rebinds `ds1` and removes DATA_INIZIO / DATA_FINE, so re-run the
# load cell above before running this one a second time.
ds1 = (
    ds1
    # Parse the interval start as timezone-aware UTC datetimes
    .assign(DATA_INIZIO=lambda df: pd.to_datetime(df['DATA_INIZIO'], utc=True))
    # Drop rows that lack a start date or a measured value
    .dropna(subset=['DATA_INIZIO', 'VALORE'])
    .assign(
        # Yearly key as a string (e.g. '2017'). Reading the year directly gives
        # the same labels as `.dt.to_period('Y').astype('str')` without the
        # "will drop timezone information" UserWarning that `to_period` raises
        # on timezone-aware data.
        # NOTE(review): the year is taken in UTC, so a record stamped at local
        # midnight on 1 January may land in the previous year - confirm intended.
        DATE=lambda df: df['DATA_INIZIO'].dt.year.astype('str'),
        # Convert the value column from text to float
        VALORE=lambda df: df['VALORE'].astype(float),
    )
)

# Convert mg/m3 readings to ug/m3 (must happen before UM is overwritten)
ds1.loc[ds1['UM'] == 'mg/m3', 'VALORE'] *= 1000

# Every value is now in ug/m3; the raw interval columns are no longer needed
ds1 = ds1.assign(UM='ug/m3').drop(columns=["DATA_INIZIO", "DATA_FINE"])

  ds1['DATE'] = ds1['DATA_INIZIO'].dt.to_period('Y').astype('str')


In [34]:
# Average VALORE for every (DATE, COD_STAZ, AGENTE) combination, keeping the
# grouping keys as ordinary columns.
agg_ds1 = (
    ds1
    .groupby(['DATE', 'COD_STAZ', 'AGENTE'], as_index=False)['VALORE']
    .mean()
)

In [103]:
# Reshape to one row per DATE (year) with a two-level column header
# (AGENTE, COD_STAZ) holding the averaged VALORE; DATE is then moved back
# from the index into a regular column.
ds1_pivot = pd.pivot_table(
    agg_ds1,
    values='VALORE',
    index='DATE',
    columns=['AGENTE', 'COD_STAZ'],
).reset_index()

In [104]:
# Finalise the DS1 pivot into `air_bol`.
# The cell rebinds `ds1_pivot` (columns end up flattened), so re-run the pivot
# cell above before running this one a second time.

# Remove every ozone column. `level=0` targets the pollutant level of the
# column MultiIndex explicitly, which avoids the PerformanceWarning pandas
# emits when dropping from a non-lexsorted MultiIndex without a level.
ds1_pivot = ds1_pivot.drop(columns="O3 (OZONO)", level=0)

# Drop the first and last rows (the original note refers to the 2016-12 and
# 2025-01 records, i.e. the two boundary periods).
# NOTE(review): this is positional - confirm the boundary rows are still the
# first and last ones if the input data changes.
ds1_pivot = ds1_pivot.drop(index=ds1_pivot.index[[0, -1]])

# Flatten the two column levels into "<AGENTE>_<COD_STAZ>";
# the ('DATE', '') column becomes "DATE_".
ds1_pivot.columns = ds1_pivot.columns.map('_'.join)

# Round to three decimals and restore the plain DATE label. `rename` is called
# without `inplace=True` because the in-place form returns None, which would
# leave `air_bol` empty.
air_bol = ds1_pivot.round(3).rename(columns={"DATE_": "DATE"})

  ds1_pivot.drop("O3 (OZONO)", axis=1, inplace=True)


Unnamed: 0,DATE_,C6H6 (BENZENE)_PORTA SAN FELICE,CO (MONOSSIDO DI CARBONIO)_PORTA SAN FELICE,NO (MONOSSIDO DI AZOTO)_PORTA SAN FELICE,NO2 (BIOSSIDO DI AZOTO)_GIARDINI MARGHERITA,NO2 (BIOSSIDO DI AZOTO)_PORTA SAN FELICE,NO2 (BIOSSIDO DI AZOTO)_VIA CHIARINI,NOX (OSSIDI DI AZOTO)_PORTA SAN FELICE,PM10_GIARDINI MARGHERITA,PM10_PORTA SAN FELICE,PM10_VIA CHIARINI,PM2.5_GIARDINI MARGHERITA,PM2.5_PORTA SAN FELICE
1,2017,1.388,729.081,23.224,24.859,46.095,20.384,81.613,25.065,28.844,27.625,18.429,19.881
2,2018,1.406,654.817,23.005,22.068,49.244,23.001,84.429,21.889,26.249,24.501,14.691,18.15
3,2019,1.118,663.849,20.273,20.584,46.473,20.712,77.483,22.105,25.443,24.62,13.728,16.234
4,2020,0.906,582.742,17.288,17.079,38.454,20.434,64.904,23.722,25.821,21.938,15.42,16.486
5,2021,0.942,651.191,17.949,16.537,43.389,18.525,70.821,22.755,25.875,21.49,13.983,15.898


## Filtering DS2

In [12]:
# Load the raw DS2 export: semicolon-separated, every field requested as text.
ds2 = pd.read_csv(
    "../original-datasets/ds2.csv",
    sep=';',
    dtype='str',
)

# Retain only year, CAP and the total vehicle count, ordered by year then CAP,
# and turn the count from text into a numeric column.
ds2_filt = (
    ds2[['Anno', 'CAP', 'totale_veicoli']]
    .sort_values(by=['Anno', 'CAP'])
    .assign(totale_veicoli=lambda frame: pd.to_numeric(frame['totale_veicoli']))
)

In [78]:
# Create a pivot table with one row per year and one column per CAP.
# - `columns` is just 'CAP': `ds2_filt` only holds Anno, CAP and
#   totale_veicoli, so the former 'Total_vehicles' key raised a KeyError.
# - `rename` is chained without `inplace=True`; the in-place form returns
#   None, which would leave `ds2_pivot` empty and break the merge below.
# `pivot_table` aggregates with the mean by default, so the counts come out
# as floats.
ds2_pivot = (
    ds2_filt
    .pivot_table(index='Anno', columns='CAP', values='totale_veicoli')
    .reset_index()
    .rename(columns={'Anno': 'DATE'})
)

## Merge DS1 & DS2

In [114]:
# Join the air-quality table with the vehicle counts, keeping only the DATE
# values that appear in both frames.
merged = pd.merge(air_bol, ds2_pivot, on='DATE', how='inner')

In [115]:
# Show the merged frame (the cell's last expression is rendered as its output)
merged

Unnamed: 0,DATE,C6H6 (BENZENE)_PORTA SAN FELICE,CO (MONOSSIDO DI CARBONIO)_PORTA SAN FELICE,NO (MONOSSIDO DI AZOTO)_PORTA SAN FELICE,NO2 (BIOSSIDO DI AZOTO)_GIARDINI MARGHERITA,NO2 (BIOSSIDO DI AZOTO)_PORTA SAN FELICE,NO2 (BIOSSIDO DI AZOTO)_VIA CHIARINI,NOX (OSSIDI DI AZOTO)_PORTA SAN FELICE,PM10_GIARDINI MARGHERITA,PM10_PORTA SAN FELICE,...,40132,40133,40134,40135,40136,40137,40138,40139,40141,ND
0,2019,1.118,663.849,20.273,20.584,46.473,20.712,77.483,22.105,25.443,...,15163.0,20637.0,8124.0,5011.0,5249.0,12561.0,20021.0,21381.0,11223.0,586.0
1,2020,0.906,582.742,17.288,17.079,38.454,20.434,64.904,23.722,25.821,...,14943.0,20779.0,8236.0,5033.0,5370.0,12610.0,19493.0,21527.0,11195.0,532.0
2,2021,0.942,651.191,17.949,16.537,43.389,18.525,70.821,22.755,25.875,...,14769.0,20719.0,8176.0,5080.0,5399.0,12600.0,18824.0,21565.0,11169.0,474.0
3,2022,0.916,580.956,18.659,18.425,38.931,15.885,67.371,23.189,27.175,...,14914.0,20467.0,8095.0,5093.0,5410.0,12518.0,19449.0,21544.0,11122.0,433.0
4,2023,0.946,490.897,16.317,15.603,42.747,16.287,67.606,19.801,22.342,...,14891.0,20871.0,8106.0,5094.0,5330.0,12603.0,19746.0,21579.0,11113.0,399.0
5,2024,0.987,505.863,20.895,14.774,28.433,15.678,60.445,22.074,25.489,...,15084.0,21469.0,8072.0,5065.0,5317.0,12630.0,20151.0,21667.0,11130.0,365.0


In [116]:
# Write the merged table to disk as CSV, leaving out the row index.
merged.to_csv(
    path_or_buf="../mashup-datasets/mashup_ds1_2.csv",
    index=False,
)