<a href="" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img src="./src/copernicus-logo.png"><span style="margin-left: 40px"></span><img src="./src/cds-logo.jpeg">

# Data Aggregation

The Data Aggregation phase integrates data that share the same time and the same geographical reference into a pandas DataFrame. The result is a series of CSV files containing discharge, temperature and precipitation values over the part of Italy that is our domain of interest.

## Integration between TPI (temperature and precipitation) and RDH (river discharge) data

In [None]:
import netCDF4
import pandas as pd
import json

# Load the list of grid index pairs to export. Each entry is indexed below as
# entry[0] / entry[1] and used as the two grid indexes of a cell of the
# discharge file (the original note describes them as the i, j indexes of the
# lat/lon points inside the northern-Italy rectangle).
# A context manager guarantees the file is closed even if parsing fails.
with open("./samples/map-italy-loc.json", "r", encoding="utf-8") as jsonFileR:
    ljson = json.load(jsonFileR)

# Index of the last element along the first grid axis of the temperature and
# precipitation files. That axis runs 0..949 in one kind of file and 949..0 in
# the other, so discharge row i corresponds to row (i_dim - i) in the
# temperature/precipitation grids.
i_dim = 949

# Column names of the output CSV files.
header = ['time', 'lat', 'lon', 'discharge', 'temp', 'prec']

# File-name suffixes. Every discharge file spans two consecutive years, while
# temperature and precipitation come in one file per year; discharge file k is
# therefore paired with yearly entries 2*k and 2*k + 1.
d_years = ['2013-2014-2', '2015-2016-3', '2017-2018-4', '2019-2020-5', '2021-2022-6'] # years in the discharge file
p_t_years = ['2013-8', '2014-9', '2015-10', '2016-11', '2017-12', '2018-13', '2019-14', '2020-15', '2021-16'] # years in the temperature and precipitation file

# Restrict the run to a contiguous range of discharge files (here only
# '2017-2018-4'). Both lists are *sliced* so they stay lists and stay aligned:
# indexing with a bare integer would turn d_years into a string and make the
# loop below iterate over its characters.
first_d, last_d = 2, 3
d_years = d_years[first_d:last_d]
p_t_years = p_t_years[2 * first_d:2 * last_d]

# Number of discharge time steps skipped to reach the second year of a file.
# NOTE(review): assumes one step per day and a non-leap first year (true for
# every first year listed above) - confirm against the data.
steps_per_year = 365

# Edit file paths here
rdh_prefix = './cs3-copernicus-precipitation-temp-on-river-discharges/cs3-copernicus-river-discharges-daily-18-2011-2021/rdh-'
tpi_prefix = './cs3-copernicus-precipitation-temp-on-river-discharges/cs3-copernicus-temperatures-and-precipitations-2006-2025/tpi-'

for idf_d, d_year in enumerate(d_years):
    # Datasets are opened as context managers so they are closed even on error.
    with netCDF4.Dataset(rdh_prefix + d_year + '.nc') as nc_d:
        discharge = nc_d.variables['dis06']
        lat = nc_d.variables['latitude']
        lon = nc_d.variables['longitude']

        # The (up to) two yearly files that belong to this discharge file.
        # Slicing tolerates a missing second year (e.g. the last list entry).
        years = p_t_years[2 * idf_d:2 * idf_d + 2]
        for idf_d_t, year in enumerate(years):
            csv_path = 'italy-dtp-' + year + '.csv'
            t_offset = steps_per_year * idf_d_t

            with netCDF4.Dataset(tpi_prefix + 'prec-' + year + '.nc') as nc_p, \
                    netCDF4.Dataset(tpi_prefix + 'temp-' + year + '.nc') as nc_t:
                # The timestamps are taken from the precipitation file.
                time_var = nc_p.variables['time']
                dtime = netCDF4.num2date(time_var[:], time_var.units)
                temp = nc_t.variables['tasAdjust']
                prec = nc_p.variables['prAdjust']

                # Start a fresh CSV for this year containing only the header.
                pd.DataFrame(data=[], columns=header).to_csv(csv_path, mode='w', index=True)

                # One dataframe per grid cell: its rows are the consecutive
                # time steps of that cell (time, time+1, time+2, ...), and the
                # row index restarts at 0 for every cell. Swapping the two
                # loops would instead group the rows by time step, listing all
                # cells for the first step, then all cells for the next, etc.
                # Choose whichever layout suits the downstream analysis.
                for l in ljson:
                    i, j = l[0], l[1]
                    # Coordinates do not depend on time: read them once.
                    cell_lat = lat[i][j]
                    cell_lon = lon[i][j]

                    ds = pd.DataFrame(data=[], columns=header)
                    for t in range(len(dtime)):
                        ds.loc[t] = [
                            dtime[t],
                            cell_lat,
                            cell_lon,
                            discharge[t + t_offset][i][j],
                            # Shift by -273.15 (Kelvin to Celsius, assuming
                            # tasAdjust is stored in Kelvin).
                            temp[t][i_dim - i][j] - 273.15,
                            prec[t][i_dim - i][j],
                        ]
                    # Append this cell's rows to the yearly file, without header.
                    ds.to_csv(csv_path, mode='a', index=True, header=False)