# Raw data preparation

- creating dataframes
- cleaning dataframes
- normalizing dataframes
- assigning labels

The raw data is in NetCDF4 format. The code below pre-processes the data per climatological year.

## Creating dataframes

In [1]:
def is_heatwave_df(year):
    """Export the is_heatwave rasters of one year to a CSV file.

    All NetCDF files matching
    ``../../private/is_heatwave/Europe_0.25deg/Europe_0.25deg_<year>-*.nc``
    are opened as one multi-file dataset and flattened into a dataframe whose
    index levels become ordinary columns. Missing values are replaced by 0
    and the result is written to ``data/<year>/ih_<year>.csv``.

    Args:
        year: Year to process, as a string (it is concatenated into the
            input pattern and the output path).
    """
    file_pattern = '../../private/is_heatwave/Europe_0.25deg/Europe_0.25deg_' + year + '-*.nc'
    file_paths = glob.glob(file_pattern)

    # The context manager closes the underlying NetCDF files as soon as the
    # data has been pulled into the dataframe.
    with xr.open_mfdataset(file_paths) as ds:
        df = ds.to_dataframe().reset_index()
    df = df.fillna(0)

    # Save the data to a CSV file. to_csv opens the target in write mode, so
    # a file left over from an earlier run is simply overwritten.
    df.to_csv("data/" + year + "/ih_" + year + ".csv", index=False)

In [2]:
def comp_net_df(year):
    """Merge the BC, CC, DC, ID and OD coefficient rasters of one year into a CSV.

    For each of the five coefficients, every NetCDF file of the year is
    opened as one multi-file dataset and its ``coefficient`` variable is
    renamed to the coefficient's own name. The five datasets are combined by
    coordinates and flattened into a dataframe; the ``spatial_ref`` column is
    dropped, infinities and missing values are replaced by 0, and the result
    is written to ``data/<year>/merged_<year>.csv``.

    Args:
        year: Year to process, as a string (it is concatenated into the
            input patterns and the output path).
    """
    # The rasters are stored under a different run folder depending on the year.
    year_number = int(year)
    if year_number <= 1999:
        run_folder = '20240122_1017'
    elif year_number <= 2009:
        run_folder = '20240105_1808'
    else:
        run_folder = '20231214_2202'

    raster_prefix = ('../../data/latest/iw-heatwaves-storage-master/output/master/'
                     + run_folder + '/rasterfiles/Europe/' + year
                     + '/CN_Europe_0.25x0.25deg_')

    opened = []
    try:
        renamed = []
        for name in ('BC', 'CC', 'DC', 'ID', 'OD'):
            file_paths = glob.glob(raster_prefix + name + '_' + year + '-*.nc')
            dataset = xr.open_mfdataset(file_paths)  # Open and concatenate files
            opened.append(dataset)
            renamed.append(dataset.rename({'coefficient': name}))

        # Merge datasets and convert to pandas while the files are still open.
        ds = xr.combine_by_coords(renamed)
        df = ds.to_dataframe().reset_index()
    finally:
        # Close the datasets that were actually opened; the renamed objects
        # are derived copies, so the originals are the ones to close.
        for dataset in opened:
            dataset.close()

    df = df.drop(columns='spatial_ref')
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0)

    # Save the merged data to a CSV file. to_csv opens the target in write
    # mode, so a file left over from an earlier run is simply overwritten.
    df.to_csv("data/" + year + "/merged_" + year + ".csv", index=False)

## Cleaning dataframes (and merging)

In [3]:
def dup_lat_lon(df):
    """Split a dataframe into rows with a repeated and a unique (lat, lon, time) key.

    Args:
        df: Dataframe with ``lat``, ``lon`` and ``time`` columns.

    Returns:
        A ``(duplicate_rows, unique_rows)`` tuple. ``duplicate_rows`` holds
        every row whose key combination occurs more than once;
        ``unique_rows`` holds the rows of ``df`` whose index label is not
        among the duplicates.
    """
    key_columns = ['lat', 'lon', 'time']

    def _occurs_more_than_once(group):
        return len(group) > 1

    duplicate_rows = df.groupby(key_columns).filter(_occurs_more_than_once)

    # Rows are told apart by index label, not by position.
    is_duplicate = df.index.isin(duplicate_rows.index)
    unique_rows = df[~is_duplicate]

    return duplicate_rows, unique_rows

In [4]:
def even_uneven(df):
    """Build two de-duplicated variants of a dataframe.

    The rows with a repeated (lat, lon, time) key are renumbered from 0 and
    split by position: rows at even positions go into the first variant,
    rows at odd positions into the second. Both variants also contain all
    rows whose key is unique.

    NOTE(review): the positional split only separates the two copies of a
    key when the copies sit next to each other -- confirm against the data.

    Args:
        df: Dataframe with ``lat``, ``lon`` and ``time`` columns.

    Returns:
        An ``(even, uneven)`` tuple of dataframes with a fresh integer index.
    """
    duplicated, unique = dup_lat_lon(df)
    duplicated = duplicated.reset_index(drop=True)

    # After the renumbering, every second row starting at 0 is an
    # even-indexed row and every second row starting at 1 an odd-indexed one.
    at_even_positions = duplicated.iloc[::2]
    at_odd_positions = duplicated.iloc[1::2]

    even = pd.concat([unique, at_even_positions], ignore_index=True)
    uneven = pd.concat([unique, at_odd_positions], ignore_index=True)

    return even, uneven

In [5]:
def clean(df1, df2):
    """Restrict df2 to the times of df1 and resolve duplicated grid cells.

    Steps:
      1. Drop the rows of ``df2`` whose ``time`` does not occur in ``df1``.
      2. Drop rows that are exact duplicates.
      3. For every time that still has a (lat, lon, time) key occurring
         exactly twice, split the rows of that time with ``even_uneven``.

    Only keys that occur exactly twice trigger the split; a time whose keys
    repeat more often (and never exactly twice) is passed through unchanged.

    Args:
        df1: Reference dataframe; only its ``time`` column is read.
        df2: Dataframe with ``lat``, ``lon`` and ``time`` columns to clean.

    Returns:
        A ``(df_zero, df_notzero)`` tuple. Both contain the rows of all
        times without stubborn duplicates; ``df_zero`` additionally holds
        the "even" variant and ``df_notzero`` the "uneven" variant of the
        times that had them. Both have a fresh integer index.
    """
    # Check for difference in time column: times present in df2 only.
    extra_times = set(df2['time'].unique()).difference(df1['time'].unique())
    df = df2[~df2['time'].isin(list(extra_times))]

    # Check for duplicates
    df = df.drop_duplicates()

    # Identify stubborn duplicates: same key, different values.
    key_counts = df.groupby(['lat', 'lon', 'time']).size().reset_index(name='count_')
    dup_dates = key_counts.loc[key_counts['count_'] == 2, 'time'].unique()
    df_sub = df[~df['time'].isin(dup_dates)]

    # Collect the pieces and concatenate once: growing a dataframe inside the
    # loop copies all earlier rows on every iteration, and starting from an
    # empty frame makes the result dtypes depend on how pandas treats empty
    # entries in concat.
    even_parts = [df_sub]
    uneven_parts = [df_sub]
    for dup_date in dup_dates:
        even, uneven = even_uneven(df[df['time'] == dup_date])
        even_parts.append(even)
        uneven_parts.append(uneven)

    df_zero = pd.concat(even_parts, ignore_index=True)
    df_notzero = pd.concat(uneven_parts, ignore_index=True)

    return df_zero, df_notzero

In [6]:
def cleaning(year):
    """Clean the merged coefficients of one year and attach the heatwave flag.

    Reads ``data/<year>/ih_<year>.csv`` and ``data/<year>/merged_<year>.csv``,
    runs ``clean`` on them, sorts the frames, puts the ``is_heatwave`` column
    in front of each cleaned variant and writes the results to
    ``data/<year>/md_zero_<year>.csv`` and ``data/<year>/md_!zero_<year>.csv``.

    NOTE(review): ``sort_values`` keeps the original index labels and
    ``pd.concat(..., axis=1)`` aligns on those labels, so the sorting does not
    by itself pair the rows positionally. The heatwave frame is also sorted by
    (y, x, time) while the coefficient frames are sorted by (lon, lat, time).
    Confirm that the resulting row pairing is the intended one.

    Args:
        year: Year to process, as a string.
    """
    heatwave = pd.read_csv('data/' + year + '/ih_' + year + '.csv')
    coefficients = pd.read_csv('data/' + year + '/merged_' + year + '.csv')

    df_zero, df_notzero = clean(heatwave, coefficients)

    # Ascending order is the default for every sort key.
    heatwave = heatwave.sort_values(by=['y', 'x', 'time'])
    coefficient_keys = ['lon', 'lat', 'time']

    labeled = {}
    for variant, frame in (('zero', df_zero), ('!zero', df_notzero)):
        ordered = frame.sort_values(by=coefficient_keys)
        labeled[variant] = pd.concat([heatwave['is_heatwave'], ordered], axis=1)

    # Write only after both variants were built successfully.
    for variant, frame in labeled.items():
        frame.to_csv("data/" + year + "/md_" + variant + "_" + year + ".csv", index=False)

## Normalizing dataframes

In [7]:
def normalize(year):
    """Min-max scale the value columns of both cleaned files of one year.

    Reads ``data/<year>/md_zero_<year>.csv`` and
    ``data/<year>/md_!zero_<year>.csv``, rescales every column except
    ``is_heatwave``, ``lon``, ``lat`` and ``time`` with a ``MinMaxScaler``
    and writes each frame next to its input with a ``_norm.csv`` suffix.
    The scaler is fitted on each file separately.

    Args:
        year: Year to process, as a string.
    """
    year_dir = 'data/' + year + '/'
    stems = [year_dir + 'md_zero_' + year, year_dir + 'md_!zero_' + year]
    frames = [pd.read_csv(stem + '.csv') for stem in stems]

    # The columns to normalize are taken from the first file and applied to both.
    columns = frames[0].columns.difference(['is_heatwave', 'lon', 'lat', 'time'])

    # fit_transform refits the scaler on every call, so one instance serves both frames.
    scaler = MinMaxScaler()
    for frame in frames:
        frame[columns] = scaler.fit_transform(frame[columns])

    for stem, frame in zip(stems, frames):
        frame.to_csv(stem + '_norm.csv', index=False)

## Labeling dataframes (sources and sinks)

In [8]:
def sources_sinks(df):
    """Label every row of every day as source, sink, part of or neither.

    The rows are processed one ``time`` value at a time, in ascending order
    of ``time``. Within a day:

    * a source (``Type`` 1) has the day's minimum ``ID`` and maximum ``OD``;
    * a sink (``Type`` 3) has the day's maximum ``ID`` and minimum ``OD``;
    * a row whose ``BC``, ``CC``, ``DC``, ``ID`` and ``OD`` are all 0 is
      "neither" (``Type`` 0);
    * every remaining row is "part of" (``Type`` 2).

    When a row qualifies for several labels, sink wins over source and
    "neither" wins over both. A day on which every row is a source, or
    every row is a sink, is labeled 0 throughout.

    Args:
        df: Dataframe with ``time``, ``BC``, ``CC``, ``DC``, ``ID`` and
            ``OD`` columns.

    Returns:
        A new dataframe with the rows grouped by ascending ``time``, a fresh
        integer index and an additional integer ``Type`` column. An empty
        input yields an empty frame that still has the ``Type`` column.
    """
    coefficient_columns = ['BC', 'CC', 'DC', 'ID', 'OD']
    labeled_days = []

    # groupby visits the days in ascending time order and keeps the row order
    # within each day, without rescanning the whole frame once per day.
    for _, day in df.groupby('time'):
        day = day.reset_index(drop=True)
        id_values = day['ID']
        od_values = day['OD']

        # high OD is a source, high ID is a sink
        is_source = ((id_values == id_values.min()) & (od_values == od_values.max())).to_numpy()
        is_sink = ((id_values == id_values.max()) & (od_values == od_values.min())).to_numpy()

        # Not a heatwave: all five coefficients are zero.
        is_zero = (day[coefficient_columns] == 0.0).all(axis=1).to_numpy()

        if is_source.all() or is_sink.all():
            # Nothing distinguishes the rows of this day from one another.
            labels = np.zeros(len(day), dtype=np.int64)
        else:
            # Start from "part of the heatwave" and overwrite in order of
            # increasing precedence.
            labels = np.full(len(day), 2, dtype=np.int64)
            labels[is_source] = 1
            labels[is_sink] = 3
            labels[is_zero] = 0

        day['Type'] = labels
        labeled_days.append(day)

    if not labeled_days:
        # pd.concat refuses an empty list; return an empty, labeled frame instead.
        return df.reset_index(drop=True).assign(Type=np.zeros(0, dtype=np.int64))

    return pd.concat(labeled_days, ignore_index=True)

In [9]:
def labeling(year):
    """Assign source/sink labels to both normalized files of one year.

    Reads ``data/<year>/md_zero_<year>_norm.csv`` and
    ``data/<year>/md_!zero_<year>_norm.csv``, labels each with
    ``sources_sinks`` and writes the results to
    ``data/<year>/clean_labeled_zero_<year>.csv`` and
    ``data/<year>/clean_labeled_!zero_<year>.csv``.

    Args:
        year: Year to process, as a string.
    """
    year_dir = 'data/' + year + '/'
    variants = ('zero', '!zero')

    # Read and label everything first, write afterwards.
    normalized = [pd.read_csv(year_dir + 'md_' + variant + '_' + year + '_norm.csv')
                  for variant in variants]
    labeled = [sources_sinks(frame) for frame in normalized]

    for variant, frame in zip(variants, labeled):
        frame.to_csv(year_dir + 'clean_labeled_' + variant + '_' + year + '.csv', index=False)

## Compiler / executable

In [10]:
# Load libraries
import os
import glob
import numpy as np
import xarray as xr
import pandas as pd
import netCDF4 as nc
from sklearn.preprocessing import MinMaxScaler

In [11]:
def run(year):
    """Run the complete pre-processing pipeline for one year.

    Creates the ``data/<year>`` output folder if necessary and then calls, in
    order, ``is_heatwave_df``, ``comp_net_df``, ``cleaning``, ``normalize``
    and ``labeling``. Each step reads the files the previous steps wrote.

    Args:
        year: Year to process; converted to ``str`` before use, so an int or
            a string are both accepted.
    """
    year = str(year)
    folder_path = "data/" + year

    # exist_ok makes this a no-op for an existing folder and avoids the gap
    # between checking for the folder and creating it.
    os.makedirs(folder_path, exist_ok=True)

    is_heatwave_df(year)
    comp_net_df(year)
    cleaning(year)
    normalize(year)
    labeling(year) # Only included when labels are manually assigned (takes too long to do for each year)

In [None]:
# Run the whole pre-processing pipeline for the year 2017.
run(2017)