In [1]:
from importlib import reload

import h5py
import numpy as np
import sys
import utilities

reload(utilities)

<module 'utilities' from "/Users/gloirelinvani/Library/Mobile Documents/com~apple~CloudDocs/School/Magistère d'Informatique/LDD3/S6/Stage/Earthquakes/earthquakes/New_code/utilities.py">

# Parameters

In [2]:
hdf5_in_put_file_path = "Data/Data_min_mainshock_mag=6_min_stations_per_main_shock=3_min_after_shock_mag=5_after_shock_time_window=45_n_days_before_mainshock=7_n_days_after_mainshock=7.hdf5"  # Path to the HDF5 input file (read-only; one top-level group per main-shock sequence)
hdf5_out_put_file_path = "Data/Interpolated_Data_and_labels_min_mainshock_mag=6_min_stations_per_main_shock=3_min_after_shock_mag=5_after_shock_time_window=45_n_days_before_mainshock=7_n_days_after_mainshock=7.hdf5"  # Path to the HDF5 output file (opened in append mode, one group per kept sequence)
earth_radius = 6371  # km
grid_size_km = 250  # Side length of the square grid, in km
cell_size_km = 5  # Side length of one grid cell, in km
cell_size_rads = cell_size_km / earth_radius  # Cell size as an angle (arc length / radius), in radians
cell_size_degs = cell_size_rads * 180 / np.pi  # Same angle in degrees; used for both the latitude and the longitude step
scale_factor = 1e6  # Multiplier applied to the GPS displacements (original note: km to mm; input units not verifiable from this notebook)
num_cells = int(grid_size_km / cell_size_km)  # Number of cells along each side of the grid
min_stations_inside = 3  # Minimal number of stations inside the grid for a sequence to be kept
soft_labels = False  # If True, labels are built by utilities.create_soft_labels instead of hard {0, 1} values
sigma_softlabels = 1  # Sigma of the Gaussian label smoothing, expressed in number of cell sizes
sigma_interpolation = 8  # Sigma passed to utilities.fit_gps, expressed in number of cell sizes

# Functions to build the displacement maps and labels for each main shock in the input HDF5 file

In [15]:
## Global counters, updated by process_single_main_shock through `global`:
n_seq_init = 0  # Number of sequences processed so far (incremented once per call)
n_seq_discarded = 0  # Number of sequences discarded (too few stations, or no aftershock, inside the grid)


def process_main_shock_data(hdf5_file_path, output_path):
    """Run the per-sequence processing on every top-level group of the input file.

    Parameters
    ----------
    hdf5_file_path : path of the HDF5 input file, opened read-only.
    output_path : path of the HDF5 output file, forwarded unchanged to
        process_single_main_shock.
    """
    with h5py.File(hdf5_file_path, 'r') as source:
        for sequence_id in source.keys():
            # Each top-level key names the group holding one main-shock sequence.
            sequence_group = source[sequence_id]
            process_single_main_shock(sequence_group, output_path, sequence_id)
        


def calculate_interpolation_grid(main_shock_location, cell_size_degs, num_cells_half):
    """Return the bounds and the resolution of a square grid centred on the main shock.

    Parameters
    ----------
    main_shock_location : indexable whose items 0 and 1 are the latitude and the
        longitude of the grid centre.
    cell_size_degs : size of one cell, in the same angular unit as the location.
    num_cells_half : number of cells between the centre and each edge of the grid.

    Returns
    -------
    tuple ``(min_lat, max_lat, min_lon, max_lon, n_pixels_lat, n_pixels_lon)``;
    both pixel counts equal ``2 * num_cells_half``.
    """
    centre_lat = main_shock_location[0]
    centre_lon = main_shock_location[1]
    # Distance from the centre to an edge; identical in both directions.
    half_extent = num_cells_half * cell_size_degs
    n_pixels = 2 * num_cells_half
    return (
        centre_lat - half_extent,
        centre_lat + half_extent,
        centre_lon - half_extent,
        centre_lon + half_extent,
        n_pixels,
        n_pixels,
    )


def process_single_main_shock(main_shock_group, output_path, id_seq):
    global n_seq_init, n_seq_discarded
    n_seq_init += 1
    print(f"Processing main shock {id_seq}...")
    gps_stations_displacements = main_shock_group['gps_stations_displacements'][()]
    # For each station, multiply the displacement by the scale factor and sum the displacements
    # over the time dimension
    ## gps_stations_displacements: (NbStations, NbDays, Displacement(X,Y,Z))
    gps_stations_displacements = (gps_stations_displacements * scale_factor) ## conevrt to mm
    gps_stations_displacements = (gps_stations_displacements).sum(axis=1)   # sum over days (gets rid of 0's)
    gps_stations_displacements = gps_stations_displacements.reshape(-1, 3)  ## (NbStations, (X,Y,Z))
    stations_positions = main_shock_group['stations_positions'][()]
    main_shock_location = main_shock_group.attrs['main_shock_location']
    after_shocks_locations = main_shock_group['aftershocks_locations'][()]
    after_shocks_locations = after_shocks_locations.reshape(-1, 2)  ## (NbAfterShocks, (X,Y))
    main_shock_day = main_shock_group.attrs['main_shock_day']
    main_shock_mag = main_shock_group.attrs['main_shock_magnitude']

    # Calculate the interpolation grid
    min_lat, max_lat, min_lon, max_lon, n_pixels_lat, n_pixels_lon = calculate_interpolation_grid(
        main_shock_location, cell_size_degs, int(num_cells / 2)
    )

    # Number of stations inside the grid
    n_stations_inside_grid = ((stations_positions[:, 0] >= min_lat) & (stations_positions[:, 0] <= max_lat) &
                              (stations_positions[:, 1] >= min_lon) & (stations_positions[:, 1] <= max_lon)).sum()

    # Skip if not enough stations
    if n_stations_inside_grid < min_stations_inside:
        n_seq_discarded += 1
        print('Skipping:', id_seq, 'with only', n_stations_inside_grid, 'stations inside the grid, out of total (downloaded) ',
              stations_positions.shape[0])
        return

    # Interpolate the data
    labels = np.zeros((n_pixels_lat, n_pixels_lon))
    # discretize the aftershocks 
    aftershocks_rows = ((after_shocks_locations[:, 0] - min_lat) / cell_size_degs).astype('int')
    aftershocks_cols = ((after_shocks_locations[:, 1] - min_lon) / cell_size_degs).astype('int')
    # mask to make sure no event is outside the grid
    aftershocks_mask = (aftershocks_rows < n_pixels_lat) & (aftershocks_rows >= 0) & (aftershocks_cols < n_pixels_lon) & (
            aftershocks_cols >= 0)
    aftershocks_rows = aftershocks_rows[aftershocks_mask]
    aftershocks_cols = aftershocks_cols[aftershocks_mask]
    if aftershocks_mask.sum() == 0:
        n_seq_discarded += 1
        print('Skipping:', id_seq, 'with no aftershocks inside the grid')
        return
    if soft_labels:
        utilities.create_soft_labels(labels, aftershocks_rows, aftershocks_cols, cell_size_rads,
                                     sigma_softlabels * cell_size_rads)
    else:
        labels[aftershocks_rows, aftershocks_cols] = 1

    interpolated_data = np.zeros((n_pixels_lat, n_pixels_lon, 3))
    utilities.fit_gps(
        interpolated_data,
        np.pi * stations_positions / 180,
        gps_stations_displacements,
        n_pixels_lat, n_pixels_lon,
        np.pi * min_lat / 180,
        np.pi * min_lon / 180,
        cell_size_rads,
        sigma_interpolation * cell_size_rads,
        min_w=0
    )

    #Save the interpolated data and labels to the output file
    save_interpolated_data(interpolated_data, labels, output_path, id_seq, main_shock_day, main_shock_mag)


def save_interpolated_data(interpolated_data, labels, output_path, main_shock_id, main_shock_day, main_shock_mag):
    """Write one sequence (displacement map, labels, metadata) to the output HDF5 file.

    The file is opened in append mode. The group named ``str(main_shock_id)`` is
    deleted first if it already exists, then recreated with:
      - attributes 'main_shock_day' and 'main_shock_magnitude';
      - datasets 'interpolated_displacement' and 'labels'.
    """
    with h5py.File(output_path, 'a') as out_file:
        group_name = str(main_shock_id)
        # Remove the group left by a previous run so that create_group cannot collide.
        if group_name in out_file:
            del out_file[group_name]
        sequence_group = out_file.create_group(group_name)

        metadata = (
            ('main_shock_day', main_shock_day),
            ('main_shock_magnitude', main_shock_mag),
        )
        for attr_name, attr_value in metadata:
            sequence_group.attrs[attr_name] = attr_value

        sequence_group.create_dataset('interpolated_displacement', data=interpolated_data)
        sequence_group.create_dataset('labels', data=labels)

# Main

In [16]:
def main():
    """Process every sequence of the input file and write the results to the output file.

    The module-level counters ``n_seq_init`` and ``n_seq_discarded`` are reset before
    processing, so that running this function (or its notebook cell) several times in
    the same kernel reports the figures of the latest run instead of an accumulation.
    """
    global n_seq_init, n_seq_discarded
    n_seq_init = 0
    n_seq_discarded = 0
    process_main_shock_data(hdf5_in_put_file_path, hdf5_out_put_file_path)


# Script entry point: run the whole pipeline, then report the sequence counters
# maintained by process_single_main_shock.
if __name__ == '__main__':
    main()
    print('Number of sequences processed:', n_seq_init)
    print('Number of sequences discarded:', n_seq_discarded)
    # Kept = processed minus discarded (every sequence is either saved or skipped).
    print('Number of sequences kept:', n_seq_init - n_seq_discarded)

Processing main shock 103...
Skipping: 103 with only 2 stations inside the grid, out of total (downloaded)  3
Processing main shock 109...
Skipping: 109 with only 0 stations inside the grid, out of total (downloaded)  3
Processing main shock 110...
Skipping: 110 with only 0 stations inside the grid, out of total (downloaded)  3
Processing main shock 112...
Skipping: 112 with only 0 stations inside the grid, out of total (downloaded)  3
Processing main shock 114...
Skipping: 114 with only 2 stations inside the grid, out of total (downloaded)  4
Processing main shock 115...
Skipping: 115 with only 0 stations inside the grid, out of total (downloaded)  3
Processing main shock 116...
Skipping: 116 with only 0 stations inside the grid, out of total (downloaded)  3
Processing main shock 150...
Skipping: 150 with only 1 stations inside the grid, out of total (downloaded)  3
Processing main shock 156...
Skipping: 156 with only 0 stations inside the grid, out of total (downloaded)  3
Processing