# Importing Important modules

In [None]:
import ipynbname
import logging
import pkg_resources
import seisbench.data as sbd
import seisbench.util as sbu

from pathlib import Path
from obspy import read_events
from obspy import read
import pandas as pd
import os
from datetime import datetime
import sys

In [None]:
import warnings

# Hide DeprecationWarning messages for the remainder of the session.
warnings.simplefilter(action='ignore', category=DeprecationWarning)

In [None]:
# Hard-coded candidate locations of a local SeisRoutine checkout. Every entry
# is appended to sys.path unconditionally (no existence check is made).
lib_path = [r'C:\Users\ikahbasi\OneDrive\Applications\GitHub\SeisRoutine',
            r'C:\Users\ikahb\OneDrive\Applications\GitHub\SeisRoutine']
for path in lib_path:
    sys.path.append(path)
##########################################################################
# Project-local modules; presumably resolved through one of the paths added
# above - TODO confirm, or install SeisRoutine as a package instead.
import SeisRoutine.catalog as src
import SeisRoutine.waveform as srw
import SeisRoutine.config as srconf

In [None]:
from importlib import reload  # Python 3.4+
# Re-import the catalog and waveform helpers and rebind their aliases, so that
# re-running this cell picks up the modules again. srconf is not reloaded here.
src = reload(src)
srw = reload(srw)

# Define Some Functions

In [None]:
def getting_filename_and_path_of_the_running_code():
    """
    Get the filename and directory path of the currently executing code.

    This function works for both regular Python scripts (.py files) and Jupyter Notebooks
    (.ipynb files). For notebooks, it handles both VS Code's environment and standard
    Jupyter environments.

    Returns:
        tuple: A tuple containing (directory_path, filename) of the running code.
            Both items are strings in every branch, and the filename keeps its
            extension.

    Note:
        In Jupyter Notebook environments, returns the notebook name and path.
        In regular Python scripts, returns the script name and path.
    """
    _file = sys.argv[0]
    name = os.path.basename(_file)
    if name != "ipykernel_launcher.py":
        # Plain script: sys.argv[0] is the script itself.
        return os.path.dirname(_file), name

    # Inside a Jupyter kernel sys.argv[0] is the kernel launcher, so the
    # notebook location has to come from somewhere else.
    # VS Code publishes the notebook file in this global variable.
    vsc_file = globals().get('__vsc_ipynb_file__')
    if vsc_file is not None:
        return os.path.dirname(vsc_file), os.path.basename(vsc_file)

    # Fallback: ipynbname.path() is the full path of the notebook *file*,
    # so it is split into directory and file name like the other branches.
    nb_file = str(ipynbname.path())
    return os.path.dirname(nb_file), os.path.basename(nb_file)

In [None]:
class get_data:
    """
    Reader that keeps the most recently loaded waveform file in memory.

    The file to read is ``{root}/{pattern_path.format(time=...)}``. It is
    (re)loaded on demand by `get_data_related_to_pick`.
    """

    def __init__(self, root, pattern_path):
        """
        :param root: Base directory prepended to the formatted pattern.
        :param pattern_path: Format string that receives ``time`` as a keyword.
        """
        self.root = root
        self.pattern_path = pattern_path
        self.stream = None
        self.stats  = None
        # Station codes present in `self.stream`; filled by `read`.
        self.stations = []

    def read(self, time):
        """Load and preprocess the file that corresponds to `time`."""
        pattern = self.pattern_path.format(time=time)
        path = f'{self.root}/{pattern}'
        logging.info(f'Reading Data: {path}')
        self.stream = read(path)
        self.preprocessing_data()
        self.stations = list({tr.stats.station for tr in self.stream})

    def _needs_reload(self, pick):
        """Tell whether the cached stream cannot serve `pick`."""
        if self.stream is None:
            return True
        if pick.waveform_id.station_code not in self.stations:
            return True
        # Compare year and day-of-year: the julian day alone would wrongly
        # match the same calendar day of a different year.
        loaded = self.stream[0].stats.starttime
        return (pick.time.year, pick.time.julday) != (loaded.year, loaded.julday)

    def get_data_related_to_pick(self, pick):
        """
        Return the traces of the pick's station, loading data if required.

        The file is read at most once per call: when nothing is loaded yet,
        when the station is not in the loaded data, or when the loaded data
        starts on a different day than the pick.

        :param pick: Object exposing ``time`` and ``waveform_id.station_code``.
        :returns: Result of ``self.stream.select(station=...)``.
        """
        if self._needs_reload(pick):
            self.read(time=pick.time)
        return self.stream.select(station=pick.waveform_id.station_code)

    def preprocessing_data(self):
        """Check the sampling rate, then merge and demean the loaded stream."""
        self.sps_check()
        self.stream.merge(-1)
        self.stream.detrend('constant')
        self.stream.merge()
        # self.stream.merge(method=1, fill_value=0)
        # self.stream.filter('bandpass', freqmin=0.5, freqmax=49, zerophase=True)

    def sps_check(self, sps=100):
        """
        Print the sampling rates in the stream and require all to equal `sps`.

        :raises AssertionError: If any trace has a different sampling rate.
        """
        rates = {tr.stats.sampling_rate for tr in self.stream}
        print('Available sps:', rates)
        # Raised explicitly so the check is not stripped when Python runs
        # with -O (which removes `assert` statements).
        if any(rate != sps for rate in rates):
            raise AssertionError(
                f'Expected a sampling rate of {sps} for all traces, found {rates}')

In [None]:
def get_event_params(event):
    """
    Collect source metadata of an event into a flat dictionary.

    The preferred origin supplies time, location and depth; the preferred
    magnitude (when there is one) supplies the magnitude fields.

    :param event: Object exposing ``resource_id``, ``preferred_origin()`` and
        ``preferred_magnitude()``.
    :returns: Dictionary of ``source_*`` entries plus a ``split`` entry that
        is always present and set to None.
    """
    origin = event.preferred_origin()
    mag = event.preferred_magnitude()

    depth = origin.depth
    depth_uncertainty = origin.depth_errors["uncertainty"]
    # Depth values are divided by 1e3 for the *_km entries. A depth of
    # exactly 0 is a valid value, so test against None instead of truthiness.
    has_depth = depth is not None

    event_params = {
        "source_id": str(event.resource_id),
        "source_origin_time": str(origin.time),
        "source_origin_uncertainty_sec": origin.time_errors["uncertainty"],
        "source_latitude_deg": origin.latitude,
        # NOTE(review): the key says km, but the value is stored unconverted;
        # confirm the unit of the catalog's latitude/longitude uncertainties.
        "source_latitude_uncertainty_km": origin.latitude_errors["uncertainty"],
        "source_longitude_deg": origin.longitude,
        "source_longitude_uncertainty_km": origin.longitude_errors["uncertainty"],
        "source_depth_km": depth / 1e3 if has_depth else None,
        "source_depth_uncertainty_km": (
            depth_uncertainty / 1e3
            if has_depth and depth_uncertainty is not None
            else None
        ),
    }
    if mag is not None:
        event_params["source_magnitude"] = mag.mag
        event_params["source_magnitude_uncertainty"] = mag.mag_errors["uncertainty"]
        event_params["source_magnitude_type"] = mag.magnitude_type
        creation_info = mag.creation_info
        event_params["source_magnitude_author"] = (
            creation_info.agency_id if creation_info is not None else None
        )
    # Set for every event, with or without a magnitude, so all rows share
    # the same keys.
    event_params["split"] = None
    return event_params

In [None]:
def get_trace_params(pick):
    """
    Build the station/trace metadata dictionary for a pick.

    Only the first two characters of the channel code are kept in
    ``trace_channel``.

    :param pick: Object exposing ``waveform_id`` and ``evaluation_mode``.
    :returns: Dictionary with network, station, channel, location and
        evaluation mode entries.
    """
    waveform_id = pick.waveform_id
    return {
        "station_network_code": waveform_id.network_code,
        "station_code": waveform_id.station_code,
        "trace_channel": waveform_id.channel_code[:2],
        "station_location_code": waveform_id.location_code,
        "evaluation_mode": pick.evaluation_mode,
    }

In [None]:
def get_phase_params(pick, event):
    """
    Return the attributes of the arrival linked to `pick`, keyed per phase.

    The arrival is looked up among the arrivals of the event's preferred
    origin. Its ``resource_id``, ``pick_id`` and ``phase`` attributes are
    dropped and every remaining key gets the suffix ``_<phase_hint>``.

    :returns: The suffixed attribute dictionary, or an empty dictionary when
        no arrival is found for the pick.
    """
    arrivals = event.preferred_origin().arrivals
    arrival = src.select_arrival_related_to_the_pick(pick=pick, arrivals=arrivals)
    if not arrival:
        return {}
    attributes = arrival.__dict__.copy()
    for unwanted in ('resource_id', 'pick_id', 'phase'):
        del attributes[unwanted]
    suffix = pick.phase_hint
    return {f'{key}_{suffix}': val for key, val in attributes.items()}

In [None]:
def get_picks_time_difference(picks):
    """
    Return each pick time relative to the earliest pick, in ascending order.

    :param picks: Iterable of objects exposing a ``time`` attribute.
    :returns: List of differences; the first item is the earliest time minus
        itself. An empty input gives an empty list.
    """
    ordered_times = sorted(pick.time for pick in picks)
    return [moment - ordered_times[0] for moment in ordered_times]

In [None]:
def reversing_dictionary(dictionary):
    """
    Return a new dictionary that maps each value of `dictionary` to its key.

    When several keys share a value, the key that comes last wins.
    """
    inverted = {}
    for key, value in dictionary.items():
        inverted[value] = key
    return inverted

In [None]:
def auto_labeling(stream, dl_pickers):
    """
    Run every picker on `stream` and collect the picked peak times per phase.

    :param stream: Waveforms handed to each picker's ``classify`` method.
    :param dl_pickers: Mapping of picker name to picker object.
    :returns: ``{phase: {picker_name: peak_time}}``. The 'P' and 'S' entries
        always exist; any other phase label returned by a picker gets its own
        entry instead of raising KeyError. When a picker returns several
        picks of one phase, the last one is kept.
    """
    outputs = {'P': {}, 'S': {}}
    for name, picker in dl_pickers.items():
        for pick in picker.classify(stream).picks:
            outputs.setdefault(pick.phase, {})[name] = pick.peak_time
    return outputs

In [None]:
import numpy as np
from scipy.stats import skew, zscore

In [None]:
def ztest(array, threshold=3):
    '''
    The outlier detector with Z-score

    :param array: One-dimensional numeric array or sequence.
    :param threshold: Values whose absolute z-score exceeds this are dropped.
    :returns: numpy.ndarray with the values that are not outliers.
    '''
    # Accept plain sequences too; boolean-mask indexing needs an ndarray.
    array = np.asarray(array)
    z_scores = zscore(array)
    outliers = np.abs(z_scores) > threshold
    logging.info(f'Number of outliers: {np.count_nonzero(outliers)}')
    return array[~outliers]

def window_checking(array, window_len=0.2, min_num_picks=3):
    '''
    Check that the values are numerous enough and close enough together.

    :param array: One-dimensional numeric array or sequence. It is left
        unmodified (a sorted copy is used internally).
    :param window_len: Largest accepted total spread of the sorted values.
    :param min_num_picks: Smallest accepted number of values.
    :returns: True when both conditions hold, otherwise False.
    '''
    # np.sort returns a copy, so the caller's data keeps its order.
    ordered = np.sort(np.asarray(array))
    # Sum of consecutive differences of the sorted values.
    s = np.diff(ordered).sum()
    logging.info(f'{s}')
    return bool((s <= window_len) and (min_num_picks <= ordered.size))

In [None]:
from statistics import median

In [None]:
def iqr(array, multiplier=1.5):
    """
    Interquartile Range
    Tukey Fences are robust methods in detecting outliers.
    As per the Tukey method, the outliers are the points lying
    beyond the upper boundary of Q3 + k*IQR and the lower boundary
    of Q1 - k*IQR. These boundaries are referred to as outlier fences.
    Any data beyond these fences are considered to be outliers.

    John Tukey proposed this test for some nonnegative constant k,
    where k = 1.5 indicates an "outlier", and k = 3 indicates data
    that is "far out".
    (Fig.1 of the Kristekova_etal_GJI_2021.pdf)

    :type array: numpy.ndarray
    :param array: one-dimensional number array (a plain sequence is
        converted).
    :type multiplier: float
    :param multiplier: the constant k that scales the IQR when building
        the fences.

    :rtype: numpy.ndarray
    :returns: the values of `array` that lie inside the fences, in their
        original order. Inputs with fewer than two values are returned
        unchanged, because no quartiles can be formed from them.
    """
    array = np.asarray(array)
    if array.size < 2:
        return array.copy()
    values_sorted = sorted(array)
    # Q1 and Q3 are the medians of the lower and upper half. For odd
    # lengths, round() (half-to-even) decides which half gets the middle value.
    midpoint = int(round(len(values_sorted) / 2.0))
    q1 = median(values_sorted[:midpoint])
    q3 = median(values_sorted[midpoint:])
    spread = q3 - q1
    lower = q1 - (spread * multiplier)
    upper = q3 + (spread * multiplier)
    inliers_index = (lower <= array) & (array <= upper)
    return array[inliers_index]

In [None]:
def weights_for_dwa(array):
    '''
    The formula 4 in the https://doi.org/10.1093/gji/ggae049 article.

    Each value is weighted by the inverse of the sum of its absolute
    differences to all values.

    :param array: One-dimensional numeric array or sequence.
    :returns: numpy.ndarray with one weight per value. When every value is
        identical (this includes a single value) all sums are zero, and
        equal weights of 1 are returned instead of dividing by zero.
    '''
    array = np.asarray(array)
    if array.ndim == 1:
        array = np.expand_dims(array, axis=0)
    # Broadcasting a row against its transpose gives all pairwise differences.
    totals = np.abs(array - array.T).sum(axis=1)
    # A zero sum for one value means all values are equal, so either all
    # sums are zero or none is.
    if np.all(totals == 0):
        return np.ones_like(totals, dtype=float)
    return 1 / totals


def dwa(array):
    '''
    The formula 3 in the https://doi.org/10.1093/gji/ggae049 article.

    Weighted mean of `array`, using the weights from `weights_for_dwa`.
    '''
    weights = weights_for_dwa(array)
    numerator = sum(weights * array)
    denominator = sum(weights)
    return numerator / denominator

In [None]:
def find_optimum_pick_time(times, outlier_detector='Z-score'):
    """
    Combine several candidate pick times into a single time.

    Outliers are removed first. The remaining times are then averaged with
    `dwa`, provided `window_checking` accepts them (at least 2 values with a
    total spread of at most 1).

    :param times: One-dimensional numeric array of candidate times.
    :param outlier_detector: 'Z-score' (uses `ztest`, threshold 1) or
        'IQR' (uses `iqr`, multiplier 0.5).
    :returns: The weighted average, or ``np.nan`` when the window check fails.
    :raises ValueError: If `outlier_detector` is not a supported name.
    """
    if outlier_detector == 'Z-score':
        times_inlier = ztest(array=times, threshold=1)
    elif outlier_detector == 'IQR':
        times_inlier = iqr(array=times, multiplier=0.5)
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError(
            f"Unknown outlier_detector {outlier_detector!r}; "
            "expected 'Z-score' or 'IQR'.")
    cond = window_checking(array=times_inlier,
                           window_len=1,
                           min_num_picks=2)
    logging.info(f'{cond}')
    if not cond:
        return np.nan
    return dwa(array=times_inlier)

# Initializing the init file and starting logging.

In [None]:
# The init file only tells where the actual configuration file is located.
init_cfg = srconf.load_config('0-init-cfg.yml')
target_config = os.path.join(init_cfg.target_config_filepath,
                             init_cfg.target_config_filename)
cfg = srconf.load_config(target_config)
# Fill the current date-time into the dataset output path.
today_str = datetime.today().strftime('%Y-%m-%dT%H-%M-%S')
dataset_template = cfg.mk_dataset.path.outputs.dataset
cfg.mk_dataset.path.outputs.dataset = dataset_template.format(datetime_str=today_str)

In [None]:
# Set up logging from the loaded configuration; the dataset output folder is
# passed as the log file location.
srconf.configure_logging(
    level=cfg.log.level,
    log_format=cfg.log.format,
    mode=cfg.log.mode,
    colored_console=True,
    filepath=cfg.mk_dataset.path.outputs.dataset,
    filename_prefix=cfg.log.filename_prefix,
    filename=cfg.mk_dataset.path.outputs.log.filename,
)

In [None]:
# Line of 80 '+' characters, logged after each section of the start-up report.
log_separator = "+" * 80

In [None]:
# Record which notebook/script produced this log.
nb_path, nb_name = getting_filename_and_path_of_the_running_code()
msg = "".join([
    f"Logging has started for notebook: {nb_name}.\n",
    f"This file is located at: {nb_path}\n",
])
logging.info(msg)
logging.info(f"Separator: {log_separator}")

In [None]:
# Log every installed package together with its version, sorted by name.
imported_modules = {name.split('.')[0] for name in globals() if name in sys.modules}
installed_packages = {pkg.key: pkg.version for pkg in pkg_resources.working_set}
package_lines = [f"{package}=={installed_packages[package]}\n"
                 for package in sorted(installed_packages)]
msg = "Packages List:\n" + "".join(package_lines)
logging.info(msg)
logging.info(f"Separator: {log_separator}")

In [None]:
# Dump the configuration in use into the log.
msg = str(cfg)
logging.info(f'Configuration File:\n{msg}')
logging.info(f"Separator: {log_separator}")

# Loading Seismic Catalog and network details.

In [None]:
# Read the catalog and keep only the events that have picks.
# Note that `catalog` ends up as a plain list of events.
all_events = read_events(cfg.mk_dataset.path.inputs.catalog)
catalog = [event for event in all_events if event.picks != []]

In [None]:
# Station table: every column is read as text and empty cells become ''.
network_details = pd.read_csv(
    cfg.mk_dataset.path.inputs.network_details, dtype=str
).fillna(value='')
stations_list = network_details['station'].values

In [None]:
# Number of picks per day-of-year for the two stations of interest:
# {station: {julday: count}}.
d = {'SDHR': {}, 'JIGH': {}}
for ev in catalog:
    for pick in ev.picks:
        station_code = pick.waveform_id.station_code
        if station_code not in d:
            continue
        julday = pick.time.julday
        d[station_code][julday] = d[station_code].get(julday, 0) + 1

In [None]:
import matplotlib.pyplot as plt

# Histogram of the origin times (as timestamps) of the events.
otime = [ev.preferred_origin().time.timestamp for ev in catalog]
_ = plt.hist(otime)

In [None]:
# Print the phase frequency of the catalog with case sensitivity turned off.
src.print_phase_frequency(catalog, case_sensitivity=False)

#### Extracting the event parameters

From the catalog, we extract the event parameters and store them into a dictionary. Here, we only extract a few basic parameters on the source and its magnitude - if available. In addition, we define the split of the dataset into training/development/test partitions. We visualize one example.

#### Extracting the trace parameters

From each pick, we extract parameters about the trace and store them in a dictionary. Again, we only extract very basic parameters. We visualize one example.

In [None]:
# i: events in which a pick without a matching arrival was met (the rest of
#    that event's picks is skipped);
# j: picks for which an arrival was found before that point.
i = 0
j = 0
for ev in catalog:
    origin = ev.preferred_origin()
    for pick in ev.picks:
        arrival = src.select_arrival_related_to_the_pick(
            pick=pick, arrivals=origin.arrivals)
        if arrival == False:
            i += 1
            break
        j += 1
print(i, j)

#### Writing to SeisBench format

Now, we can combine all the above functions together to write a dataset in SeisBench format. For this, we first need to define the path. For this example, we are using the current working directory. A dataset consists of 2 components:
 - a metadata file, always called `metadata.csv`, which contains all the associated properties of the waveform examples (e.g. trace parameters, source parameters etc.).
 - a waveforms file, always called `waveforms.hdf5`, containing the raw waveforms.

To write the dataset, we use the `WaveformDataWriter` provided by SeisBench. The writer should always be used as a context manager, i.e., using the `with` statement, as shown below. This ensures that files are properly closed after writing and that teardown and cleanup operations are always called when exiting the context manager.

First, we need to set the data format for our dataset. We do this by assigning a dictionary to the `writer.data_format` group.

Next, we iterate over all events and all picks in the events. Using the functions above, we generate the event and trace metadata and load the waveforms. We then convert the waveforms to a numpy array using the function `stream_to_array` provided in `seisbench.util`.

As a last step, we hand the event metadata and the waveforms as numpy array over to the writer using `add_trace`. The writer then automatically takes care of writing out the data in the correct format. It also takes care of performance optimisations that we outline in the further considerations below.

In [None]:
# Waveform reader shared by all events in the main loop.
get_waveforms = get_data(
    root=cfg.mk_dataset.path.inputs.stream_root,
    pattern_path=cfg.mk_dataset.path.inputs.stream_pattern,
)

In [None]:
# model = sbm.EQTransformer()
# for n in model.list_pretrained():
#     print(n)
#     try:
#         model.from_pretrained(n)
#     except Exception as error:
#         print(error)

In [None]:
import seisbench.models as sbm
import torch

In [None]:
# The deep-learning pickers are only created when auto-labeling is enabled.
if cfg.mk_dataset.autolabeling:
    dl_pickers = {
        'PhaseNet_stead': sbm.PhaseNet.from_pretrained("stead"),
        'PhaseNet_original': sbm.PhaseNet.from_pretrained("original"),
        # 'PhaseNet_scedc': sbm.PhaseNet.from_pretrained("scedc"),
        # 'PhaseNet_instance': sbm.PhaseNet.from_pretrained("instance"),
        'EQTransformer_stead': sbm.EQTransformer.from_pretrained("stead"),
        # 'EQTransformer_original': sbm.EQTransformer.from_pretrained("original"),
        # 'EQTransformer_scedc': sbm.EQTransformer.from_pretrained("scedc"),
        # 'EQTransformer_instance': sbm.EQTransformer.from_pretrained("instance"),
        # sbm.GPD.from_pretrained("stead"),
        'GPD_original': sbm.GPD.from_pretrained("original"),
        # 'GPD_scedc': sbm.GPD.from_pretrained("scedc"),
        # 'GPD_instance': sbm.GPD.from_pretrained("instance"),
    }

    # Move every picker to the GPU when one is available.
    if not torch.cuda.is_available():
        logging.info("Running on CPU")
    else:
        for key, dl_picker in dl_pickers.items():
            dl_picker.cuda()
            logging.info(f"{key} Running on GPU")

In [None]:
# Output locations of the two SeisBench dataset files.
base_path = Path(cfg.mk_dataset.path.outputs.dataset)
metadata_path = base_path / "metadata.csv"
waveforms_path = base_path / "waveforms.hdf5"
# Optional folder for the cut waveforms saved as miniSEED files.
if cfg.mk_dataset.save_streams:
    stream_path = base_path / "mseed"
    os.makedirs(stream_path, exist_ok=True)
print(metadata_path, waveforms_path, sep='\n')

In [None]:
# Iterate over events and picks, write to SeisBench format.
# One trace is written per (event, station) pair; the metadata row merges the
# event, trace and phase dictionaries.
with sbd.WaveformDataWriter(metadata_path, waveforms_path) as writer:

    # Define data format
    writer.data_format = {
        "dimension_order": "CW",
        "component_order": "ZNE",
        "measurement": "velocity",
        "unit": "counts",
        "instrument_response": "not restituted",
    }
    n_all = len(catalog)
    for index, event in enumerate(catalog):
        # if index < 2000:
        #     continue
        # Progress report every 500 events.
        if index % 500 == 0:
            logging.info(f'{index} of {n_all} ({index/n_all*100:.2f}%)')
        event_params = get_event_params(event)
        # Only stations that are listed in the network details are processed.
        stations_in_event = {pick.waveform_id.station_code for pick in event.picks}
        stations_in_event = {station for station in stations_in_event if station in stations_list}
        for station_name in stations_in_event:
            picks = src.select_picks(picks=event.picks,
                                 station_name=station_name)
            if picks == []:
                continue
            ### Arrival attributes of all picks of this station, keyed per phase.
            phase_params = {}
            for pick in picks:
                param = get_phase_params(pick, event)
                phase_params.update(param)
            ### Warn when the picks of this station span 60 s or more.
            time_diff = get_picks_time_difference(picks)
            if max(time_diff) >= 60:
                logging.warning(f'losing pick, maximume is: {max(time_diff)}')
            ### The first pick in the list defines the trace metadata and the cut window.
            pick = picks[0]
            trace_params = get_trace_params(pick)
            waveforms = get_waveforms.get_data_related_to_pick(pick=pick)
            waveforms = waveforms.slice(
                starttime=pick.time-cfg.mk_dataset.cut_time.before,
                endtime=pick.time+cfg.mk_dataset.cut_time.after,
                nearest_sample=True
                )
            ### Check remaining data
            if len(waveforms) == 0:
                # No waveform data available
                # print('There is No WaveForms After Slicing!!!')
                logging.warning(f'There is No WaveForms After Slicing!!! [station: {station_name}]')
                continue
            ### Optional noise padding before/after the cut window.
            if (cfg.mk_dataset.noisepad.before!=0) or (cfg.mk_dataset.noisepad.after!=0):
                waveforms = srw.st_noise_padding(
                    st=waveforms,
                    stime=cfg.mk_dataset.noisepad.before,
                    etime=cfg.mk_dataset.noisepad.after,
                    std_windows=(cfg.mk_dataset.noisepad.std_start, cfg.mk_dataset.noisepad.std_end)
                    )
            ###
            # Check that the traces have the same sampling rate
            srw.waveform.uni_sps(st=waveforms, )
            sampling_rate = waveforms[0].stats.sampling_rate
            number_of_samples = waveforms[0].data.size
            # The third return value of stream_to_array is not used here.
            actual_t_start, data, _ = sbu.stream_to_array(
                waveforms,
                component_order=writer.data_format["component_order"],
            )
            #
            trace_params["trace_sample_number"] = number_of_samples
            trace_params["trace_sampling_rate_hz"] = sampling_rate
            trace_params["trace_start_time"] = str(actual_t_start)
            ### Sum the last field of every gap entry per component, ordered Z, N, E
            ### (field 3 of a gap entry is matched against the component letter).
            gaps = [0, 0, 0]
            for gap_detail in waveforms.get_gaps():
                if gap_detail[3].endswith('Z'):
                    gaps[0] += gap_detail[-1]
                elif gap_detail[3].endswith('N'):
                    gaps[1] += gap_detail[-1]
                elif gap_detail[3].endswith('E'):
                    gaps[2] += gap_detail[-1]
            trace_params["trace_gaps"] = gaps
            ### First and third quartile of every row of `data`.
            trace_params["trace_Q1"] = [np.percentile(_data, 25) for _data in data]
            trace_params["trace_Q3"] = [np.percentile(_data, 75) for _data in data]
            ###

            # Sample index and SNR of every catalog pick of this station.
            for pick in picks:
                sample = (pick.time - actual_t_start) * sampling_rate
                sample = round(sample)
                phase_params[f"trace_{pick.evaluation_mode}_{pick.phase_hint}_arrival_sample"] = sample
                trace_params[f"trace_{pick.phase_hint}_snr"] = srw.health_check.routine.compute_snr(
                    data=data, pick_idx=sample, noise_window=100, signal_window=200, axis=1, domain='time')
                trace_params[f"trace_{pick.phase_hint}_snr-dB"] = 20 * np.log10(trace_params[f"trace_{pick.phase_hint}_snr"])
            # trace_params["trace_mean"] = np.mean(data, axis=1)
            trace_params["trace_rms"] = np.sqrt(np.mean(np.power(data, 2), axis=1))
            trace_params["trace_median"] = np.median(data, axis=1)
            trace_params["trace_max"] = np.max(data, axis=1)
            trace_params["trace_min"] = np.min(data, axis=1)
            ### Optionally keep the cut waveforms as miniSEED files.
            if cfg.mk_dataset.save_streams:
                otime = event_params['source_origin_time'].replace('-', '').replace(':', '')[:-5]
                waveforms.write(stream_path/f'{index}_{otime}_{station_name}.msd', format='MSEED')
            ### Auto Labeling
            if cfg.mk_dataset.autolabeling:
                # `data` was extracted above, so tapering and zero-padding the
                # stream here only affects what the pickers see.
                stime = min([tr.stats.starttime for tr in waveforms])
                etime = max([tr.stats.endtime for tr in waveforms])
                waveforms.taper(0.2)
                ######################################################################
                waveforms.trim(
                    starttime=stime-(60-cfg.mk_dataset.cut_time.before),
                    endtime=etime+(60-cfg.mk_dataset.cut_time.after),
                    pad=True, fill_value=0)
                ##
                # waveforms_padded = srw.st_noise_padding(
                #     st=waveforms,
                #     stime=60-cfg.cut_time.before,
                #     etime=60-cfg.cut_time.after,
                #     std_windows=(cfg.noisepad.std_start, cfg.noisepad.std_end))
                waveforms_padded = waveforms
                ######################################################################
                auto_label = auto_labeling(stream=waveforms_padded, dl_pickers=dl_pickers)
                for phase_hint, auto_picks in auto_label.items():
                    for picker_dataset_name, picker_time in auto_picks.items():
                        # print(phase_hint, picker_dataset_name, picker_time)
                        sample = (picker_time - actual_t_start) * sampling_rate
                        # Round like the catalog picks above: int() truncates, so a
                        # floating-point result just below a whole number would
                        # land one sample too early.
                        phase_params[f"trace_autoDL_{picker_dataset_name}_{phase_hint}_arrival_sample"] = round(sample)
            ####
            writer.add_trace({**event_params, **trace_params, **phase_params}, data)
        # break

In [None]:
# Re-run the pickers on the variables left over from the last iteration of the
# main loop and log what they pick. The dataset has already been written at
# this point, so the values stored in `trace_params` are for inspection only.
auto_label = auto_labeling(stream=waveforms, dl_pickers=dl_pickers)
for phase_hint, auto_picks in auto_label.items():
    for picker_dataset_name, picker_time in auto_picks.items():
        logging.info(f'{phase_hint}\n{picker_dataset_name}\n{picker_time}')
        # print(phase_hint, picker_dataset_name, picker_time)
        sample = (picker_time - actual_t_start) * sampling_rate
        # round() instead of int(): truncation can shift the index by one
        # sample when the product is a hair below a whole number.
        trace_params[f"trace_{phase_hint}_arrival_sample_autoDL_{picker_dataset_name}"] = round(sample)

## Considerations for converting datasets

As outlined above, this tutorial provides a very minimal example of converting a dataset. Here we outline additional considerations that should be taken into account when preparing a dataset.

- **Grouping picks**: In this example, we created one trace for each pick. Naturally, traces will overlap if multiple picks, e.g., P and S phases, are available for an event at a station. For an example implementation of this grouping operation, have a look [here](https://github.com/seisbench/seisbench/blob/df94dcd86ce66d6a2ee2bd00da3857259fe579bd/seisbench/data/ethz.py#L109) and in the subsequent lines.
- **Adding station information**: In this example, we added no station information except its name. In practice, it will often be helpful for users to incorporate, for example, the location of the station. We skipped this step here, because it requires loading station inventories through FDSN. For an example implementation, have a look [here](https://github.com/seisbench/seisbench/blob/df94dcd86ce66d6a2ee2bd00da3857259fe579bd/seisbench/data/ethz.py#L315).
- **Memory requirements**: Internally, the `WaveformDataWriter` writes out the waveforms continuously in blocks (see point below), but keeps all metadata in memory until the dataset is complete. For very large datasets (or very detailed metadata) this can result in several gigabytes of memory consumption. If you are writing such datasets, make sure the available memory on your machine is sufficient.
- **Waveform blocks**: Instead of writing each waveform separately, waveforms are written out in blocks. This massively improves IO performance. Have a look at [the documentation](https://seisbench.readthedocs.io/en/stable/pages/data_format.html#traces-blocks) for details on the strategy. We expect that in nearly all cases using the default setting will be a good choice.
- **FDSN considerations**: When converting very large datasets, the performance might be limited by the performance of the FDSN webservice. Unfortunately, downloading lots of short waveforms (as required for many machine learning applications) does not seem to be the most favorable use case for FDSN. This leads to rather slow performance when naively downloading the waveforms as outlined above. Instead, it is often helpful to issue [bulk requests](https://docs.obspy.org/master/packages/autogen/obspy.clients.fdsn.client.Client.get_waveforms_bulk.html). In addition, it might be a good choice to first download the waveforms and cache them locally, for example, in .mseed format, and then convert them to SeisBench.

For further details on the data format, check out [the data format specification in the SeisBench documentation](https://seisbench.readthedocs.io/en/stable/pages/data_format.html#traces-blocks).