# Base module

In [None]:
from importlib import reload
import os
from datetime import timedelta

import numpy as np

import h5py

from obspy.clients.filesystem.sds import Client
from obspy.clients.fdsn import RoutingClient
from obspy.core import UTCDateTime as UTC
from obspy.signal import util

import matplotlib.pyplot as plt
plt.style.use('tableau-colorblind10')

#from data_quality_control import processing
from data_quality_control import base

In [None]:
# Set the level of the `base` module's logger to INFO.
base.logger.setLevel("INFO")

# Create ProcessingParameters

Settings for amplitude and PSD computation are handled
via the `ProcessingParameters` class.

If using a batch processing class, you can either pass a
ProcessingParameters object (keyword `procparams`) or set
each parameter directly as keyword argument.

## Use default parameters

In [None]:
#reload(base)
pp = base.ProcessingParameters()
print(pp)

Of course, we can also pass them explicitly.

In [None]:
#reload(base)
pp = base.ProcessingParameters(**base.default_processing_params)
print(pp)

## Custom parameters

Change all parameters.

In [None]:
# Build a parameter object in which every setting is given explicitly.
pp2 = base.ProcessingParameters(
    overlap=300,
    amplitude_frequencies=(1, 10),
    nperseg=1024,
    winlen_seconds=1800,
    proclen_seconds=3600*24,
)
print(pp2)

If we only want to change some default settings, we only need to
specify those.

In [None]:
#reload(base)
pp3 = base.ProcessingParameters(overlap=300)
print(pp3)

# NSCProcessing

The NSCProcessor is another low-level class. It does the actual 
work of extracting the amplitude
information and computing the power spectral densities from the
seismic data.

It is usually called by the batch processing classes.

In [None]:
# Stream to process, given as network, station, location and channel code.
network, station, location, channel = 'GR', 'BFO', '', 'HHZ'

# Output directory (not referenced again in the visible cells).
outdir = '.'

# Absolute path of the sample SDS archive and the routing type that is
# handed to the RoutingClient.
sds_root = os.path.abspath(os.path.join('..', 'sample_sds'))
inventory_routing_type = "eida-routing"

In [None]:
# Waveform data are read from the local SDS archive under `sds_root`.
sdsclient = Client(sds_root)
# Inventory (station metadata) is requested through the routing client.
invclient = RoutingClient(inventory_routing_type)

# Show which network/station/location/channel codes the archive holds.
sdsclient.get_all_nslc()

We need to give network, station, location and channel. These
are passed to the `get_waveforms()` and `get_inventory()` methods
of the obspy-clients. Here, we use the sds-client for the data
and the Eida-RoutingClient for the inventory.
The processing parameters are held in a ProcessingParameters object
that is stored as an attribute. They can be set via the corresponding
keywords.

If
- no keywords are given, default processing parameters are used
- with keyword `procparams` a ProcessingParameters-object can be passed
- additional keywords override default values of those in `procparams`


In [None]:
#reload(base)
# `procparams=pp` supplies the base settings; the additional keyword
# `overlap=30` overrides the corresponding value from `pp`.
# NOTE(review): channel is passed before location here, whereas the
# GenericProcessor calls further down use (..., location, channel).
# Confirm against the NSCProcessor signature.
nscp = base.NSCProcessor(network, station, channel,location, 
                               sdsclient, invclient, procparams=pp,
                               overlap=30)
print(nscp)

The actual processing is started by calling the method `process()`, 
which needs a start and end time.

In [None]:
# Start and end of the time range that is handed to `process()`.
startdate = UTC("2020-12-20")
enddate = UTC("2021-01-15")

In [None]:
output = nscp.process(startdate, enddate)

The result is an object that bundles the results (amplitudes,
spectra, frequency axis) with the time range that was processed
and the processing parameters.

In [None]:
print(output)

In [None]:
output.plot_amplitudes();

From the error log, we can see that there wasn't data for all days.
Most of the missing days in the database at the beginning of the
time range were already dismissed from the output automatically.
Nevertheless, there is a single empty day at the beginning. This is
because, due to the small overlap that we require between processing
units (`proclen`, usually days), the algorithm looks for data in a
time span that is slightly larger than the one we aim for. So it
finds data from the next processing unit (25-Dec) and therefore the
processing is triggered, even though we discard the results in the end.

However, once data was successfully processed (even if it resulted in
NaNs only), the algorithm keeps adding data until it reaches the end
of the time range. If no data is found, an array of NaNs of the
corresponding shape is added. This is because the algorithm cannot
know whether there will eventually be more data or not.

We know this only at the end. Then, the method `trim_nan()` can be
used to remove empty slices and adjust the start and endtime
accordingly.

In [None]:
output.trim_nan()

In [None]:
print(output)

In [None]:
# Plot the amplitudes again after trimming, then add a title and
# axis labels to the current figure.
output.plot_amplitudes();
plt.suptitle("Amplitude data matrix for trimmed data");
plt.xlabel('windows (here: ~hours)')
plt.ylabel('proclen (here ~days)');

The output-class can dump its content to an HDF5 file.

In [None]:
output.to_file()

Read the file again from disk.

In [None]:
# Load processed data from an HDF5 file in the working directory
# (presumably the one written by `output.to_file()` above; the name
# appears to encode the NSLC code and the trimmed date range — confirm).
data = base.BaseProcessedData().from_file(
    "GR.BFO..HHZ_2020-12-25_2021-01-09.hdf5")

In [None]:
data

In [None]:
data.plot_amplitudes();

In [None]:
# Pass np.log (natural logarithm) as `func` to obtain a logarithmic
# display (presumably applied to the PSD values before plotting — confirm).
data.plot_psds(func=np.log);
plt.suptitle("logarithmic PSDs");

# Batch processing

The GenericProcessor class provides a workflow that processes
a large time range of data at once. The results are written to
HDF5 files.

The GenericProcessor class is intended to provide low-level
functionality. It can be customized to different database
layouts and corresponding data clients, e.g. SDS file system.
This can result in a simpler interface and a more generic way of
providing NSLC information, e.g. using lists or wildcards.

At the moment, we take NSLC codes only as fixed strings.

Data is processed in batches of years, months, days, or hours
(`fileunit`) with one output file per batch.

In [None]:
# Settings used to build the ProcessingParameters for batch processing.
overlap = 60  # alternative value: 3600
fmin = 4
fmax = 14
nperseg = 2048
winlen_in_s = 60 * 60
proclen = 24 * 60 * 60

In [None]:
# Bundle the previously defined settings into one parameter object.
pp = base.ProcessingParameters(
    overlap=overlap,
    amplitude_frequencies=(fmin, fmax),
    nperseg=nperseg,
    winlen_seconds=winlen_in_s,
    proclen_seconds=proclen,
)
print(pp)

## Year-wise files

In [None]:
#reload(base)
# fileunit="year" selects year-sized batches, i.e. one output file per year.
proc = base.GenericProcessor(network, station, location, channel, 
                             sdsclient, invclient, fileunit="year",
                             procparams=pp)

# Display the processor's representation as the cell output.
proc

In [None]:
proc.expand_nslc()
print(proc._networks)

In [None]:
proc.process(startdate, enddate, force_new_file=True)

### Results

In [None]:
res = base.BaseProcessedData().from_file("GR.BFO..HHZ_2020.hdf5")

res

In [None]:
res.trim_nan()
res

## Monthly

First, we use the same processing parameters as before.

In [None]:
#reload(base)
# Same stream, clients and parameters as for the year-wise run; only the
# batch size changes: fileunit="month" gives one output file per month.
proc = base.GenericProcessor(network, station, location, channel, 
                             sdsclient, invclient, fileunit="month",
                             procparams=pp)

# Display the processor's representation as the cell output.
proc

In [None]:
proc.process(startdate, enddate, force_new_file=True)

### Results

The shape of the data in the files indicates that there are only
31 days of data (the number of days in December).

The filename includes the month now.

In [None]:
res = base.BaseProcessedData().from_file("GR.BFO..HHZ_2020-12.hdf5")
res

We can again trim off the NaN data, which results in the same
dimensions as for the year-batches because the database was
the same as before. It only contains data for the last 7 days of
Dec-2020.

In [None]:
res.trim_nan()
res

In [None]:
res.plot_amplitudes();

# Compare

In [None]:
res1 = base.BaseProcessedData().from_file("GR.BFO..HHZ_2020.hdf5")
res1.trim_nan()
print(res1)

In [None]:
res2 = base.BaseProcessedData().from_file("GR.BFO..HHZ_2020-12.hdf5")
res2.trim_nan()
print(res2)

Compare the data arrays of the two files

In [None]:
# True only if every amplitude value agrees within np.isclose's default
# tolerances; equal_nan=True counts NaNs at the same position as equal.
np.all(np.isclose(res2.amplitudes, res1.amplitudes, equal_nan=True))

In [None]:
# Same element-wise comparison for the PSD arrays; NaNs at the same
# position count as equal.
np.all(np.isclose(res2.psds, res1.psds, equal_nan=True))

## Different window length

In [None]:
winlen_in_s = 1800

In [None]:
#reload(base)
pp = base.ProcessingParameters(overlap=overlap, 
                             amplitude_frequencies=(fmin, fmax),
                             nperseg=nperseg,
                             winlen_seconds=winlen_in_s,
                             proclen_seconds=proclen)
print(pp)

In [None]:
#reload(base)
proc = base.GenericProcessor(network, station, location, channel, 
                             sdsclient, invclient, fileunit="month",
                             procparams=pp,)

proc

In [None]:
proc.process(startdate, enddate, force_new_file=True)

### Results

The 2nd axis of the data matrix now has 48 entries because we
used half the size of the processing window. The resulting
amplitude data look similar in structure to before, which is
good.

In [None]:
res = base.BaseProcessedData().from_file("GR.BFO..HHZ_2020-12.hdf5")
res

In [None]:
res.trim_nan()
res

In [None]:
res.plot_amplitudes();

## Different processing length

In [None]:
# Window length stays at half an hour; processing length becomes 12 hours.
winlen_in_s = 30 * 60
proclen = 12 * 60 * 60

In [None]:
#reload(base)
pp = base.ProcessingParameters(overlap=overlap, 
                             amplitude_frequencies=(fmin, fmax),
                             nperseg=nperseg,
                             winlen_seconds=winlen_in_s,
                             proclen_seconds=proclen)
print(pp)

In [None]:
# Keep the module reload disabled, as in every other cell: reloading
# `base` at this point would create new class objects while `pp` (built
# in the previous cell) would still be an instance of the old class.
#reload(base)
# Month-wise batches with the 12-hour processing length set in `pp`.
proc = base.GenericProcessor(network, station, location, channel,
                             sdsclient, invclient, fileunit="month",
                             procparams=pp)

# Display the processor's representation as the cell output.
proc

In [None]:
proc.process(startdate, enddate, force_new_file=True)

In [None]:
res = base.BaseProcessedData().from_file("GR.BFO..HHZ_2020-12.hdf5")
res

In [None]:
res.trim_nan()
res

In [None]:
res.plot_amplitudes();

In [None]:
res = base.BaseProcessedData().from_file("GR.BFO..HHZ_2021-01.hdf5")
res

In [None]:
res.trim_nan()
res

In [None]:
res.plot_amplitudes();