In [None]:
import os
import sys
import numpy as np
import pandas as pd
# Notebook-wide pandas display settings: show up to 200 rows, every column, and a 240-character-wide layout.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)
# -1 asks pandas not to truncate cell contents.
# NOTE(review): pandas 1.0+ deprecates -1 for this option in favour of None — confirm against the version echoed below.
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.width', 240)
# Echo the pandas version in use as the cell output.
pd.__version__

In [None]:
import datetime
import pytz
import matplotlib
import matplotlib.dates
import matplotlib.pyplot as plt
matplotlib.rcParams['figure.figsize'] = (16.0, 9.0)
matplotlib.rcParams['figure.max_open_warning'] = 100
matplotlib.__version__

In [None]:
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [None]:
# Progress bar helper to indicate that slow tasks have not stalled
from tqdm.auto import tqdm

In [None]:
# Widen the notebook container to 90% of the browser window so wide tables and figures are not clipped.
# `display` and `HTML` are taken from the public `IPython.display` module; importing them from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [None]:
# Location of the ensemble picks file loaded by this notebook.
PICKS_PATH = r"C:\data_cache\Picks\20190320\ensemble.p.txt"
#PICKS_PATH = r"C:\data_cache\Picks\20190219\ensemble_small.p.txt"

# Column dtypes handed to pandas when reading the picks file. Columns are listed once, in their
# original declaration order; text and integer columns are named explicitly and everything else
# is read as float64.
_PICK_TEXT_COLUMNS = ('#eventID', 'net', 'sta', 'cha', 'phase')
_PICK_INT_COLUMNS = ('bandIndex', 'nSigma')
_PICK_COLUMNS = (
    '#eventID', 'originTimestamp', 'mag', 'originLon', 'originLat', 'originDepthKm',
    'net', 'sta', 'cha', 'pickTimestamp', 'phase',
    'stationLon', 'stationLat', 'az', 'baz', 'distance', 'ttResidual', 'snr',
    'qualityMeasureCWT', 'domFreq', 'qualityMeasureSlope', 'bandIndex', 'nSigma',
)
dtype = {}
for _col in _PICK_COLUMNS:
    if _col in _PICK_TEXT_COLUMNS:
        dtype[_col] = object
    elif _col in _PICK_INT_COLUMNS:
        dtype[_col] = np.int64
    else:
        dtype[_col] = np.float64

In [None]:
def utc_time_string_to_plottable_datetime(utc_timestamp_str):
    """
    Convert a UTC timestamp string to datetime type that is plottable by matplotlib

    :param utc_timestamp_str: ISO-8601 UTC timestamp string
    :type utc_timestamp_str: str
    :return: Plottable datetime value
    :rtype: datetime.datetime with tzinfo
    """
    # Imported locally: this function is called before the notebook-level `import obspy` cell is
    # reached, so relying on the global name fails on a fresh kernel (Restart & Run All).
    import obspy
    utc_time = obspy.UTCDateTime(utc_timestamp_str)
    # float(UTCDateTime) is the POSIX timestamp; build a naive UTC datetime from it and attach UTC tzinfo.
    return pytz.utc.localize(datetime.datetime.utcfromtimestamp(float(utc_time)))

In [None]:
# Temporary deployments keyed by network code. Each value is a tuple of
# (first timestamp, second timestamp, colour code, label); the two timestamps are converted to
# plottable datetimes. NOTE(review): the timestamps look like deployment start/end — confirm.
_TEMP_DEPLOYMENT_TABLE = (
    ('7X', '2009-06-16T03:42:00.000000Z', '2011-04-01T23:18:49.000000Z', 'C7'),
    ('7D', '2012-01-01T00:01:36.000000Z', '2014-03-27T15:09:51.000000Z', 'C1'),
    ('7F', '2012-12-31T23:59:59.000000Z', '2014-11-15T00:43:14.000000Z', 'C3'),
    ('7G', '2014-01-01T00:00:06.000000Z', '2016-02-09T21:04:29.000000Z', 'C4'),
    ('OA', '2017-09-13T23:59:13.000000Z', '2018-11-28T01:11:14.000000Z', 'C8'),
)
TEMP_DEPLOYMENTS = {
    code: (utc_time_string_to_plottable_datetime(first),
           utc_time_string_to_plottable_datetime(second),
           colour, 'Deployment ' + code)
    for code, first, second, colour in _TEMP_DEPLOYMENT_TABLE
}

In [None]:
IRIS_AU_STATIONS_FILE = r"C:\software\hiperseis\seismic\gps_corrections\AU_irisws-fedcatalog_20190305T012747Z.txt"

In [None]:
# Load the full picks table. The file is space-delimited with a header row; the separator is
# passed by keyword because newer pandas releases no longer accept it positionally.
df_raw_picks = pd.read_csv(PICKS_PATH, sep=' ', header=0, dtype=dtype)
len(df_raw_picks)

## Generate catalog of major regional events (mag 8+) for overlays

In [None]:
if True:
    # Picks belonging to magnitude 8+ events. An explicit copy is taken so that adding the 'day'
    # column below writes to an independent frame rather than a slice of df_raw_picks
    # (avoids pandas' SettingWithCopyWarning).
    df_mag8 = df_raw_picks[df_raw_picks['mag'] >= 8.0].copy()
    # UTC calendar day (YYYY-MM-DD) of each pick's event origin time
    df_mag8['day'] = df_mag8['originTimestamp'].transform(datetime.datetime.utcfromtimestamp).transform(lambda x: x.strftime("%Y-%m-%d"))
    df_mag8 = df_mag8.sort_values(['day', 'originTimestamp'])

    # Number of mag 8+ pick records per day
    day_mag8_count = [(day, len(df_day)) for day, df_day in df_mag8.groupby('day')]
    dates, counts = zip(*day_mag8_count)
    mag8_dict = {'date': dates, 'counts': counts}
    mag8_events_df = pd.DataFrame(mag8_dict, columns=['date', 'counts'])

    # Keep only days with at least this many pick records
    event_count_threshold = 400
    significant_events = mag8_events_df[mag8_events_df['counts'] >= event_count_threshold]
    significant_events = significant_events.set_index('date')

    # Human-readable names for known events, keyed by day. Assigning through .loc also adds a row
    # (with no count) for any listed day that did not pass the threshold above.
    event_names = {
        '2001-06-23': '2001 South Peru Earthquake',
        '2001-11-14': '2001 Kunlun earthquake',
        '2002-11-03': '2002 Denali earthquake',
        '2003-09-25': '2003 Tokachi-Oki earthquake',
        '2004-12-26': '2004 Indian Ocean earthquake and tsunami',
        '2005-03-28': '2005 Nias-Simeulue earthquake',
        '2009-09-29': '2009 Samoa earthquake and tsunami',
        '2010-02-27': '2010 Chile earthquake',
        '2011-03-11': '2011 Tohoku earthquake and tsunami',
        '2012-04-11': '2012 Indian Ocean earthquakes',
        '2013-02-06': '2013 Solomon Islands earthquakes',
        '2013-09-24': '2013 Balochistan earthquakes',
        '2014-04-01': '2014 Iquique earthquake',
        '2015-09-16': '2015 Illapel earthquake',
        '2016-08-24': '2016 Myanmar earthquake',
    }
    for event_day, event_name in event_names.items():
        significant_events.loc[event_day, 'name'] = event_name

    display(significant_events)

In [None]:
# Query time period for source dataset
import obspy

print(obspy.UTCDateTime(df_raw_picks['originTimestamp'].min()))
print(obspy.UTCDateTime(df_raw_picks['originTimestamp'].max()))

In [None]:
# raw_AU_mask = (df_raw_picks['net'] == 'AU')
# min_AU_date = df_raw_picks.loc[raw_AU_mask, 'originTimestamp'].min()
# max_AU_date = df_raw_picks.loc[raw_AU_mask, 'originTimestamp'].max()
# obspy.UTCDateTime(min_AU_date), obspy.UTCDateTime(max_AU_date)

In [None]:
import datetime
# datetime.datetime.fromtimestamp(0.0, datetime.timezone.utc)

In [None]:
# Priority order of trusted channels
# channel_pref = ['HHZ', 'HHZ_10', 'H?Z', 'BHZ_00', 'BHZ', 'BHZ_10', 'B?Z', 'S?Z', 'SHZ', '???', '?']
channel_pref = ['HHZ', 'HHZ_10', 'H?Z', 'BHZ_00', 'BHZ', 'BHZ_10', 'B?Z', 'S?Z', 'SHZ']
# channel_pref = ['HHZ', 'HHZ_10', 'H?Z', 'BHZ_00', 'BHZ', 'BHZ_10', 'B?Z']

In [None]:
# Keep only picks recorded on the trusted channels listed in channel_pref; picks on any other
# channel are not considered reliable enough to use.
# Note: reset_index() without drop=True keeps the original row labels in a new 'index' column
# and gives df_picks a fresh 0..n-1 index, so its index no longer lines up with df_raw_picks.
df_picks = df_raw_picks[df_raw_picks['cha'].isin(channel_pref)].reset_index()
# Report the time span covered by the remaining picks, then the number of rows kept.
print(obspy.UTCDateTime(df_picks['originTimestamp'].min()))
print(obspy.UTCDateTime(df_picks['originTimestamp'].max()))
len(df_picks)

In [None]:
# Remove unused columns for readability
df_picks = df_picks[['#eventID', 'originTimestamp', 'mag', 'originLon', 'originLat', 'originDepthKm', 'net', 'sta', 'cha', 'pickTimestamp', 'phase', 
                     'stationLon', 'stationLat', 'az', 'baz', 'distance', 'ttResidual', 'snr', 'qualityMeasureCWT', 'qualityMeasureSlope', 'nSigma']]
print(obspy.UTCDateTime(df_picks['originTimestamp'].min()))
print(obspy.UTCDateTime(df_picks['originTimestamp'].max()))

In [None]:
if True:
    # Some networks require a custom minimum-date filter to remove isolated records that fall
    # outside the date range of the rest of the network. Each entry is (network code, minimum date);
    # rows of that network whose event origin time is earlier than the minimum date are dropped.
    DATE_FILTER = (
        ('7D', pd.Timestamp(datetime.datetime(2010, 1, 1))), 
        ('7G', pd.Timestamp(datetime.datetime(2010, 1, 1)))
    )
    before = len(df_picks)
    for net, min_date in DATE_FILTER:
        # originTimestamp is compared as a POSIX float, so the cut-off date is converted the same way
        date_mask = (df_picks['net'] == net) & (df_picks['originTimestamp'] < min_date.timestamp())
        df_picks = df_picks[~date_mask]
    after = len(df_picks)
    # NOTE(review): the count reported here is a number of df_picks rows (pick records), although
    # the message calls them events.
    print('Removed {} events due to timestamps'.format(before - after))

In [None]:
def getNetworkStations(df, netcode):
    """
    List the distinct station codes recorded under a network.

    :param df: Picks table with 'net' and 'sta' columns
    :param netcode: Network code to select
    :return: Sorted list of the unique 'sta' values among rows whose 'net' equals netcode
    :rtype: list
    """
    in_network = (df['net'] == netcode)
    station_codes = df.loc[in_network, 'sta'].unique().tolist()
    station_codes.sort()
    return station_codes

In [None]:
def getNetworkMean(df, netcode):
    """
    Mean station position over all rows of a network.

    The mean is taken over every row whose 'net' equals netcode, so a station position
    contributes once for each row it appears in.

    :param df: Picks table with 'net', 'stationLat' and 'stationLon' columns
    :param netcode: Network code to select
    :return: (mean latitude, mean longitude)
    :rtype: tuple
    """
    positions = df.loc[df['net'] == netcode, ['stationLat', 'stationLon']]
    return (positions['stationLat'].mean(), positions['stationLon'].mean())

In [None]:
def getNetworkDateRange(df, netcode):
    """
    Earliest and latest event origin times among the rows of a network.

    :param df: Picks table with 'net' and 'originTimestamp' columns
    :param netcode: Network code to select
    :return: (earliest, latest) origin time
    :rtype: tuple of obspy.UTCDateTime
    """
    origin_times = df.loc[df['net'] == netcode, 'originTimestamp']
    earliest = origin_times.min()
    latest = origin_times.max()
    return (obspy.UTCDateTime(earliest), obspy.UTCDateTime(latest))

def getStationDateRange(df, netcode, statcode):
    """
    Earliest and latest event origin times among the rows of one station of a network.

    :param df: Picks table with 'net', 'sta' and 'originTimestamp' columns
    :param netcode: Network code to select
    :param statcode: Station code to select within that network
    :return: (earliest, latest) origin time
    :rtype: tuple of obspy.UTCDateTime
    """
    is_station = (df['net'] == netcode) & (df['sta'] == statcode)
    origin_times = df.loc[is_station, 'originTimestamp']
    earliest = origin_times.min()
    latest = origin_times.max()
    return (obspy.UTCDateTime(earliest), obspy.UTCDateTime(latest))

In [None]:
def getOverlappingDateRange(df, ref_station, target_network):
    """
    Date range of the events that were recorded by both the reference and the target stations.

    Both selectors map column names to lists of accepted values (e.g. {'net': [...], 'sta': [...]}).
    A row matches a selector when each listed column holds one of that column's accepted values;
    the columns are tested independently of one another.

    :param df: Picks table with '#eventID', 'originTimestamp' and the selector columns
    :param ref_station: Selector for the reference station rows
    :param target_network: Selector for the target station rows
    :return: (earliest, latest) origin time of the events having at least one reference row and at
        least one target row, or (None, None) when there is no such event
    :rtype: tuple of obspy.UTCDateTime, or tuple of None
    """
    mask_ref = df[list(ref_station)].isin(ref_station).all(axis=1)
    mask_targ = df[list(target_network)].isin(target_network).all(axis=1)
    mask = mask_ref | mask_targ
    if not np.any(mask):
        return (None, None)
    # Events seen by both sides = intersection of the two sets of event IDs. Missing IDs are
    # dropped because they cannot identify a common event.
    ref_events = set(df.loc[mask_ref, '#eventID'].dropna().unique())
    targ_events = set(df.loc[mask_targ, '#eventID'].dropna().unique())
    keep_events = ref_events & targ_events
    if not keep_events:
        # No common event: there is no overlap to report (min/max of nothing would be NaN).
        return (None, None)
    df_nets = df.loc[mask & df['#eventID'].isin(list(keep_events))]
    origin_times = df_nets['originTimestamp']
    return (obspy.UTCDateTime(origin_times.min()), obspy.UTCDateTime(origin_times.max()))

In [None]:
def getIrisStationCodes(src_file, original_network):
    """
    Get station codes listed in IRIS whose network is original_network.

    We need this to ensure we get complete coverage of chosen network, as it is possible some such
    codes appear only under other network codes such as IR, GE, etc.. in the event catalog.

    :param src_file: Pipe-delimited station listing with a header row providing (after stripping
        surrounding whitespace) the columns 'Network', 'Station', 'Latitude' and 'Longitude'
    :param original_network: Network code whose stations are wanted
    :return: Table indexed by station code ('sta') holding the mean 'lat' and 'lon' of each station
    :rtype: pandas.DataFrame
    :raises AssertionError: if the listed latitudes or longitudes of any one station have a
        population standard deviation that is not below 1.0
    """
    catalog = pd.read_csv(src_file, header=0, sep='|')
    catalog.columns = [name.strip() for name in catalog.columns.tolist()]
    net_catalog = catalog.loc[catalog['Network'] == original_network]

    station_codes = []
    mean_lat = []
    mean_lon = []
    for sta in sorted(net_catalog['Station'].unique()):
        sta_rows = net_catalog.loc[net_catalog['Station'] == sta]
        # Same treatment for both coordinates: record the mean, and insist that the listed
        # positions for this station code agree with each other.
        for column, means in (('Latitude', mean_lat), ('Longitude', mean_lon)):
            coords = sta_rows[column]
            means.append(coords.mean())
            std_dev = coords.std(ddof=0)
            assert std_dev < 1.0, "{}: {}".format(sta, std_dev)
        station_codes.append(sta)

    stations_df = pd.DataFrame({'sta': station_codes, 'lat': mean_lat, 'lon': mean_lon})
    return stations_df.set_index(['sta'])

In [None]:
def determineAlternateMatchingCodes(df, iris_file, original_network):
    """
    Find stations from other networks in df with the same station codes, but different network
    codes, whose positions match the stations of the same code in the original network.

    :param df: Picks table with 'net', 'sta', 'stationLat' and 'stationLon' columns
    :param iris_file: Station listing passed on to getIrisStationCodes
    :param original_network: Network code whose stations are being matched
    :return: (networks, stations) as two parallel tuples, one entry per matching (net, sta) pair,
        ordered by network then station code. Both tuples are empty when nothing matches.
    :rtype: tuple of tuples
    """
    from obspy.geodetics import locations2degrees

    matching_network_stn_iris_df = getIrisStationCodes(iris_file, original_network)

    mask_iris_stns = df['sta'].isin(matching_network_stn_iris_df.index)
    mask_not_orig = (df['net'] != original_network)
    df_orig_stns_codes = df.loc[mask_iris_stns & mask_not_orig]
    if df_orig_stns_codes.empty:
        # No record shares a station code with the original network: nothing to match.
        return (), ()

    # The distance depends only on the station code and recorded position, so compute it once per
    # distinct (net, sta, position) instead of once per record.
    candidates = df_orig_stns_codes[['net', 'sta', 'stationLat', 'stationLon']].drop_duplicates()

    # For each candidate, compute its distance (degrees) from the known corresponding original station location
    def distToOrigStn(row, orig_df):
        orig_df_sta = orig_df.loc[row['sta']]
        return locations2degrees(row['stationLat'], row['stationLon'], orig_df_sta['lat'], orig_df_sta['lon'])
    print("Computing distances to original network station locations...")
    distances_from_orig = candidates.apply(lambda r: distToOrigStn(r, matching_network_stn_iris_df), axis=1)

    # A candidate matches when it lies within 1 degree of the original station of the same code
    candidates_matching = candidates.loc[(distances_from_orig < 1.0)]

    new_codes = [(n, s) for (n, s), _ in candidates_matching.groupby(['net', 'sta'])]
    if not new_codes:
        # zip(*[]) yields nothing to unpack, so return the empty result explicitly.
        return (), ()
    new_nets, new_stas = zip(*new_codes)
    return new_nets, new_stas

In [None]:
# getNetworkMean(df_picks, '7B')

In [None]:
#---
# TARGET_NET = 'AU'
# STN_LIST = ['KDU']
#---
# TARGET_NET = 'AU'
# STN_LIST = ['WR0', 'WR1', 'WR2', 'WR3', 'WR4', 'WR5', 'WR6', 'WR7','WR8', 'WR9', 'WR10']
#---
# TARGET_NET = '7X'
# STN_LIST = ['MA01', 'MA33', 'MA41', 'MA42', 'MA43', 'MA44', 'MA51', 'MA62', 'MIL7']
#---
# TARGET_NET = '7D'
# STN_LIST = getNetworkStations(df_picks, TARGET_NET)
# STN_LIST = STN_LIST[0:16] # take a subset
#---
# TARGET_NET = '7G'
# STN_LIST = getNetworkStations(df_picks, TARGET_NET)
# STN_LIST = STN_LIST[0:22] # take a 1/3 subset
#---
# TARGET_NET = '7B'
# STN_LIST = getNetworkStations(df_picks, TARGET_NET)
# STN_LIST = STN_LIST[0:16] # take a subset
#---
TARGET_NET = 'AU'
STN_LIST = getNetworkStations(df_picks, TARGET_NET)
# STN_LIST = STN_LIST[0:16] # take a subset
#---
# TARGET_NET = 'AU'
# STN_LIST = ['ARMA']
#---
TARGET_STNS = {'net': [TARGET_NET]*len(STN_LIST), 'sta': [s for s in STN_LIST]}

getNetworkDateRange(df_picks, TARGET_NET)

In [None]:
# Search for additional codes in non-AU networks and add them to the target stations
new_nets, new_stas = determineAlternateMatchingCodes(df_picks, IRIS_AU_STATIONS_FILE, TARGET_NET)

In [None]:
mask_non_AU = ((df_picks['net'].isin(list(new_nets))) & (df_picks['sta'].isin(list(new_stas))))
min_non_AU_date = df_picks.loc[mask_non_AU, 'originTimestamp'].min()
max_non_AU_date = df_picks.loc[mask_non_AU, 'originTimestamp'].max()
obspy.UTCDateTime(min_non_AU_date), obspy.UTCDateTime(max_non_AU_date)

In [None]:
TARGET_STNS['net'].extend(list(new_nets))
TARGET_STNS['sta'].extend(list(new_stas))
getNetworkDateRange(df_picks, TARGET_NET)

In [None]:
#---
# REF_NET = 'AU'
# REF_STN = 'MTN'
#---
# REF_NET = 'IR'
# REF_STN = 'WRAB'
#---
# REF_NET = 'AU'
# REF_STN = 'QIS' # Doesn't have much BHZ data
#---
# REF_NET = 'AU'
# REF_STN = 'ARMA'
#---
# REF_NET = 'AU'
# REF_STN = 'CMSA'
#---
# REF_NET = 'AU'
# REF_STN = 'QLP'
#---
# REF_NET = 'AU'
# REF_STN = 'QIS' # QIS, CTA, QLP, TOO, WB2, WR0, WR2, HTT, ARMA, CMSA
#---
REF_NET = '7X'
REF_STN = 'MA22' # MA51
#---
# REF_NET = '7D'
# REF_STN = 'DA44' # CZ40
#---
REF = {'net': [REF_NET], 'sta': [REF_STN]}

getStationDateRange(df_picks, REF_NET, REF_STN)

In [None]:
# print(getOverlappingDateRange(df_raw_picks, REF, TARGET_STNS))
# print(getOverlappingDateRange(df_picks, REF, TARGET_STNS))

In [None]:
def display_styled_table(df):
    """
    Display table with blocks of same event ID highlighted.

    Consecutive rows sharing the same '#eventID' form a block; blocks are shaded alternately with
    two background colours, starting with the first colour. The styled table carries an extra
    'lastEventID' column holding the previous row's event ID (the first row holds its own ID).

    The input frame is not modified, and the colouring is computed up front so that rendering the
    returned Styler repeatedly always produces the same colours.

    :param df: Table with a '#eventID' column
    :type df: pandas.DataFrame
    :return: Styler over a copy of df with the block highlighting applied
    :rtype: pandas Styler
    """
    cols = ['#ffffff', '#e0e0ff']
    # Work on a copy so the caller's frame does not gain the helper column
    table = df.copy()
    last_event_id = table['#eventID'].shift(1)
    if len(table) > 0:
        # The first row has no predecessor; treat it as continuing its own block
        last_event_id.iloc[0] = table['#eventID'].iloc[0]
    table['lastEventID'] = last_event_id

    # Running count of event ID changes = zero-based block number of each row
    block_number = (table['lastEventID'] != table['#eventID']).cumsum().values
    row_css = np.array(['background-color: ' + cols[b % len(cols)] for b in block_number], dtype=object)

    def block_highlighter(data):
        # Table-wise styling: every cell of a row gets that row's block colour
        return pd.DataFrame(np.repeat(row_css[:, np.newaxis], data.shape[1], axis=1),
                            index=data.index, columns=data.columns)
    return table.style.apply(block_highlighter, axis=None)

## Filter to teleseismic events

In [None]:
# Column heading for the angular distance (degrees) between event and station
ANG_DIST = 'distance'
mask_tele = (df_picks[ANG_DIST] >= 30.0) & (df_picks[ANG_DIST] <= 90.0)
df_tele = df_picks.loc[mask_tele]
len(df_tele)

## Filter to signal quality metrics

In [None]:
# When True the quality mask also applies to the reference station (with the exception noted below);
# when False every reference station record is kept regardless of pick quality.
APPLY_QUALITY_TO_REF = True

# Minimum thresholds a pick must meet on every quality metric.
min_ref_snr = 10     # alternative: 0
cwt_cutoff = 15      # alternatives: 10, 0
slope_cutoff = 3     # alternatives: 2, 0
nsigma_cutoff = 4    # alternative: 0
# For events from ISC catalogs the quality metrics are zero, so we use event magnitude instead.
min_magnitude = 5.5  # alternatives: 4.0, 0.0

mask_snr = df_tele['snr'].ge(min_ref_snr)
mask_cwt = df_tele['qualityMeasureCWT'].ge(cwt_cutoff)
mask_slope = df_tele['qualityMeasureSlope'].ge(slope_cutoff)
mask_sigma = df_tele['nSigma'].ge(nsigma_cutoff)

# Records whose quality metrics are all exactly zero cannot be judged on quality
quality_columns = ['snr', 'qualityMeasureCWT', 'qualityMeasureSlope', 'nSigma']
mask_zero_quality_stats = df_tele[quality_columns].eq(0).all(axis=1)
mask_origin_mag = df_tele['mag'].ge(min_magnitude)

# Keep a record if it passes every quality threshold, or if it has no quality metrics but its
# event is large enough.
passes_metrics = mask_snr & mask_cwt & mask_slope & mask_sigma
passes_magnitude = mask_zero_quality_stats & mask_origin_mag
quality_mask = passes_metrics | passes_magnitude

# Reference station rows are needed by both modes
mask_ref = df_tele[list(REF)].isin(REF).all(axis=1)
if APPLY_QUALITY_TO_REF:
    # But never apply quality mask to ref stations that have all zero quality stats, as we just can't judge quality
    # and don't want to arbitrarily exclude them.
    quality_mask = (mask_zero_quality_stats & mask_ref) | quality_mask
else:
    # Only apply quality mask stations that are not the reference station, i.e. use all ref station events
    # regardless of pick quality at the ref station. This gives more results, but possibly more noise.
    quality_mask = mask_ref | (~mask_ref & quality_mask)

assert quality_mask.sum() > 100, 'Not enough points left after quality filtering'

In [None]:
df_qual = df_tele[quality_mask]
# df_qual = df_tele
len(df_qual)

## Filter to desired ref and target networks

In [None]:
mask_ref = df_qual[list(REF)].isin(REF).all(axis=1)
mask_targ = df_qual[list(TARGET_STNS)].isin(TARGET_STNS).all(axis=1)
mask = mask_ref | mask_targ
np.any(mask)

In [None]:
df_nets = df_qual.loc[mask]
len(df_nets)

In [None]:
# Filter out events in which REF and TARGET stations are not both present
keep_events = [e for e, d in df_nets.groupby('#eventID') if np.any(d[list(REF)].isin(REF).all(axis=1)) and
               np.any(d[list(TARGET_STNS)].isin(TARGET_STNS).all(axis=1))]
len(keep_events)

In [None]:
event_mask = df_nets['#eventID'].isin(keep_events)
df_nets = df_nets[event_mask]
print(len(df_nets))
assert len(df_nets) > 0, "No events left to analyze!"

In [None]:
# Display first few filtered entries
#display_styled_table(df_nets[df_nets['#eventID'].isin(keep_events[0:5])])

In [None]:
# Dataset at the end of all filtering, under a static name that can be used from here onwards.
# An explicit copy is taken because later cells add columns to ds_final; df_nets is a filtered
# slice, and writing columns onto it triggers pandas' SettingWithCopyWarning.
ds_final = df_nets.copy()

In [None]:
# print(getOverlappingDateRange(ds_final, REF, TARGET_STNS))

## For each event, create column for reference traveltime residual

In [None]:
# Create column for entire table first
ds_final['ttResidualRef'] = np.nan

In [None]:
# For each event, take the reference station's travel-time residual from its most preferred
# available channel and record it against every pick of that event. Events where the reference
# station has several distinct residuals on that channel are collected in ref_duped.
ref_duped = []
# The context manager closes the progress bar even if the loop raises
with tqdm(total=len(ds_final), ascii=True) as pbar:
    for eventid, grp in ds_final.groupby('#eventID'):
        pbar.update(len(grp))
        ref_mask = (grp['net'] == REF['net'][0]) & (grp['sta'] == REF['sta'][0]) # Assumes a single reference station.
        grp_ref = grp[ref_mask]
        if grp_ref.empty:
            continue
        # Choose most favourable channel: the first entry of channel_pref that the reference station has
        available_cha = grp_ref['cha'].values
        cha = next((c for c in channel_pref if c in available_cha), None)
        # We must find a channel
        if cha is None:
            print("WARNING: Channels {} are not amongst allowed channels {}".format(available_cha, channel_pref))
            continue
        tt_ref_series = grp_ref.loc[grp_ref['cha'] == cha, 'ttResidual'].unique()
        if len(tt_ref_series) > 1:
            ref_duped.append(grp_ref)
            # Multiple reference residuals for this event: choose the one with the smallest absolute
            # value (the first such value on a tie). Selecting directly avoids adding a helper
            # column to a slice of the group and sorting it.
            ref_time = tt_ref_series[np.nanargmin(np.abs(tt_ref_series))]
        else:
            ref_time = tt_ref_series[0]
        ds_final.loc[grp.index, 'ttResidualRef'] = ref_time
if ref_duped:
    # Keep a record of the duplicated reference arrivals for later inspection
    ref_duped_all = pd.concat(ref_duped)
    ref_duped_all.to_csv("REF_ARRIVAL_DUPES.txt", sep=' ', index=False)

In [None]:
# Quality check - each event should have only one unique reference tt residual
assert np.all([len(df['ttResidualRef'].unique()) == 1 for e, df in ds_final.groupby('#eventID')])

In [None]:
ds_final['relTtResidual'] = ds_final['ttResidual'] - ds_final['ttResidualRef']

In [None]:
# Re-order columns
ds_final = ds_final[['#eventID', 'originTimestamp', 'mag', 'originLon', 'originLat', 'originDepthKm', 'net', 'sta', 'cha', 'pickTimestamp', 'phase',
                     'stationLon', 'stationLat', 'distance', 'snr', 'ttResidual', 'ttResidualRef', 'relTtResidual',
                     'qualityMeasureCWT', 'qualityMeasureSlope', 'nSigma']]

In [None]:
# Sort data by event ID, then by event origin time within each event ID
ds_final = ds_final.sort_values(['#eventID', 'originTimestamp'])
#display_styled_table(ds_final.iloc[0:50])

In [None]:
ds_time = ds_final.sort_values(['originTimestamp'])
# Remove self-residuals (i.e. residual relative to oneself, which is not useful on the graph)
ds_time = ds_time[ds_time['relTtResidual'] != 0]

# Plotting code

In [None]:
def pandasTimestampToPlottableDatetime(data):
    """
    Convert a series of POSIX timestamps to Python datetime values for plotting.

    Each value is converted with datetime.datetime.utcfromtimestamp (UTC, no tzinfo attached),
    the series is cast to 'datetime64[ms]', and the result is handed back as Python datetimes.

    :param data: POSIX timestamps (seconds)
    :type data: pandas.Series
    :return: The corresponding Python datetime values, in the same order as data
    """
    utc_datetimes = data.transform(datetime.datetime.utcfromtimestamp)
    as_milliseconds = utc_datetimes.astype('datetime64[ms]')
    return as_milliseconds.dt.to_pydatetime()

In [None]:
def add7DMarkerLines():
    """
    Annotate the current plot with vertical dashed lines at four fixed times, plus 'Drift 1' and
    'Drift 2' labels at the first and last of them.

    The times are given as POSIX timestamps and converted to UTC datetimes before drawing, because
    the time axis of the residual plots in this notebook holds datetime values; drawing at the raw
    second counts would place the markers far outside the plotted range.
    The first two lines share one colour of the colour cycle and the last two share the next.
    """
    marker_times = [datetime.datetime.utcfromtimestamp(t) for t in (1.354e9, 1.357e9, 1.362e9, 1.373e9)]
    for i, x in enumerate(marker_times):
        plt.axvline(x, linestyle='--', linewidth=2, alpha=0.5, color='C'+str(i//2))
    plt.text(marker_times[0], 40, 'Drift 1', horizontalalignment='right', fontsize=18, fontstyle='italic', alpha=0.6)
    plt.text(marker_times[-1], 40, 'Drift 2', horizontalalignment='left', fontsize=18, fontstyle='italic', alpha=0.6)

def addEventMarkerLines():
    """
    Annotate the current plot with a labelled vertical line for each entry of significant_events.

    significant_events is read from the notebook namespace; its index holds 'YYYY-MM-DD' day
    strings and its 'name' column the label text. Entries whose day (taken as midnight UTC) falls
    outside the current x-axis limits are skipped. Labels are written vertically, just above the
    bottom of the current y-axis range.
    """
    x_min, x_max = plt.xlim()
    y_min, y_max = plt.ylim()
    # Labels sit 1% of the axis height above the bottom edge
    label_y = y_min + 0.01*(y_max - y_min)
    for date, event in significant_events.iterrows():
        event_time = pytz.utc.localize(datetime.datetime.strptime(date, "%Y-%m-%d"))
        within_axis = not (event_time < matplotlib.dates.num2date(x_min) or
                           event_time >= matplotlib.dates.num2date(x_max))
        if within_axis:
            plt.axvline(event_time, linestyle='--', linewidth=1, color='#00800080')
            plt.text(event_time, label_y, event['name'],
                     horizontalalignment='center', verticalalignment='bottom',
                     fontsize=12, fontstyle='italic', color='#008000c0', rotation=90)

### Plotting for standard rel TT residuals chart

In [None]:
def plotTargetNetworkRelResiduals(df, target, ref, tt_scale=60, snr_scale=(0,60), save_file=False, file_label='', annotator=None):
    """
    Plot the relative travel-time residuals of the target stations against event origin time.

    One composite scatter plot is produced for all target stations together, excluding any rows of
    the reference station. Points are coloured by SNR and sized by magnitude. The plot title and
    on-plot notes also read the notebook-level settings min_ref_snr, cwt_cutoff, slope_cutoff,
    nsigma_cutoff and channel_pref.

    :param df: Table with 'originTimestamp', 'relTtResidual', 'snr', 'mag' and the selector columns
    :param target: Target selector mapping column names to accepted values, e.g. {'net': [...], 'sta': [...]}
    :param ref: Reference selector of the same form; its first 'net' and 'sta' entries name the reference station
    :param tt_scale: Half-height of the y-axis; limits are (-tt_scale, tt_scale)
    :param snr_scale: Colour scale limits for SNR
    :param save_file: If True, save the figure as a PNG under <networks>/<reference code>/
    :param file_label: Text appended to the saved file's name, before the extension
    :param annotator: Optional callable taking no arguments, called after the plot is drawn
    """

    def plotDataset(ds, net_code, ref_code, stn_code='*'):
        # stn_code names the station shown, for the saved file name; '*' stands for all stations.
        # Sort ds rows by SNR, so that the weakest SNR points are drawn first and the high SNR point last,
        # to make sure high SNR point are in the top rendering layer.
        ds = ds.sort_values('snr')
        times = pandasTimestampToPlottableDatetime(ds['originTimestamp'])
        vals = ds[yaxis].values
        qual = ds['snr'].values
        min_mag = 4.0
        mag = ds['mag'].values - min_mag
        ylabel = 'Relative TT residual (sec)'
        # Raw string: the backslashes belong to the mathtext markup and are not Python escapes
        title = r"Network {} TT residual relative to {} (filtering: ref SNR$\geq${}, CWT$\geq${}, slope$\geq${}, $n\sigma\geq{}$)".format(
            net_code, ref_code, str(min_ref_snr), str(cwt_cutoff), str(slope_cutoff), str(nsigma_cutoff))
        if len(vals) > 0:
            plt.figure(figsize=(32,9))
            sc = plt.scatter(times, vals, c=qual, alpha=0.5, cmap='gnuplot_r', s=np.maximum(50*mag, 10), edgecolors=None, linewidths=0)
            time_formatter = matplotlib.dates.DateFormatter("%Y-%m-%d")
            # Format the axes just drawn on; plt.axes() may create a new axes instead
            plt.gca().xaxis.set_major_formatter(time_formatter)
            cb = plt.colorbar(sc, drawedges=False)
            cb.set_label('Signal to noise ratio', fontsize=12)
            plt.grid(color='#808080', linestyle=':', alpha=0.5)
            plt.xlabel(xlabel, fontsize=14)
            plt.ylabel(ylabel, fontsize=14)
            plt.xticks(fontsize=14)
            plt.yticks(fontsize=14)
            plt.xlim(time_range)
            plt.ylim((-tt_scale, tt_scale))
            plt.clim(snr_scale)
            plt.title(title, fontsize=18)
            plt.legend(['Point size = Mag - {}, Color = SNR'.format(min_mag)], fontsize=12, loc=1)
            plt.text(0.01, 0.96, "Channel selection: {}".format(channel_pref), transform=plt.gca().transAxes, fontsize=12)
            plt.text(0.01, 0.92, "Start date: {}".format(str(time_range[0])), transform=plt.gca().transAxes, fontsize=12)
            plt.text(0.01, 0.88, "  End date: {}".format(str(time_range[1])), transform=plt.gca().transAxes, fontsize=12)
            if annotator is not None:
                annotator()
            if save_file:
                # Use this function's own arguments for the output location, not names that happen
                # to exist in the notebook namespace.
                subfolder = os.path.join(net_code, ref_code)
                os.makedirs(subfolder, exist_ok=True)
                plt_file = os.path.join(subfolder, stn_code.replace("*", "ALL") + '_' + ylabel.replace(" ", "").replace(".*", "") + file_label + ".png")
                plt.savefig(plt_file, dpi=150)
    # end plotDataset

    # The time axis always spans the full input, even if the plotted subset covers less
    df_times = pandasTimestampToPlottableDatetime(df['originTimestamp'])
    time_range = (df_times.min(), df_times.max())
    print(" to ".join([t.strftime("%Y-%m-%d %H:%M:%S") for t in time_range]))
    yaxis = 'relTtResidual'
    ref_code = ".".join([ref['net'][0], ref['sta'][0]])
    xlabel = 'Event Origin Timestamp'

    # Remove reference station from target set before producing composite image.
    # The reference station may not be there, but remove it if it is.
    mask_ref = df[list(ref)].isin(ref).all(axis=1)
    mask_targ = df[list(target)].isin(target).all(axis=1)
    df_agg = df[(mask_targ) & (~mask_ref)]
    plotDataset(df_agg, ','.join(np.unique(target['net'])), ref_code)

In [None]:
# display_styled_table(ds_final[0:100])

In [None]:
# plotTargetNetworkRelResiduals(ds_final, TARGET_STNS, REF, save_file=False, plot_aggregate=True)

plotTargetNetworkRelResiduals(ds_final, TARGET_STNS, REF, save_file=False, annotator=addEventMarkerLines)

# plotTargetNetworkRelResiduals(ds_final, TARGET_STNS, REF, save_file=False, plot_aggregate=False, annotator=add7DMarkerLines)

# plotTargetNetworkRelResiduals(ds_final[(ds_final['originTimestamp'] >= 1.35e9) & (ds_final['originTimestamp'] <= 1.38e9)],
#                               TARGET_STNS, REF, save_file=False, plot_aggregate=False, annotator=add7DMarkerLines, file_label='(zoomed)')

In [None]:
# Repeat, focusing on only results with high SNR
# df_high_snr = ds_final[ds_final['snr'] >= 10.0]
# plotTargetNetworkRelResiduals(df_high_snr, TARGET_STNS, REF, file_label='(high_SNR)', timeaxis=True)

In [None]:
# Restrict the dataset used for this plot to rows matching the target stations.
# NOTE(review): this mask does not explicitly remove REF_STN; its rows are excluded only if the
# reference station does not itself match TARGET_STNS.
mask_targ_snr_plot = ds_final[list(TARGET_STNS)].isin(TARGET_STNS).all(axis=1)
ds_snr_plot = ds_final[mask_targ_snr_plot]

plt.figure(figsize=(32,9))

COLOR_BY_DATE=True
if COLOR_BY_DATE:
    # Color by date
    time_series = ds_snr_plot['originTimestamp']
    sc = plt.scatter(ds_snr_plot['relTtResidual'], ds_snr_plot['snr'], c=time_series.values, alpha=0.5, s=30*(ds_snr_plot['mag'] - 4))
    time_formatter = matplotlib.dates.DateFormatter("%Y-%m-%d")
    cb = plt.colorbar(sc)
    cb.set_label("Event Origin Timestamp")
    cb_yticks = cb.ax.get_yticks()
    cb_ytick_labels = [datetime.datetime.utcfromtimestamp(date_tick).strftime("%Y-%m-%d") for date_tick in cb_yticks]
    cb.set_ticks(cb_yticks)
    cb.set_ticklabels(cb_ytick_labels)
    plt.legend(['Point size = Mag - 4.0, Color = Event Timestamp'], fontsize=12)
else:
    # Color by station code
    stations = sorted(list(set(ds_snr_plot['sta'].unique())))
    # print(stations)
    colors = ['C' + str(n%10) for n in range(len(stations))]
    # print(colors)
    cdict = dict(zip(stations, colors))
    cdict[REF_STN] = '#808080ff'
    plt.scatter(ds_snr_plot['relTtResidual'], ds_snr_plot['snr'], c=ds_snr_plot['sta'].apply(lambda x: cdict[x]).values, alpha=0.5, s=20*(ds_snr_plot['mag'] - 4))
    plt.legend(['Point size = Mag - 4.0, Color = station ID'], fontsize=12)

plt.grid(color='#80808080', linestyle=':')
plt.ylim((0,100))
plt.xlim((-55,55))
plt.xlabel("Relative TT residual (sec)")
plt.ylabel('SNR')
plt.title("SNR vs relative TT residual across target network {}".format(TARGET_NET))
plt.savefig('SNR_vs_relTtResidual_' + TARGET_NET + '_' + ".".join([REF_NET, REF_STN]) + '.png', dpi=150)

## Expose re-use of network codes 7D and 7G

In [None]:
df_raw_7D = df_raw_picks.loc[(df_raw_picks['net'] == '7D')]
df_raw_7G = df_raw_picks.loc[(df_raw_picks['net'] == '7G')]

In [None]:
dates_7D = pandasTimestampToPlottableDatetime(df_raw_7D['originTimestamp'])
dates_7G = pandasTimestampToPlottableDatetime(df_raw_7G['originTimestamp'])

In [None]:
dates_7D.sort()
dates_7G.sort()

In [None]:
dates_7D[0]

In [None]:
dates_7G[0]

In [None]:
df_dates_7D = pd.DataFrame(dates_7D, columns=['timestamp'])
df_dates_7G = pd.DataFrame(dates_7G, columns=['timestamp'])

In [None]:
df_dates_7D['year'] = df_dates_7D['timestamp'].apply(lambda x: int(x.year))
df_dates_7G['year'] = df_dates_7G['timestamp'].apply(lambda x: int(x.year))

In [None]:
event_count_7D = {y: len(g) for y, g in df_dates_7D.groupby('year')}
event_count_7G = {y: len(g) for y, g in df_dates_7G.groupby('year')}

In [None]:
event_count_7D

In [None]:
event_count_7G

In [None]:
x_7D, y_7D = (event_count_7D.keys(), event_count_7D.values())
x_7G, y_7G = (event_count_7G.keys(), event_count_7G.values())

In [None]:
date_mask_7D = (df_raw_7D['originTimestamp'] < pd.Timestamp(datetime.datetime(2010, 1, 1)).timestamp())
early_7D_stns = df_raw_7D.loc[date_mask_7D, 'sta'].unique()
late_7D_stns = df_raw_7D.loc[~date_mask_7D, 'sta'].unique()

In [None]:
date_mask_7G = (df_raw_7G['originTimestamp'] < pd.Timestamp(datetime.datetime(2010, 1, 1)).timestamp())
early_7G_stns = df_raw_7G.loc[date_mask_7G, 'sta'].unique()
late_7G_stns = df_raw_7G.loc[~date_mask_7G, 'sta'].unique()

In [None]:
print(early_7D_stns)
print(late_7D_stns)
print(early_7G_stns)
print(late_7G_stns)

In [None]:
# Bar chart of the number of pick records per year for network 7D, annotated with the station
# codes seen before and after the 2010 cut-off.
plt.bar(x_7D, y_7D)
plt.xlabel('Year', fontsize=16)
plt.xticks(list(range(min(x_7D), max(x_7D)+1, 1)), fontsize=14)
plt.ylabel('Pick count', fontsize=16)
plt.yticks(fontsize=14)
# Dotted grid lines: the style must be given by keyword, as the first positional argument of
# plt.grid is the on/off flag.
plt.grid(linestyle=':', color="#a0a0a080")
plt.title('Years of pick records in {} for network {}'.format('ensemble.p.txt', '7D'), fontsize=20)
plt.text(1997, event_count_7D[1997] + 200, str(early_7D_stns), fontsize=12)
plt.text(2013 - 1, event_count_7D[2013] + 200, str(late_7D_stns), horizontalalignment='right', verticalalignment='top', fontsize=12)
plt.savefig('record_years_7D.png', dpi=300)
None

In [None]:
# Bar chart of the number of pick records per year for network 7G, annotated with the station
# codes seen before and after the 2010 cut-off.
plt.bar(x_7G, y_7G)
plt.xlabel('Year', fontsize=16)
plt.xticks(list(range(min(x_7G), max(x_7G)+1, 1)), fontsize=14)
plt.ylabel('Pick count', fontsize=16)
plt.yticks(fontsize=14)
# Dotted grid lines: the style must be given by keyword, as the first positional argument of
# plt.grid is the on/off flag.
plt.grid(linestyle=':', color="#a0a0a080")
plt.title('Years of pick records in {} for network {}'.format('ensemble.p.txt', '7G'), fontsize=20)
plt.text(2000, event_count_7G[2000] + 200, str(early_7G_stns), fontsize=12)
plt.text(2015 - 1, event_count_7G[2015] + 200, str(late_7G_stns), horizontalalignment='right', verticalalignment='top', fontsize=12)
plt.savefig('record_years_7G.png', dpi=300)
None

In [None]:
# # Find extreme event for KDU
# # mask = (ds_final['sta'] == 'KDU') & (ds_final['snr'] > 40) & (ds_final['relTtResidual'] < -25)
# mask = (ds_final['sta'] == 'KDU') & (ds_final['snr'] > 40) & (ds_final['relTtResidual'] < -25) & (ds_final['mag'] > 5.5)
# outlier = ds_final[mask]
# outlier

In [None]:
# event_id = outlier['#eventID'].values[0]
# event_id

In [None]:
# df_event = ds_final[ds_final['#eventID'] == event_id]
# display_styled_table(df_event)

In [None]:
# pd.options.display.float_format = '{:.1f}'.format
# print(df_event[['#eventID', 'originTimestamp', 'mag', 'originLon', 'originLat', 'originDepthKm', 'net', 'sta', 'cha', 
#                 'pickTimestamp', 'phase', 'stationLon', 'stationLat', 'distance', 'ttResidual', 'relTtResidual', 'snr']])

In [None]:
# obspy.UTCDateTime(1299832266.4)

In [None]:
# getNetworkMean(df_picks, '7D')

In [None]:
df_raw_picks.loc[(df_raw_picks['mag'] > 0), 'mag'].min()

In [None]:
# Earliest event origin timestamp among the filtered picks of station AU.KDU.
# The mask is built from df_picks itself: df_picks was re-indexed after filtering, so a mask built
# from df_raw_picks would not line up with its rows.
df_picks.loc[(df_picks['net'] == 'AU') & (df_picks['sta'] == 'KDU'), 'originTimestamp'].min()

In [None]:
float(obspy.UTCDateTime('2012-07-01'))

In [None]:
obspy.UTCDateTime(773377184.866)

In [None]:
sorted(list(df_raw_picks.loc[(df_raw_picks['net'] == '7D') & (df_raw_picks['originTimestamp'] < 1341100800.0)]['sta'].unique()))