In [1]:
#general libraries
import os
import sys
import glob
import pathlib
import warnings
import timeit
import math
from tqdm import tqdm
#regular expressions
import re
#database libraries
from sqlite3 import connect
#arithmetic libraries
import numpy as np
import scipy
#statistics libraries
import pandas as pd
#ground motion models
import pygmm
#ipython
from IPython.display import display, clear_output

#gravitational acceleration (presumably in m/s^2 -- TODO confirm); used later to normalize the raw SCEC IM values
grav = 9.81

#project directories
#  dir_out:            location of the ground-motion database and the df_gmm_<GMM>.pkl files
#  dir_scec_extracted: per-site data directory (not referenced by the checks in this notebook)
#  raw_data_dir:       location of the raw SCEC study_22_12_<period>sec.sqlite files (input, not output)
dir_out = '/resnick/groups/enceladus/glavrent/Scalable_GPs/Data/preprocessing_jz/'
dir_scec_extracted = '/resnick/groups/enceladus/glavrent/Scalable_GPs/Data/preprocessing_jz/data_perSite/'
raw_data_dir = '/resnick/groups/enceladus/glavrent/Scalable_GPs/Raw_files/scec/'
dir_fig = dir_out + 'figures/'
#ground motion database (file name, relative to dir_out)
fn_gm_db = 'gm_db.sqlite'

#ground motion models to verify: label -> pygmm model class (instantiated with a pygmm Scenario)
gmm_dict = {'ASK14':pygmm.AbrahamsonSilvaKamai2014, 'CY14':pygmm.ChiouYoungs2014}

# The ground-motion database is required by every later check. Fail fast with a
# clear error instead of printing a message and continuing, which would only
# surface later as a NameError on df_sta_metadata / db_gm_cnx.
if not os.path.exists(dir_out+fn_gm_db):
    raise FileNotFoundError(f'ERROR: {dir_out+fn_gm_db} does not exist. Please create the database first.')

#create database connection for ground motion data
db_gm_cnx = connect(dir_out+fn_gm_db)
db_gm_cur = db_gm_cnx.cursor()
# load station metadata (one row per station; site_id is used to pick the test site)
query = "SELECT * FROM metadata_sta;"
df_sta_metadata = pd.read_sql_query(query, db_gm_cnx)
print('Loaded station metadata from database.')

# Spectral periods covered by this batch of checks
per2process = [2.0,2.2]     #batch 1

print(df_sta_metadata.shape)

# Scenario metadata: read the CSV, drop the excluded sources, and key the
# table by scenario id so it can be joined on 'scen_id' later.
fn_metadata = '/resnick/groups/enceladus/glavrent/Scalable_GPs/Data/preprocessing/gm_metadata.csv'
df_metadata = (
    pd.read_csv(fn_metadata)
    .loc[lambda df: ~df.source_id.isin([1, 13, 41, 188]), :]   #filter bad sources
    .reset_index(drop=True)
    .set_index('scen_id')
)
print(df_metadata.shape)

Loaded station metadata from database.
(335, 11)
(2550245, 34)


In [2]:
# Pick which station and which period the checks below are run for.
site_to_test = df_sta_metadata['site_id'].to_numpy()[0]  # station position: 0 to 334
per_to_test = per2process[0]  # period position: 0 to 1

In [3]:
## Test if the median is correctly calculated
# Load the pre-computed GMM median tables, one per ground motion model
df_gmm_data = {}
for gmm_key in tqdm(gmm_dict, total = len(gmm_dict), desc='Loading GMM data'):
    df_gmm_data_i = pd.read_pickle(dir_out + f'df_gmm_{gmm_key}.pkl')
    df_gmm_data[gmm_key] = df_gmm_data_i.reset_index()

# Quantities that do not depend on the GMM are computed once, outside the loop
df_gm = df_metadata[df_metadata.site_id == site_to_test]   # scenarios recorded at the test site
col_gmm = f'gmm_T{per_to_test:.2f}s'                        # stored median column for the test period
gmm_all_pass = True

# Recompute every median with pygmm and compare it with the stored value
for gmm_key in df_gmm_data:
    df_gmm_data_i = df_gmm_data[gmm_key][df_gmm_data[gmm_key].site_id == site_to_test]
    df_gmm_data_i = df_gmm_data_i.set_index(['source_id', 'rupture_id'])
    if len(df_gm) != len(df_gmm_data_i):
        print(f'ERROR: GMM data for {gmm_key} does not match the metadata for site {site_to_test}.')
        gmm_all_pass = False
        break
    # one dict per scenario row; keeps each column's own dtype
    for gm in tqdm(df_gm.to_dict('records'), total=len(df_gm), desc=f'Checking GMM data for {gmm_key}'):
        # NOTE(review): mechanism is fixed to 'SS' here -- confirm this matches how the stored medians were generated
        s = pygmm.model.Scenario(mag=gm['mag'], dip=gm['dip'], mechanism='SS',
                                depth_tor=gm['z_tor'],  width=gm['width'],
                                dist_rup=gm['Rrup'], dist_jb=gm['Rjb'],
                                dist_x=gm['Rx'], dist_y0=gm['Ry0'],
                                v_s30=gm['vs30_scec'], depth_1_0=gm['z1.0'])
        true_median = gmm_dict[gmm_key](s).interp_spec_accels([per_to_test])[0]
        src_id = gm['source_id']
        rup_id = gm['rupture_id']
        result_value = df_gmm_data_i.loc[(src_id, rup_id), col_gmm]
        if not np.isclose(true_median, result_value, rtol=1e-5, atol=1e-5):
            print(f'ERROR: GMM data for {gmm_key} does not match the metadata for site {site_to_test} at period {per_to_test}.')
            print(f'True median: {true_median}, Calculated median: {result_value}')
            gmm_all_pass = False
            # stop at the first mismatch for this GMM; the next GMM is still checked
            break

# Explicit confirmation so a clean run is distinguishable from a skipped one
if gmm_all_pass:
    print(f'GMM medians match for site {site_to_test} at period {per_to_test}.')


Loading GMM data:   0%|          | 0/2 [00:00<?, ?it/s]

Loading GMM data: 100%|██████████| 2/2 [00:02<00:00,  1.48s/it]
Checking GMM data for ASK14: 100%|██████████| 7581/7581 [00:07<00:00, 953.26it/s]
Checking GMM data for CY14: 100%|██████████| 7581/7581 [00:04<00:00, 1878.41it/s]


In [4]:
# free the in-memory GMM median tables; the residual check re-reads them from the pickle files
del df_gmm_data

In [5]:
## Test if the scec data is correctly extracted
# load raw SCEC data
fn_scec = f'study_22_12_{per_to_test:.1f}sec.sqlite'
fn_scec_path = os.path.join(raw_data_dir, fn_scec)
# sqlite3.connect creates an empty database when the file is missing, which
# would surface as a confusing "no such table" error -- check explicitly.
if not os.path.isfile(fn_scec_path):
    raise FileNotFoundError(f'ERROR: {fn_scec_path} does not exist.')
db_scec_raw_cnx = connect(fn_scec_path)
# values are bound as parameters rather than interpolated into the SQL text
query = "SELECT * FROM IM_Data WHERE Site_Name = ?;"
print(f"Loading SCEC raw data for site {site_to_test} and period {per_to_test}...")
try:
    df_scec = pd.read_sql_query(query, db_scec_raw_cnx, params=(str(site_to_test),))
finally:
    # always release the raw database, even if the query fails
    db_scec_raw_cnx.close()
# drop the same sources that are excluded from the scenario metadata
df_scec = df_scec.loc[~np.isin(df_scec.Source_ID, [1, 13, 41, 188]),:]

# load processed data
# (the table name cannot be bound as a parameter, so only the period is)
query = f"SELECT * FROM data_sa_scec_{site_to_test} WHERE period = ?;"
print(f"Loading processed data for site {site_to_test} and period {per_to_test}...")
df_scec_processed = pd.read_sql_query(query, db_gm_cnx, params=(float(per_to_test),))

# both tables should hold one row per (source, rupture, rupture variation)
if len(df_scec) != len(df_scec_processed):
    print(f'ERROR: SCEC processed data for site {site_to_test} does not match the raw data.')

# index the raw table by its identifying triple so it can be joined on the index
df_scec = df_scec.set_index(['Source_ID', 'Rupture_ID', 'Rup_Var_ID'])

Loading SCEC raw data for site ACTN and period 2.0...
Loading processed data for site ACTN and period 2.0...


In [None]:
# Compare the processed SCEC values against the raw ones, row by row.
# Attach source/rupture ids to the processed rows via the scenario id. The guard
# keeps the cell idempotent: merging a second time would create suffixed
# duplicate columns and break the join below.
if not {'source_id', 'rupture_id'}.issubset(df_scec_processed.columns):
    df_scec_processed = df_scec_processed.merge(df_metadata[['source_id', 'rupture_id']],
                                                left_on='scen_id', right_index=True, how='left')
# inner join on (source, rupture, rupture variation); df_scec is indexed by that triple
df_compare = df_scec_processed.merge(df_scec, left_on=['source_id', 'rupture_id', 'rup_var_id'],
                                      right_index=True, suffixes=('_processed', '_raw'))
# An inner join silently drops processed rows with no raw counterpart, so the
# row count is checked too; otherwise a partial match could pass.
n_unmatched = len(df_scec_processed) - len(df_compare)
if n_unmatched != 0:
    print(f"Discrepancy found: {n_unmatched} processed rows have no unique match in the raw SCEC data.")
# raw IM_Value is rescaled by 0.01 / grav before comparison
# (presumably cm/s^2 -> g; TODO confirm the raw units)
elif not np.isclose(df_compare['IM_Value'] *0.01 / grav, df_compare['psa']).all():
    print("Discrepancy found between processed and raw SCEC data.")
else:
    print(f"Processed SCEC data matches the raw data for site {site_to_test} and period {per_to_test}.")

In [7]:
# drop the raw SCEC table; the remaining checks only use df_scec_processed
del df_scec

In [8]:
# Test if the res + median is scec
# For each GMM, pair the stored residuals of the test site/period with the
# matching stored GMM median so the two can be summed in the next step.
df_res_data = {}
for gmm_key in tqdm(gmm_dict, total = len(gmm_dict), desc='Loading RES data'):
    # stored medians for this GMM, with the index moved back into columns
    df_gmm_data_i = pd.read_pickle(dir_out + f'df_gmm_{gmm_key}.pkl').reset_index()
    # stored residuals for this GMM at the test site and period
    query = f"SELECT * FROM data_sa_res_{site_to_test} WHERE period = {per_to_test} AND gmm = '{gmm_key}';"
    print(f"Loading processed data for site {site_to_test} and period {per_to_test}...")
    df_res_data_i = pd.read_sql_query(query, db_gm_cnx)
    # keep every residual row and attach the median of its scenario
    median_cols = ['scen_id', f'gmm_T{per_to_test:.2f}s']
    df_res_data[gmm_key] = df_res_data_i.merge(df_gmm_data_i[median_cols], on='scen_id', how='left')

Loading RES data:   0%|          | 0/2 [00:00<?, ?it/s]

Loading processed data for site ACTN and period 2.0...


Loading RES data:  50%|█████     | 1/2 [00:21<00:21, 21.83s/it]

Loading processed data for site ACTN and period 2.0...


Loading RES data: 100%|██████████| 2/2 [00:43<00:00, 21.57s/it]


In [19]:
# Check, for every GMM, that residual + ln(median) equals ln(psa) of the processed SCEC data
all_pass = True
for gmm_key in tqdm(gmm_dict, total = len(gmm_dict), desc='Test RES'):
    # attach the processed psa to each residual row (same scenario and rupture variation)
    test_df = df_res_data[gmm_key].merge(df_scec_processed[['scen_id', 'psa', 'rup_var_id']],
                                         on=['scen_id', 'rup_var_id'], how='left')
    ln_reconstructed = test_df['res'].values + np.log(test_df[f'gmm_T{per_to_test:.2f}s'].values)
    ln_observed = np.log(test_df['psa'])
    matches = np.isclose(ln_reconstructed, ln_observed).all()
    if not matches:
        print(f'ERROR: RES data for {gmm_key} does not match the SCEC data for site {site_to_test} at period {per_to_test}.')
        all_pass = False
if all_pass:
    print(f'All tests passed for site {site_to_test} at period {per_to_test}.')

Test RES: 100%|██████████| 2/2 [00:00<00:00, 14.63it/s]

All tests passed for site ACTN at period 2.0.



