# Convert dvv traces into the csv file
2023.04.23 Kurama Okubo

This notebook gathers the dv/v data into a csv file, which is used to plot the master figures associated with the dv/v time history.

**NOTE:** The csv is written with pandas in a multi-level column format. When reading it back, specify the header rows and the index column, e.g. `pd.read_csv("../plotcsv_masterdata/dvvdata_all_stretching.csv", header=[0, 1, 2, 3], index_col=0)`.

In [18]:
import datetime
import os
import shutil
import time

import h5py
import matplotlib
import matplotlib as mpl
import matplotlib.dates as dates
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

%matplotlib inline

# Change the process time zone to GMT to avoid confusion in the unix_tvec
# conversion: naive datetimes are converted to unix time stamps using the
# local time zone of the process.
os.environ['TZ'] = 'GMT'
# Assigning TZ alone is not guaranteed to affect the time conversion rules;
# time.tzset() makes the C library re-read the variable (available on Unix only).
if hasattr(time, "tzset"):
    time.tzset()

# Global figure style: font settings first, then identical tick settings
# for the x and y axes (inward ticks, visible minor ticks).
plt.rcParams.update({"font.family": 'Arial', "font.size": 12})
# plt.rcParams["font.sans-serif"] = "DejaVu Sans, Arial, Helvetica, Lucida Grande, Verdana, Geneva, Lucid, Avant Garde, sans-serif"

plt.rcParams.update({
    f"{axis}.{option}": setting
    for axis in ("xtick", "ytick")
    for option, setting in (
        ("direction", "in"),
        ("major.size", 5),
        ("major.width", 0.5),
        ("minor.size", 2),
        ("minor.width", 1),
        ("minor.visible", True),
    )
})

In [19]:
# Input csv files with the dv/v monitoring statistics, one per measurement
# method. The list order defines the method id: 0 -> stretching, 1 -> mwcs.
root_csv = "../data"
csv_stats_list = [
    f"{root_csv}/monitoring_stats_uwbackup_2010-2022_{method}.csv"
    for method in ("stretching", "mwcs")
]

In [20]:
# The channel-stacked h5 data is loaded only to validate meta data such as
# the time vector. The list order matches csv_stats_list.
# NOTE: root_h5 ends with "/" and each file name is appended with a leading
# "/", so the resulting paths contain "//".
root_h5 = "../processed_data/"
h5_stats_list = [
    f"{root_h5}/dvvtraces_chanweighted_monitoring_stats_uwbackup_2010-2022_{method}.csv_0.9-1.2.h5"
    for method in ("stretching", "mwcs")
]


In [21]:
#---set the file path of your case study list---#
# Index into the csv/h5 file lists: 0 -> stretching, 1 -> mwcs
csv_stats_id = 1

# Time span covered by the uniform time vector
starttime = datetime.datetime(year=2002, month=1, day=1)
endtime = datetime.datetime(year=2022, month=6, day=1)

cc_time_unit = 24 * 60 * 60  # short-stacking time unit (86400; divided by 86400 below to obtain days)
averagestack_factor = 30  # length of time bin to compute mean and std
averagestack_step = 15  # spacing between consecutive time bins

output_datadir = "../plotcsv_masterdata"
#-------------------------------------------#

# Create the output directory (including missing parents). exist_ok=True makes
# the call a no-op when the directory is already there and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(output_datadir, exist_ok=True)

In [22]:
# Select the input csv and take the method name from its file name: the last
# "_"-separated token of the part preceding ".csv".
csv_stats = csv_stats_list[csv_stats_id]
dvvmethod = os.path.basename(csv_stats).partition(".csv")[0].rpartition("_")[2]

In [23]:
# Build the uniform time vector: the first point lies half an averaging
# window after starttime, and the following points are spaced by the
# averaging step, up to endtime.
st_center = (averagestack_factor*cc_time_unit/86400)/2  # half window length [day]
date_range_st = starttime + datetime.timedelta(days=st_center)
datefreq = f"{int(averagestack_step*cc_time_unit/86400)}D"
uniformdates = pd.date_range(date_range_st, endtime, freq=datefreq)
uniform_tvec_date = uniformdates.date
# Convert from date to datetime (midnight of each date)
uniform_tvec = [datetime.datetime.combine(day, datetime.time.min) for day in uniform_tvec_date]

In [24]:
# Unix time stamps [s] of the uniform time vector. The datetimes are naive,
# so they are interpreted in the local time zone of the process.
uniform_timestamp = np.array([tpoint.timestamp() for tpoint in uniform_tvec])

In [25]:
# Check that the uniform time vector built in this notebook is identical to
# the one stored in the .h5 data.
h5_id = csv_stats_id # use same id as csv stats.
# Open the file read-only inside a context manager so that the handle is
# closed as soon as the time vector has been copied into memory.
with h5py.File(h5_stats_list[h5_id], "r") as fi:
    uniform_tvec_h5 = np.array(fi['uniform_tvec'])
assert (uniform_timestamp == uniform_tvec_h5).all(), \
    "uniform time vector differs from 'uniform_tvec' stored in the h5 file"

In [26]:
# Load the selected monitoring stats. Every column is read as string (the
# numeric conversion is done explicitly later) and lines starting with '#'
# are skipped as comments.
df_origin = pd.read_csv(
    csv_stats,
    comment='#',
    dtype=str,
)


In [27]:
# Preview the first five rows of the raw table
df_origin.head(n=5)

Unnamed: 0,date,stationpair,networks,components,freqband,dvv_mwcs,dvv0_mwcs,dvv_err_mwcs,dvv0_err_mwcs
0,2002-05-16T00:00:00.0,BP.CCRB-BP.CCRB,BP-BP,11,0.2-0.5,-0.0042021411117799,-0.0042021415930146,1.6032154558632516e-05,1.6032122251148442e-05
1,2002-05-16T00:00:00.0,BP.CCRB-BP.CCRB,BP-BP,11,0.5-0.9,0.0004750020021774,0.0004782995489394,3.0225200979495056e-07,3.015015493023752e-07
2,2002-05-16T00:00:00.0,BP.CCRB-BP.CCRB,BP-BP,11,0.9-1.2,-0.0002544209653618,-0.0004092988735225,1.1636108854185346e-07,9.66428648911806e-08
3,2002-05-16T00:00:00.0,BP.CCRB-BP.CCRB,BP-BP,11,1.2-2.0,0.0006919832045516,0.0005558658183633,5.89138998774463e-06,5.8684596503249385e-06
4,2002-05-31T00:00:00.0,BP.CCRB-BP.CCRB,BP-BP,11,0.2-0.5,-3.606589072153216e-05,-3.2780282152109065e-05,3.1650222320287525e-06,3.1606102277351435e-06


In [28]:
# Scan the frequency bands found in the table. np.unique returns them sorted;
# the frequency id corresponds to the index of the band in this array.
freqbands = np.unique(df_origin["freqband"])
freqbands

array(['0.2-0.5', '0.5-0.9', '0.9-1.2', '1.2-2.0'], dtype=object)

In [29]:
# Convert the string columns read from the csv into numeric columns:
#   t      : unix time stamp [s] of the 'date' column
#   dvv    : dv/v value
#   err    : error of dv/v
#   cc_dvv : correlation coefficient associated with dv/v (NaN for mwcs)

# An explicit cast to int64 nanoseconds is used instead of Series.view(int):
# `view` is deprecated in recent pandas and `int` has a platform-dependent width.
df_origin['t'] = (
    pd.to_datetime(df_origin['date'], format="%Y-%m-%dT%H:%M:%S.%f")
    .astype("datetime64[ns]")
    .astype("int64") / 1e9
)

if dvvmethod == "stretching":
    df_origin['cc_dvv'] = df_origin['cc_ts'].astype(float)
    df_origin['dvv'] = df_origin['dvv_ts'].astype(float)
    df_origin['err'] = df_origin['err_ts'].astype(float)

elif dvvmethod == "mwcs":
    # We chose dvv_mwcs here, but you can also use dvv0_mwcs, which imposes crossing at zero.
    # The sign is flipped and the unit is converted to [%].
    # NOTE(review): the sign flip presumably accounts for mwcs returning dt/t = -dv/v -- confirm.
    df_origin['dvv'] = (-1) * 100 * df_origin['dvv_mwcs'].astype(float)
    df_origin['err'] = 100 * df_origin['dvv_err_mwcs'].astype(float)
    # No correlation coefficient is read for mwcs: fill the column with NaN.
    # Direct assignment is used because an in-place replace on a column selected
    # from the frame does not update the frame under pandas Copy-on-Write.
    df_origin['cc_dvv'] = np.nan

# elif dvvmethod=="codaQ":
#     df_origin['cc_dvv'] = df_origin['cc_dvv'].astype(float)
#     df_origin['dvv'] = df_origin['dvv'].astype(float)

else:
    # Fail here with a clear message rather than later with a missing-column error.
    raise ValueError(f"unknown dvvmethod: {dvvmethod!r} (expected 'stretching' or 'mwcs')")

In [30]:
# Sanity check: convert the unix time stamps back to datetimes and show the
# first one, which should agree with the first entry of the 'date' column.
pd.to_datetime(df_origin["t"], unit='s').iat[0]

Timestamp('2002-05-16 00:00:00')

In [31]:
# Preview the first five rows, now including the numeric columns
df_origin.head(n=5)

Unnamed: 0,date,stationpair,networks,components,freqband,dvv_mwcs,dvv0_mwcs,dvv_err_mwcs,dvv0_err_mwcs,t,dvv,err,cc_dvv
0,2002-05-16T00:00:00.0,BP.CCRB-BP.CCRB,BP-BP,11,0.2-0.5,-0.0042021411117799,-0.0042021415930146,1.6032154558632516e-05,1.6032122251148442e-05,1021507000.0,0.420214,0.001603,
1,2002-05-16T00:00:00.0,BP.CCRB-BP.CCRB,BP-BP,11,0.5-0.9,0.0004750020021774,0.0004782995489394,3.0225200979495056e-07,3.015015493023752e-07,1021507000.0,-0.0475,3e-05,
2,2002-05-16T00:00:00.0,BP.CCRB-BP.CCRB,BP-BP,11,0.9-1.2,-0.0002544209653618,-0.0004092988735225,1.1636108854185346e-07,9.66428648911806e-08,1021507000.0,0.025442,1.2e-05,
3,2002-05-16T00:00:00.0,BP.CCRB-BP.CCRB,BP-BP,11,1.2-2.0,0.0006919832045516,0.0005558658183633,5.89138998774463e-06,5.8684596503249385e-06,1021507000.0,-0.069198,0.000589,
4,2002-05-31T00:00:00.0,BP.CCRB-BP.CCRB,BP-BP,11,0.2-0.5,-3.606589072153216e-05,-3.2780282152109065e-05,3.1650222320287525e-06,3.1606102277351435e-06,1022803000.0,0.003607,0.000317,


# Rearrange the csv with time

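To make the dv/v time history easier to process, we reshape (pivot) the dataframe so that it is indexed by the time stamp, with one column per combination of quantity, frequency band, station pair, and component.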

In [32]:
# Wide format: one row per time stamp 't'; the columns are keyed by the
# quantity (dvv, cc_dvv, err), frequency band, station pair and components.
df_pivoted = df_origin.pivot(
    index="t",
    columns=["freqband", "stationpair", "components"],
    values=["dvv", "cc_dvv", "err"],
)

In [33]:
# Write the pivoted table to the master csv file of the selected method
foname = f"{output_datadir}/dvvdata_all_{dvvmethod}.csv"
df_pivoted.to_csv(path_or_buf=foname)

In [34]:
# Example: read the master csv back (four header rows, time stamp as index)
# df = pd.read_csv(foname, header=[0, 1, 2, 3], index_col=0)
