# Archive in ScienceBase


In [2]:
from loguru import logger
from pathlib import Path
import pandas as pd
import geopandas as gpd

from mth5.mth5 import MTH5
from mth5.clients.zen import ZenClient
from mth5 import read_file

from mt_metadata.timeseries import Survey, Station, Run, Electric, Magnetic

from mtpy import MT

from archive.mt_xml import MTSBXML

## Set paths

Set the data directories. Here, all stations from a survey year are stored under one folder.

In [20]:
# Raw time series, one folder per survey year (station folders live inside)
data_directory_2021 = Path(r"c:\Users\jpeacock\OneDrive - DOI\MTData\GZ2021")
data_directory_2022 = Path(r"c:\Users\jpeacock\OneDrive - DOI\MTData\GZ2022")
data_directory_2023 = Path(r"c:\Users\jpeacock\OneDrive - DOI\MTData\GZ2023")

# Calibration file handed to the Zen client
calibration_path = Path(r"c:\Users\jpeacock\OneDrive - DOI\MTData\antenna_20190411.cal")

# Processed transfer functions (EDI files), one folder per survey year
edi_path_2021 = Path(
    r"c:\Users\jpeacock\OneDrive - DOI\MTData\transfer_function_archive\CEC_Geysers_2021"
)
edi_path_2022 = Path(
    r"c:\Users\jpeacock\OneDrive - DOI\MTData\transfer_function_archive\CEC_Geysers_2022"
)
edi_path_2023 = Path(
    r"c:\Users\jpeacock\OneDrive - DOI\MTData\transfer_function_archive\CEC_Geysers_2023"
)

# Lookup keyed by survey year: raw data folder, EDI folder, and the survey
# summary table read from "survey_summary.csv" in the raw data folder.
path_dict = {
    survey_year: {
        "data": data_dir,
        "edi": edi_dir,
        "df": pd.read_csv(data_dir / "survey_summary.csv"),
    }
    for survey_year, data_dir, edi_dir in (
        (2021, data_directory_2021, edi_path_2021),
        (2022, data_directory_2022, edi_path_2022),
        (2023, data_directory_2023, edi_path_2023),
    )
}

# Archive folder sits next to the yearly data folders; create it if missing
archive_path = data_directory_2021.parent / "archive"
archive_path.mkdir(exist_ok=True)

# survey ID
survey_id = "CEC_Geysers"

In [16]:
def change_station_name(name, prefix: str=""):
    """
    Convert a field station name to the archive form ``gz{prefix}{location}``.

    The name is matched case-insensitively after conversion to ``str``, so
    ``"gz306"``, ``"GZ306"``, ``"306"`` and ``306`` are all accepted.

    * 300-series names (``gz3NN`` / ``3NN``) keep the digits that follow the
      leading ``3``, e.g. ``gz306`` -> ``gz{prefix}06``.
    * 200-series names (``gz2NN`` / ``2NN``) are renumbered as
      ``number - 200 + 50``, e.g. ``gz207`` -> ``gz{prefix}57``.

    :param name: original station name, with or without the ``gz`` prefix
    :type name: str or int
    :param prefix: text inserted after ``gz``; callers pass the two-digit
        survey year, defaults to ""
    :type prefix: str
    :return: renamed station, or None if the name is in neither series
    :rtype: str or None

    """
    # normalize once so the prefix tests and the rewriting agree on case
    name = str(name).strip().lower()

    if name.startswith("gz3"):
        return f"gz{prefix}{name[3:]}"
    if name.startswith("3"):
        return f"gz{prefix}{name[1:]}"

    if name.startswith("gz2"):
        digits = name[2:]
    elif name.startswith("2"):
        digits = name
    else:
        # unrecognized naming scheme
        return None

    st_number = int(digits) - 200 + 50
    return f"gz{prefix}{st_number}"

In [17]:
df = pd.read_csv(r"c:\Users\jpeacock\OneDrive - DOI\MTData\archive\cec_survey_summaries.csv")
# Rename every station; the prefix is the last two characters of its survey name
df["station"] = [
    change_station_name(old_name, prefix=f"{survey_name[-2:]}")
    for old_name, survey_name in zip(df["station"], df["survey"])
]
df.to_csv(r"c:\Users\jpeacock\OneDrive - DOI\MTData\archive\cec_survey_summaries_renamed.csv", index=False)


In [30]:
# Swap the short 'GZ' survey code for the archive survey name prefix
df["survey"] = [survey_name.replace('GZ', "CEC_Geysers_") for survey_name in df["survey"]]
df.to_csv(r"c:\Users\jpeacock\OneDrive - DOI\MTData\archive\cec_survey_summaries_renamed.csv", index=False)

In [33]:
# Reload the renamed summary and parse the start/end columns as datetimes
df = pd.read_csv(
    r"c:\Users\jpeacock\OneDrive - DOI\MTData\archive\cec_survey_summaries_renamed.csv"
).assign(
    start=lambda frame: pd.to_datetime(frame.start),
    end=lambda frame: pd.to_datetime(frame.end),
)

## Write shapefile


In [44]:
# Attribute columns written to the shapefile, plus the point geometry
shp_columns = [
    "station",
    "survey",
    "start",
    "end",
    "latitude",
    "longitude",
    "elevation",
    "instrument_id",
    "components",
    "dipole_ex",
    "dipole_ey",
    "hx",
    "hy",
    "geometry",
]

# One point per row from longitude/latitude (crs=4326); missing values
# are replaced with the string "None" before export.
station_points = gpd.points_from_xy(df.longitude, df.latitude)
gdf = gpd.GeoDataFrame(df, geometry=station_points, crs=4326).fillna("None")
gdf[shp_columns].to_file(archive_path / "cec_geysers_mt_stations.shp")

NameError: name 'np' is not defined

## Write metadata XML file

In [26]:
# Start from an existing metadata XML template, then overlay this project's configuration
x = MTSBXML()
x.read_template_xml(r"c:\Users\jpeacock\OneDrive - DOI\MTData\CL2021\archive\clearlake_2022_metadata.xml")
x.update_from_config(r"C:\Users\jpeacock\OneDrive - DOI\MTData\archive\cec_geysers_mt_xml_configuration.cfg")

# Spatial extent of the stations, passed as (max lon, min lon, max lat, min lat)
# NOTE(review): confirm this order matches the signature of MTSBXML.update_bounding_box
lon_max, lon_min = df.longitude.max(), df.longitude.min()
lat_max, lat_min = df.latitude.max(), df.latitude.min()
x.update_bounding_box(lon_max, lon_min, lat_max, lat_min)

# Time span covered by the station summary table
first_start = df.start.min().isoformat()
last_end = df.end.max().isoformat()
x.update_time_period(first_start, last_end)

x.update_shp_attributes(df)
x.update_metadate()
x.save(archive_path / "cec_geysers_repeat_mt_metadata.xml")


In [27]:
# distinct survey names present in the summary table
df["survey"].unique()

array(['GZ2021', 'GZ2022', 'GZ2023'], dtype=object)

## Survey Metadata

In [3]:
# Survey-level metadata shared by every yearly survey group written to the MTH5 files
survey_metadata = Survey()
survey_metadata.acquired_by.author = (
    "Jared Peacock and Mike Mitchell (U.S. Geological Survey)"
)
survey_metadata.citation_dataset.doi = r"https://doi.org/10.5066/P14BJG2A"
survey_metadata.country = "USA"
survey_metadata.datum = "WGS84"
survey_metadata.funding_source.comments = (
    "Project Lead is David Alumbaugh of Lawrence Berkeley National Labs"
)
survey_metadata.funding_source.email = "dlalumbaugh@lbl.gov"
survey_metadata.funding_source.grant_id = "EPC-19-019"
survey_metadata.funding_source.name = "California Energy Commission"
survey_metadata.funding_source.organization = "California Energy Commission"
survey_metadata.funding_source.url = "https://www.energy.ca.gov/"
survey_metadata.geographic_name = "The Geysers, northern California"
survey_metadata.name = "MT Monitoring of the Geysers Geothermal Field"
survey_metadata.project = "MT Monitoring of the Geysers Geothermal Field"
survey_metadata.project_lead.author = "Jared Peacock"
survey_metadata.project_lead.email = "jpeacock@usgs.gov"
survey_metadata.project_lead.organization = "U.S. Geological Survey"
survey_metadata.release_license = "CC-BY-4.0"
# Adjacent string literals are concatenated with no separator, so every
# fragment must carry its own trailing space or newline.
survey_metadata.summary = (
    "The project is funded by the California Energy Commission to monitor The Geysers "
    "Geothermal field with repeat MT surveys and continuous passive seismic between "
    "2021-2023.  The end product is a 4D joint inversion of the seismic and MT data. "
    "Summaries can be found at:\n\t"
    "Peacock, J. R., Alumbaugh, D., Mitchell, M. A., "
    "Hartline, C. (2022) Repeat Magnetotelluric Measurements to Monitor the Geysers "
    "Steam Field in Northern California, Proceedings 47th Workshop on Geothermal "
    "Reservoir Engineering, Stanford, California, "
    "https://pangea.stanford.edu/ERE/db/IGAstandard/record_detail.php?id=35437."
    "\n\t"
    "Peacock, J. R., Alumbaugh, D., Mitchell, M. A., "
    "Hartline, C. (2023) Repeated Magnetotelluric Measurements at the Geysers, California "
    "Proceedings 48th Workshop on Geothermal "
    "Reservoir Engineering, Stanford, California, "
    "https://pangea.stanford.edu/ERE/db/IGAstandard/record_detail.php?id=35652."
    "\n\t"
    "Peacock, J. R., Alumbaugh, D., Mitchell, M. A., "
    "Hartline, C. (2024) Summary of Annual Repeat Magnetotelluric Surveys of the "
    "Geysers Geothermal Field Proceedings 49th Workshop on Geothermal "
    "Reservoir Engineering, Stanford, California, "
    "https://pangea.stanford.edu/ERE/db/IGAstandard/record_detail.php?id=36417."
    "\n\t"
    "Um, E. S., Commer, M., Gritto, R., Peacock, J. R., Alumbaugh, D. L.,  "
    "Jarpe, S. P., and Hartline, C., (2023), Cooperative joint inversion of "
    "magnetotelluric and microseismic data for imaging The Geysers geothermal "
    "field, California, USA, GEOPHYSICS 88: WB45-WB54. "
    "https://doi.org/10.1190/geo2022-0521.1."
)
survey_metadata.northwest_corner.latitude = 38.867
# NOTE(review): -127.717 is about 5 degrees west of the southeast-corner
# longitude (-122.887); this looks like a typo. Confirm against the station
# coordinates before publishing.
survey_metadata.northwest_corner.longitude = -127.717
survey_metadata.southeast_corner.latitude = 38.761
survey_metadata.southeast_corner.longitude = -122.887
survey_metadata.time_period.start = "2021-04-05T18:19:58+00:00"
survey_metadata.time_period.end = "2023-05-08T18:00:00+00:00"

## Station Metadata

In [4]:
# Declination value for each survey year (keyed by year)
declination_dict = {
    2021: 13.46,
    2022: 13.37,
    2023: 13.39,
}

'gz35'

In [7]:
# Station-level metadata applied to every station group on top of the values
# read from the Z3D files.
station_metadata = Station()
station_metadata.acquired_by.author = "Jared Peacock, Mike Mitchell, and David Alumbaugh (LBNL)"
station_metadata.acquired_by.comments = None
station_metadata.acquired_by.organization = "U.S. Geological Survey"
station_metadata.channel_layout = "L"
station_metadata.comments = None
station_metadata.data_type = "BBMT"
station_metadata.location.declination.comments = "from https://ngdc.noaa.gov/geomag/calculators/magcalc.shtml#declination"
station_metadata.location.declination.model = "IGRF"
# NOTE(review): declination_dict holds a per-year value (13.37-13.46), but this
# fixed 12.5 is what the station loops write for every year. Confirm which is intended.
station_metadata.location.declination.value = 12.5
station_metadata.orientation.method = "compass"
station_metadata.orientation.reference_frame = "geomagnetic"
station_metadata.provenance.comments = "Time series converted from Zen format to MTH5"
station_metadata.provenance.software.author = "Jared Peacock"
station_metadata.provenance.software.name = "MTH5"
station_metadata.provenance.software.version = "0.4.9"
station_metadata.provenance.submitter.author = "Jared Peacock"
# full address, matching survey_metadata.project_lead.email
station_metadata.provenance.submitter.email = "jpeacock@usgs.gov"
station_metadata.provenance.submitter.organization = "U.S. Geological Survey"
station_metadata.location.state = "California"

## Run Metadata

In [8]:
# Run-level metadata template; the station loops copy it into every run group
# with run_group.metadata.update(run_metadata).
run_metadata = Run()
# --- data logger firmware / hardware ---
run_metadata.data_logger.firmware.author = "Zonge International"
run_metadata.data_logger.firmware.name = "ZEN"
run_metadata.data_logger.firmware.version = "5357"
run_metadata.data_logger.manufacturer = "Zonge International"
run_metadata.data_logger.model = "ZEN"
# default id only: the station loops overwrite data_logger.id per run with
# the value returned by get_instrument_id
run_metadata.data_logger.id = "ZEN046"
# --- power source ---
run_metadata.data_logger.power_source.comments = "rechargable lithium batteries"
run_metadata.data_logger.power_source.id = None
run_metadata.data_logger.power_source.type = "Li 30 Amp-hr"
run_metadata.data_logger.power_source.voltage.end = 15.3
run_metadata.data_logger.power_source.voltage.start = 17.0
# --- timing ---
run_metadata.data_logger.timing_system.comments = "internal clock updated by GPS timing"
run_metadata.data_logger.timing_system.drift = 0.0
run_metadata.data_logger.timing_system.type = "GPS lock"
run_metadata.data_logger.timing_system.uncertainty = 0.0
run_metadata.data_logger.type = "MT"
# --- who compiled the metadata ---
run_metadata.metadata_by.author = "Jared Peacock"
run_metadata.metadata_by.comments = "Most pulled from Z3D files, the rest from written field notes."
run_metadata.metadata_by.organization = "U.S. Geological Survey"

## Electric Channel Metadata

In [9]:
# Electric-channel metadata template; the station loops apply it to the
# "ex" and "ey" channels with ch_ts.channel_metadata.update(electric_metadata).
electric_metadata = Electric()
# both electrodes of the dipole are described identically
electric_metadata.negative.manufacturer = "Borin"
electric_metadata.negative.model = "Stelth1"
electric_metadata.negative.type = "Ag-AgCl"
electric_metadata.positive.manufacturer = "Borin"
electric_metadata.positive.model = "Stelth1"
electric_metadata.positive.type = "Ag-AgCl"
electric_metadata.type = "electric"
electric_metadata.units = "digital counts"

## Magnetic Channel Metadata
Already updated from Z3D

## Create Station MTH5s

1. Loop over each folder in the directory, make sure that it is a station
2. Save transfer function 
3. Move MTH5 to archive directory


In [10]:
# Names of the sub-folders of the 2021 data directory that start with "gz"
station_list = []
for entry in data_directory_2021.iterdir():
    if not entry.is_dir():
        continue
    if entry.name.startswith("gz"):
        station_list.append(entry.name)
print(station_list)

['gz201', 'gz202', 'gz203', 'gz204', 'gz205', 'gz206', 'gz207', 'gz208', 'gz210', 'gz211', 'gz212', 'gz213', 'gz214', 'gz215', 'gz232', 'gz301', 'gz302', 'gz303', 'gz304', 'gz305', 'gz306', 'gz307', 'gz308', 'gz309', 'gz310', 'gz311', 'gz312', 'gz313', 'gz314', 'gz315', 'gz316', 'gz317', 'gz318', 'gz319', 'gz320', 'gz321', 'gz322', 'gz323', 'gz324', 'gz325', 'gz326', 'gz327', 'gz328', 'gz329', 'gz330', 'gz331', 'gz332', 'gz334', 'gz335', 'gz337', 'gz338', 'gz345', 'gz346', 'gz348', 'gz349', 'gz350']


In [11]:
def get_instrument_id(station, df):
    """
    Get the instrument ID of a station from a survey summary table.

    The summary tables do not name stations consistently, so several
    spellings of ``station`` are tried in order: as given, with a ``gz``
    prefix added, with ``gz`` removed, and with ``gz`` removed and converted
    to an integer. The first spelling found in ``df.station`` is used.

    Parameters
    ----------
    station : str or int
        Station name to look up, e.g. ``"gz306"`` or ``"306"``.
    df : pandas.DataFrame
        Survey summary with ``station`` and ``instrument_id`` columns.

    Returns
    -------
    object or None
        ``instrument_id`` of the first matching row, or None (after logging
        an error) when no spelling of the station is in the table.
    """

    # build the lookup list once instead of once per spelling
    known_stations = df.station.to_list()

    bare_name = str(station).replace("gz", "")
    candidates = [station, f"gz{station}", bare_name]
    try:
        candidates.append(int(bare_name))
    except ValueError:
        # non-numeric names have no integer spelling; fall through to the
        # not-found handling instead of raising
        pass

    for candidate in candidates:
        if candidate in known_stations:
            return df[df.station == candidate].instrument_id.values[0]

    logger.error(f"Could not find station {station} in data frame")
    return None

In [None]:
# Build one MTH5 file per station location. Each file gets one survey group per
# year in which the station folder exists, holding the original runs, a merged
# 1 Hz run ("sr1_0001") and, when the EDI file is found, the transfer function.
survey_id = "CEC_Geysers"
for station in ["gz306"]: #station_list[2:]:
    new_station_base = change_station_name(station)
    mth5_path = archive_path.joinpath(f"{new_station_base}.h5")

    # setup initial client
    zen_client = ZenClient(
        path_dict[2021]["data"],
        [4096, 256],
        save_path=mth5_path.parent,
        mth5_filename=mth5_path.name,
        calibration_path=calibration_path,
    )
    # Bind `year` before the try block so the error message in the handler can
    # always be formatted, even if the failure happens before the year loop.
    year = None
    try:
        with MTH5(**zen_client.h5_kwargs) as m:
            m = m.open_mth5(mth5_path)
            # loop over year
            for year in [2021, 2022, 2023]:
                station_path = path_dict[year]["data"].joinpath(station)

                if station_path.exists():
                    # change name to what the transfer functions are
                    new_station_name = change_station_name(station, str(year)[2:])
                    # change data path in zen client
                    zen_client.collection.file_path = station_path
                    # get run dictionary
                    runs = zen_client.get_run_dict()
                    # create survey group
                    survey_group = m.add_survey(f"{survey_id}_{year}")
                    survey_group.metadata.update(survey_metadata)
                    survey_group.write_metadata()

                    # loop over stations in runs, should only be one station
                    for station_id, station_dict in runs.items():
                        # add group with new station name
                        station_group = survey_group.stations_group.add_station(
                            new_station_name
                        )
                        # update from internal metadata
                        station_group.metadata.update(
                            zen_client.collection.station_metadata_dict[station_id]
                        )
                        # update from external metadata
                        station_group.metadata.update(station_metadata)
                        station_group.metadata.id = new_station_name
                        station_group.write_metadata()

                        # the instrument is the same for every run of this
                        # station and year, so look it up once
                        instrument_id = get_instrument_id(
                            station_id, path_dict[year]["df"]
                        )

                        # loop over runs
                        run_list = []
                        for run_id, run_df in station_dict.items():
                            # add run and update metadata
                            run_group = station_group.add_run(run_id)
                            run_group.metadata.update(run_metadata)
                            run_group.metadata.data_logger.id = instrument_id
                            run_group.write_metadata()

                            # loop over channels
                            for row in run_df.itertuples():
                                ch_ts = read_file(
                                    row.fn,
                                    calibration_fn=row.calibration_fn,
                                )
                                # set measurement azimuth: 0 for x components,
                                # 90 for y components
                                if ch_ts.component in ["ex"]:
                                    ch_ts.channel_metadata.measurement_azimuth = 0
                                elif ch_ts.component in ["ey"]:
                                    ch_ts.channel_metadata.measurement_azimuth = 90
                                elif ch_ts.component in ["hx"]:
                                    ch_ts.channel_metadata.measurement_azimuth = 0
                                elif ch_ts.component in ["hy"]:
                                    ch_ts.channel_metadata.measurement_azimuth = 90
                                # update from external metadata if electric channels
                                if ch_ts.component in ["ex", "ey"]:
                                    ch_ts.channel_metadata.update(electric_metadata)
                                run_group.from_channel_ts(ch_ts)
                            # update run metadata from channel information
                            run_group.update_metadata()
                            run_list.append(run_group.to_runts())

                        # Combine runs and down sample to 1 second.
                        combined_run = run_list[0].merge(run_list[1:], new_sample_rate=1)
                        combined_run.run_metadata.id = "sr1_0001"
                        combined_run_group = station_group.add_run("sr1_0001")
                        combined_run_group.metadata.update(run_metadata)
                        combined_run_group.metadata.data_logger.id = instrument_id
                        combined_run_group.from_runts(combined_run)
                        combined_run_group.update_metadata()
                        station_group.update_metadata()
                    survey_group.update_metadata()

                    logger.info(f"Created {mth5_path}")

                    ### add in transfer function
                    edi_fn = path_dict[year]["edi"].joinpath(
                        f"USGS-GMEG.{year}.{new_station_name}.edi"
                    )
                    if edi_fn.exists():
                        mt_obj = MT()
                        mt_obj.read(edi_fn)
                        mt_obj.survey = f"{survey_id}_{year}"
                        mt_obj.station = new_station_name
                        mt_obj.tf_id = mt_obj.station
                        m.add_transfer_function(mt_obj)
                        logger.info(f"Added TF {mt_obj.station} to MTH5 {mth5_path.name}")
                    else:
                        logger.warning(
                            f"Could not find transfer function for {station}, aka {new_station_name}"
                        )
    except Exception as e:
        # best effort: log the failure and move on to the next station
        logger.error(f"Error with {station} {year}: {e}")

[1m25:04:15T16:20:37 | INFO | line:677 |mth5.mth5 | _initialize_file | Initialized MTH5 0.2.0 file c:\Users\jpeacock\OneDrive - DOI\MTData\archive\gz57.h5 in mode a[0m
[1m25:04:15T16:20:57 | INFO | line:351 |mth5.timeseries.run_ts | _align_channels | Channels do not have a common end, using latest: 2021-04-07T07:09:43.998047000[0m
[1m25:04:15T16:21:08 | INFO | line:351 |mth5.timeseries.run_ts | _align_channels | Channels do not have a common end, using latest: 2021-04-07T13:09:43.998047000[0m
[1m25:04:15T16:21:13 | INFO | line:351 |mth5.timeseries.run_ts | _align_channels | Channels do not have a common end, using latest: 2021-04-07T16:26:10.996094000[0m
[1m25:04:15T16:21:20 | INFO | line:99 |__main__ | <module> | Created c:\Users\jpeacock\OneDrive - DOI\MTData\archive\gz57.h5[0m
[1m25:04:15T16:21:20 | INFO | line:112 |__main__ | <module> | Added TF gz2157 to MTH5 gz57.h5[0m
[1m25:04:15T16:21:28 | INFO | line:351 |mth5.timeseries.run_ts | _align_channels | Channels do not 

In [13]:
def change_station_name_back(name, prefix: str=""):
    """
    Recover the original field station name from an archive name of the form
    gz{prefix}{location} (the inverse of change_station_name).

    Location numbers up to 50 map back to the 300 series (``gz3NN``, digits
    kept as written). Larger numbers map back to the 200 series as
    ``number - 50 + 200``, e.g. ``gz2157`` with prefix ``"21"`` -> ``gz207``.

    :param name: archive station name, e.g. ``gz2157``
    :type name: str
    :param prefix: the prefix used when the name was created (two-digit
        survey year in the callers), defaults to ""
    :type prefix: str
    :return: original field station name (``gz3NN`` or ``gz2NN``)
    :rtype: str
    :raises ValueError: if the text left after removing the prefix is not an
        integer, e.g. when ``name`` does not carry ``prefix``

    """
    # put the leading "3" back, which gives the 300-series spelling
    name = name.replace(f"gz{prefix}", "gz3")
    old_number = int(name.replace("gz3", ""))
    if old_number > 50:
        # Undo "number - 200 + 50". Rebuild numerically so single-digit
        # locations keep their zero padding (57 -> 207, not "27").
        name = f"gz{old_number - 50 + 200}"
    return name

In [14]:
# for h5_fn in archive_path.glob("*.h5"):
#     with MTH5() as m:
#         m = m.open_mth5(h5_fn)
#         for run_row in m.run_summary.itertuples():
#             run_group = m.from_reference(run_row.run_hdf5_reference)
#             year = int(run_row.survey.split("_")[-1])
#             old_station = change_station_name_back(run_row.station, str(year)[-2:])
#             instrument_id = get_instrument_id(old_station, path_dict[year]["df"])
#             if instrument_id is not None:
#                 run_group.metadata.data_logger.id = instrument_id
#                 run_group.write_metadata()

In [17]:
# Add the repeat occupation of a station (recorded in its own folder) to the
# station's existing MTH5 file as additional runs plus a merged 1 Hz run.
station = "gz310"
repeat_station = "gz3102"
# The repeat data live in the 2023 data folder and go into the 2023 survey
# group. Set the year explicitly instead of relying on a `year` variable left
# over from the loop in an earlier cell.
year = 2023
new_station_base = change_station_name(station)
new_station_name = change_station_name(station, str(year)[2:])
mth5_path = archive_path.joinpath(f"{new_station_base}.h5")

# setup initial client
zen_client = ZenClient(
    path_dict[year]["data"].joinpath(repeat_station),
    [4096, 256],
    save_path=mth5_path.parent,
    mth5_filename=mth5_path.name,
    calibration_path=calibration_path,
)

runs = zen_client.get_run_dict()

with MTH5() as m:
    m.open_mth5(mth5_path, "a")

    for station_id, station_dict in runs.items():
        survey_id = "CEC_Geysers_2023"
        survey_group = m.add_survey(survey_id)
        survey_group.metadata.update(survey_metadata)
        survey_group.write_metadata()

        station_group = survey_group.stations_group.add_station(new_station_name)
        # update from internal metadata, then from external metadata; set the
        # id last so neither update can replace it (same order as the main loop)
        station_group.metadata.update(
            zen_client.collection.station_metadata_dict[station_id]
        )
        station_group.metadata.update(station_metadata)
        station_group.metadata.id = new_station_name
        station_group.write_metadata()

        # the instrument is the same for every run of this occupation
        instrument_id = get_instrument_id(station_id, path_dict[year]["df"])

        run_list = []
        for run_id, run_df in station_dict.items():
            # shift the run number by 20 (zero padded to 4 digits); presumably
            # this keeps the repeat runs from clashing with run ids already in
            # the station group
            run_parts = run_id.split("_")
            run_id = f"{run_parts[0]}_{int(run_parts[1])+ 20:04}"
            run_group = station_group.add_run(run_id)
            run_group.metadata.update(run_metadata)
            run_group.metadata.data_logger.id = instrument_id
            run_group.write_metadata()
            for row in run_df.itertuples():
                ch_ts = read_file(
                    row.fn,
                    calibration_fn=row.calibration_fn,
                )
                # set measurement azimuth: 0 for x components, 90 for y components
                if ch_ts.component in ["ex"]:
                    ch_ts.channel_metadata.measurement_azimuth = 0
                elif ch_ts.component in ["ey"]:
                    ch_ts.channel_metadata.measurement_azimuth = 90
                elif ch_ts.component in ["hx"]:
                    ch_ts.channel_metadata.measurement_azimuth = 0
                elif ch_ts.component in ["hy"]:
                    ch_ts.channel_metadata.measurement_azimuth = 90
                # update from external metadata if electric channels
                if ch_ts.component in ["ex", "ey"]:
                    ch_ts.channel_metadata.update(electric_metadata)
                run_group.from_channel_ts(ch_ts)
            run_group.update_metadata()
            run_list.append(run_group.to_runts())

        # Combine runs and down sample to 1 second.
        combined_run = run_list[0].merge(run_list[1:], new_sample_rate=1)
        combined_run.run_metadata.id = "sr1_0002"
        combined_run_group = station_group.add_run(combined_run.run_metadata.id)
        combined_run_group.metadata.update(run_metadata)
        combined_run_group.metadata.data_logger.id = instrument_id
        combined_run_group.from_runts(combined_run)
        combined_run_group.update_metadata()
        station_group.update_metadata()
    survey_group.update_metadata()

    ### add in transfer function
    edi_fn = path_dict[year]["edi"].joinpath(
        f"USGS-GMEG.{year}.{new_station_name}_repeat.edi"
    )
    if edi_fn.exists():
        mt_obj = MT()
        mt_obj.read(edi_fn)
        # NOTE(review): unlike the main loop, mt_obj.survey is not set here;
        # confirm the transfer function ends up under the intended survey.
        mt_obj.station = new_station_name
        # NOTE(review): self-assignment keeps whatever tf_id was read from the
        # EDI file; confirm that is the intended id for the repeat.
        mt_obj.tf_id = mt_obj.tf_id
        m.add_transfer_function(mt_obj)
        logger.info(f"Added TF {mt_obj.station} to MTH5 {mth5_path.name}")


[1m25:04:15T16:32:13 | INFO | line:281 |mth5.groups.survey | add_survey | survey CEC_Geysers_2023 already exists, returning existing group.[0m
[1m25:04:15T16:32:14 | INFO | line:331 |mth5.groups.base | _add_group | StationGroup gz2310 already exists, returning existing group.[0m
[1m25:04:15T16:32:22 | INFO | line:351 |mth5.timeseries.run_ts | _align_channels | Channels do not have a common end, using latest: 2023-05-05T19:09:41.998047000[0m
[1m25:04:15T16:32:36 | INFO | line:351 |mth5.timeseries.run_ts | _align_channels | Channels do not have a common end, using latest: 2023-05-06T01:09:41.998047000[0m
[1m25:04:15T16:32:50 | INFO | line:351 |mth5.timeseries.run_ts | _align_channels | Channels do not have a common end, using latest: 2023-05-06T07:09:41.998047000[0m
[1m25:04:15T16:33:04 | INFO | line:351 |mth5.timeseries.run_ts | _align_channels | Channels do not have a common end, using latest: 2023-05-06T13:09:41.998047000[0m
[1m25:04:15T16:33:18 | INFO | line:88 |__main__