### Data Ingestion

We will gather Infrasound signals from Mount Merapi through from I06AU, I04AU, I52GB, I07AU, I39PW International Monitoring System (IMS) Infrasound Arrays through IRIS Data Services. The starttime and enddtime can be seen in the table below:

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import pytz
import datetime

from obspy import UTCDateTime
from obspy.clients.fdsn import Client
from datetime import datetime
from obspy.clients.earthworm import Client


plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = 12, 8

In [2]:
# Load csv results from server into a Pandas DataFrame
server = 'https://webservices.volcano.si.edu/geoserver/GVP-VOTW/wms?'
query = 'service=WFS&version=2.0.0&request=GetFeature&typeName=GVP-VOTW:E3WebApp_Eruptions1960&outputFormat=csv'
df = pd.read_csv(server+query)

# drop function which is used in removing or deleting rows or columns from the CSV files
df = df.set_index('Activity_ID')
df = df.drop(columns=['FID', 'LatitudeDecimal', 'LongitudeDecimal', 'GeoLocation'])

# Query a column for a value of interest
df = df.query('VolcanoNumber == 263250')

print(type(df))
df.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0_level_0,VolcanoNumber,VolcanoName,ExplosivityIndexMax,StartDate,StartDateYear,StartDateMonth,StartDateDay,EndDate,EndDateYear,EndDateMonth,EndDateDay,ContinuingEruption
Activity_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
20842,263250,Merapi,3,20131118.0,2013,11.0,18.0,20131118.0,2013.0,11.0,18.0,False
20892,263250,Merapi,3,20140309.0,2014,3.0,9.0,20140420.0,2014.0,4.0,20.0,False
22488,263250,Merapi,1,20110325.0,2011,3.0,25.0,20110908.0,2011.0,9.0,8.0,False
15913,263250,Merapi,3,19610411.0,1961,4.0,11.0,19611128.0,1961.0,11.0,28.0,False
15914,263250,Merapi,2,19670112.0,1967,1.0,12.0,19700702.0,1970.0,7.0,2.0,False


In [3]:
# Define a function to convert start and end dates to UTC datetime format
def convert_to_utc(date_str):
    year = int(date_str[0:4])
    month = int(date_str[4:6])
    day = int(date_str[6:8])
    date = datetime(year, month, day)
    return date.strftime('%Y-%m-%d %H:%M:%S')

# Example usage for the first row in the dataset
start_date = '20180511.0'
end_date = '20200621.0'
start_date_utc = convert_to_utc(start_date)
end_date_utc = convert_to_utc(end_date)
print(start_date_utc, end_date_utc)

2018-05-11 00:00:00 2020-06-21 00:00:00


In [14]:
# Define a function to convert start and end dates to UTC datetime format
def convert_to_utc(date_float):
    date_str = str(int(date_float))
    year = int(date_str[0:4])
    month = int(date_str[4:6])
    day = int(date_str[6:8])
    date = datetime(year, month, day)
    return date.strftime('%Y-%m-%d %H:%M:%S')

# Apply the function to the StartDate and EndDate columns in the DataFrame
df['StartDate'] = df['StartDate'].apply(convert_to_utc)
df['EndDate'] = df['EndDate'].apply(convert_to_utc)

In [15]:
df.to_csv('data/eruption_dates.csv')

OSError: Cannot save file into a non-existent directory: 'data'

In [None]:
eruption_dates = pd.read_csv("data/eruption_dates.csv")

In [None]:
import requests
import pandas as pd
import io

# Set the URL for the availability query
url = "http://service.iris.edu/fdsnws/availability/1/extent"

# Set the query parameters
params = {
    "includerestricted": "true",
    "nodata": "404",
    "network": "IM",
    "starttime": "2011-10-31T00:00:00",
    "endtime": "2599-12-31T23:59:59",
}

# Send the availability query and get the response
response = requests.get(url, params=params)

# Convert the response content to a string and read it into a pandas DataFrame
csv_str = response.content.decode("utf-8")
availability_df = pd.read_csv(io.StringIO(csv_str), delimiter="\s+")

# Save the availability DataFrame to a CSV file
availability_df.to_csv("data_availability.csv", index=False)

# Print a confirmation message
print("Availability data saved to availability.csv")


Availability data saved to availability.csv


In [None]:
from obspy.clients.fdsn import Client
from obspy import UTCDateTime
import pandas as pd

# Define the stations
stations = ["I06H1", "I04H1", "I52H1", "I07H1", "I39H1"]

# Load eruption dates
eruption_dates = pd.read_csv("data/eruption_dates.csv")

# Load data availability for each station
data_availability = pd.read_csv("data/data_availability.csv")

# Loop through each eruption date and each station
for _, Activity_ID in eruption_dates.iterrows():
    for station in stations:
        # Define start and end times for the data request
        start_time = UTCDateTime(Activity_ID["StartDate"])
        end_time = UTCDateTime(Activity_ID["EndDate"])
        
        # Get the availability for the current station
        station_availability = data_availability[data_availability["Station"] == station]
        
        # Check if data is available for the entire time period of the eruption
        if station_availability.empty:
            print(f"No data available for {station} during {eruption_dates['StartDate']} to {eruption_dates['EndDate']}")
            continue
        
        available_start = UTCDateTime(station_availability["Earliest"].iloc[0])
        available_end = UTCDateTime(station_availability["Latest"].iloc[0])
        
        if available_start > start_time:
            # Adjust start time if necessary
            print(f"Adjusting start time for {station} from {start_time} to {available_start}")
            start_time = available_start
        if available_end < end_time:
            # Adjust end time if necessary
            print(f"Adjusting end time for {station} from {end_time} to {available_end}")
            end_time = available_end
        
        # Use ObsPy to request the data for the specified station and time range
        client = Client("IRIS")
        st = client.get_waveforms("IM", station[:2], "*", start_time, end_time)
        
        # Do any necessary processing or analysis on the data here
        
        # Save the data to file or output to the console
        filename = f"{eruption['volcano']}_eruption_{eruption['StartDate']}_{station}.sac"
        st.write(filename, format="SAC")
        print(f"Data saved to {filename}")

Adjusting end time for I06H1 from 2020-06-21T00:00:00.000000Z to 2019-09-02T09:02:59.950000Z


FDSNNoDataException: No data available for request.
HTTP Status code: 204
Detailed response of server:



In [None]:
from obspy.clients.fdsn import Client
from obspy import UTCDateTime
import pandas as pd

# Define the client and the stations
client = Client("IRIS")
network = "IM"
stations = ["I06H1", "I04H1", "I52H1", "I07H1", "I39H1"]

# Load eruption dates
eruption_dates = pd.read_csv("data/eruption_dates.csv")

# Loop through each eruption date and each station
for _, eruption in eruption_dates.iterrows():
    for station in stations:
        # Define start and end times for the data request
        start_time = UTCDateTime(eruption["StartDate"])
        end_time = UTCDateTime(eruption["EndDate"])

        # Use ObsPy to request the data availability for the station
        availability = client.get_availability(network=network, station=station, starttime=start_time, endtime=end_time)

        # Check if data is available for the entire time period of the eruption
        if not availability:
            print(f"No data available for {eruption['volcano']} eruption {eruption['StartDate']} to {eruption['EndDate']} at station {station}")
            continue

        available_start = UTCDateTime(availability[0]["start"])
        available_end = UTCDateTime(availability[0]["end"])

        if available_start > start_time:
            # Adjust start time if necessary
            print(f"Adjusting start time for {station} from {start_time} to {available_start}")
            start_time = available_start
        if available_end < end_time:
            # Adjust end time if necessary
            print(f"Adjusting end time for {station} from {end_time} to {available_end}")
            end_time = available_end

        # Use ObsPy to request the data for the specified station and time range
        st = client.get_waveforms(network, station, "*", "*", start_time, end_time)

        # Do any necessary processing or analysis on the data here

        # Save the data to file or output to the console
        filename = f"{eruption['volcano']}_eruption_{eruption['StartDate']}_{station}.sac"
        st.write(filename, format="SAC")
        print(f"Data saved to {filename}")


AttributeError: 'Client' object has no attribute 'get_availability'

In [None]:
from obspy.clients.fdsn import Client
import pandas as pd

client = Client("IRIS")
network = "IM"
stations = ["I06H*", "I04H*", "I52H*", "I07H*", "I39H*"]

# Get availability information for each station
availability_list = []
for station in stations:
    availability = client.get_availability(network=network, station=station, channel="*", level="response")
    availability_list.extend(availability)

# Convert availability list to a DataFrame
availability_df = pd.DataFrame(availability_list)

# Save the availability information to a CSV file
availability_df.to_csv("availability.csv", index=False)


AttributeError: 'Client' object has no attribute 'get_availability'

In [None]:
from obspy import UTCDateTime
from obspy.clients.earthworm import Client

# Define the network and station codes to query
network = "IM"
station_codes = ["I06H", "I04H", "I52H", "I07H", "I39H"]

# Define the start and end times of the data availability query
start_time = UTCDateTime("2000-01-01")
end_time = UTCDateTime("2023-03-13")

# Create an empty list to hold the earliest and latest data availability for each station
availability = []

# Loop over the station codes and get the earliest and latest data availability for each station
for station_code in station_codes:
    # Create an Earthworm client for the station
    client = Client("rtserve.iris.washington.edu", port=18000, timeout=10)
    
    # Get the earliest and latest data availability for the station
    earliest, latest = client.get_availability(network=network, station=station_code, starttime=start_time, endtime=end_time)
    
    # Add the station code, earliest availability, and latest availability to the list
    availability.append((station_code, earliest, latest))

# Create a Pandas DataFrame from the availability list and save it to a CSV file
df = pd.DataFrame(availability, columns=["station", "earliest", "latest"])
df.to_csv("data/availability_summary.csv", index=False)


TypeError: Client.get_availability() got an unexpected keyword argument 'starttime'