In [5]:
import pandas as pd
import requests
import pdb
from datetime import datetime, timedelta
from xml.etree import ElementTree as ET
from threading import Thread
from concurrent.futures import ThreadPoolExecutor
from time import perf_counter

In [6]:
# idea from https://stackoverflow.com/a/76596818
def process_quakeml(responseContent):
    """Flatten a QuakeML document into one dict per ``<event>`` element.

    Every element nested inside an ``<event>`` contributes one entry whose key
    is the slash-joined chain of namespace-stripped tag names starting at
    ``event`` (e.g. ``"event/origin/time/value"``) and whose value is that
    element's ``text``. Container elements are recorded too; their text is
    whatever the parser reports for them (``None`` or whitespace).

    Parameters
    ----------
    responseContent : bytes or str
        Raw XML, as found in ``response.content``.

    Returns
    -------
    list of dict
        One dict per event, in document order. Events may have different key
        sets because not every event carries every tag. Empty input yields
        an empty list.
    """
    # Wrapper tags that never contribute a path component of their own
    # ("event" is pushed explicitly below when an event opens).
    exclude_tags = ("quakeml", "eventParameters", "event")
    path = []    # tag names of the currently open elements
    rows = []    # list of dicts rather than dict of lists: columns differ per event
    row = None   # dict being filled for the event that is currently open
    parser = ET.XMLPullParser(events=("start", "end"))
    parser.feed(responseContent)

    for event, element in parser.read_events():
        key = element.tag[element.tag.rfind("}")+1:]  # drop the "{namespace}" prefix
        if key == "event":
            if event == "start":
                row = {}
                path.append(key)
            else:  # end of an earthquake event
                rows.append(row)
                path.pop()
        elif key not in exclude_tags:
            if event == "start":
                path.append(key)
            else:
                # Only elements inside an event are recorded (no document metadata) ...
                if "event" in path:
                    row["/".join(path)] = element.text
                # ... but every opened tag must be popped, otherwise metadata that
                # precedes an event would leak into that event's column names.
                path.pop()
    return rows

def extract_quakeml(startTime, endTime):
    """Download earthquakes between two datetimes as QuakeML and return a DataFrame.

    The USGS FDSN event endpoint is queried in ascending time order with a
    page size of 20000. After a full page the query is repeated starting at
    the origin time of the last event received, so consecutive pages overlap
    by at least that event (callers are expected to drop duplicates).

    Parameters
    ----------
    startTime, endTime : datetime
        Bounds of the query window.

    Returns
    -------
    pandas.DataFrame or None
        All events found, one row per event with the columns produced by
        ``process_quakeml``; ``None`` when nothing was found.

    Raises
    ------
    requests.HTTPError
        For 4xx/5xx responses other than the 503 handled below.
    """
    url = "https://earthquake.usgs.gov/fdsnws/event/1/query"
    page_limit = 20000
    dfList = []
    parameters = {"format":"quakeml", "starttime":startTime, "endtime":endTime, "limit":page_limit, "minmagnitude":0, 'orderby':'time-asc', 'eventtype':'earthquake'}

    # The context manager closes the session (and its pooled connections) on every exit path.
    with requests.Session() as session:
        adapter = requests.adapters.HTTPAdapter(pool_maxsize=1000) # check
        session.mount('https://', adapter)

        while parameters["starttime"] <= endTime:
            response = session.get(url, params=parameters)
            if response.status_code == 204:  # no content: nothing (more) in this window
                break
            elif response.status_code == 503:
                # Treated here as "request too large": retry the remaining window in smaller slices.
                print("too much")
                timeList = split_time(parameters["starttime"], endTime, timedelta(days=365/12), 6) # check

                for i in range(len(timeList)-1):
                    parameters["starttime"] = timeList[i]
                    parameters["endtime"] = timeList[i+1]
                    response = session.get(url, params=parameters)
                    response.raise_for_status()  # do not hand an error page to the XML parser
                    rows = process_quakeml(response.content)
                    if rows:
                        dfList.append(pd.DataFrame(rows))
                break # went through time list; end query
            else:
                response.raise_for_status()
                start = perf_counter()
                rows = process_quakeml(response.content)
                end = perf_counter()
                print(f"timer {end-start}")
                if not rows:  # successful response without events: nothing left to page through
                    break
                dfList.append(pd.DataFrame(rows))
                if len(rows) < page_limit: # reached the end of the query
                    break
                # Time of the last (latest) row becomes the start of the next page.
                parameters["starttime"] = datetime.strptime(rows[-1]["event/origin/time/value"], "%Y-%m-%dT%H:%M:%S.%fZ")
            print("process end: %s" % parameters["endtime"])

    if len(dfList) == 0:
        return None
    return pd.concat(dfList, axis=0, ignore_index=True)

In [7]:
def process_json(features):
    """Turn a list of GeoJSON earthquake features into a DataFrame.

    Parameters
    ----------
    features : list of dict
        Each item must provide ``'id'``, ``'geometry'['coordinates']`` with at
        least three entries (stored as longitude, latitude, depth) and a
        ``'properties'`` dict containing every name in ``prop_columns``.

    Returns
    -------
    pandas.DataFrame
        One row per feature. An empty ``features`` list yields an empty frame
        that still has all columns.

    Raises
    ------
    KeyError
        If a feature lacks one of the expected keys.
    """
    # A single list drives both the value lookup and the column labels so the
    # two cannot drift apart (previously 'type' and 'title' were swapped).
    prop_columns = ['mag','place','time','updated','tz','url','detail','felt','cdi',
                    'mmi','alert','status','tsunami','sig','net','code','ids','sources','types','nst','dmin','rms',
                    'gap','magType','title','type']
    columns = ['id','longitude','latitude','depth'] + prop_columns

    rows = []
    for earthquake in features:
        prop = earthquake['properties']
        coor = earthquake['geometry']['coordinates']
        rows.append([earthquake['id'], coor[0], coor[1], coor[2]] + [prop[name] for name in prop_columns])

    return pd.DataFrame(rows, columns=columns)
def extract_json(startTime, endTime):
    """Download earthquakes between two datetimes as GeoJSON and return a DataFrame.

    The USGS FDSN event endpoint is queried in ascending time order with a
    page size of 20000. After a full page the query is repeated starting at
    the time of the last feature received, so consecutive pages overlap by at
    least that feature (callers are expected to drop duplicates).

    Parameters
    ----------
    startTime, endTime : datetime
        Bounds of the query window.

    Returns
    -------
    pandas.DataFrame or None
        All features found, with the columns produced by ``process_json``;
        ``None`` when nothing was found.

    Raises
    ------
    requests.HTTPError
        For 4xx/5xx responses other than the 503 handled below.
    """
    url = "https://earthquake.usgs.gov/fdsnws/event/1/query"
    page_limit = 20000
    epoch = datetime(1970, 1, 1)
    dfList = []
    parameters = {"format":"geojson", "starttime":startTime, "endtime":endTime, "limit":page_limit, "minmagnitude":0, 'orderby':'time-asc', 'eventtype':'earthquake'}

    # The context manager closes the session (and its pooled connections) on every exit path.
    with requests.Session() as session:
        adapter = requests.adapters.HTTPAdapter(pool_maxsize=250)
        session.mount('https://', adapter)

        while parameters["starttime"] <= endTime:
            response = session.get(url, params=parameters)
            if response.status_code == 204:  # no content: there is no JSON body to decode
                break
            if response.status_code == 503:
                # Treated here as "request too large": retry the remaining window in smaller slices.
                timeList = split_time(parameters["starttime"], endTime, timedelta(days=365/6), 6)

                for i in range(len(timeList)-1):
                    parameters["starttime"] = timeList[i]
                    parameters["endtime"] = timeList[i+1]
                    response = session.get(url, params=parameters)
                    if response.status_code == 204:
                        continue
                    response.raise_for_status()
                    features = response.json()['features']
                    if features:
                        dfList.append(process_json(features))
                break # went through time list; end query

            response.raise_for_status()
            features = response.json()['features']
            if len(features) == 0: # no results for query; combine dfList
                break
            dfList.append(process_json(features))
            if len(features) < page_limit: # the query has reached its last earthquake
                break
            # 'time' is in milliseconds since the Unix epoch (the original divided by 1000
            # and treated the result as UTC). Exact timedelta arithmetic replaces
            # datetime.utcfromtimestamp, which is deprecated since Python 3.12 and can
            # fail for negative (pre-1970) timestamps on some platforms.
            parameters["starttime"] = epoch + timedelta(milliseconds=features[-1]['properties']['time'])

    if len(dfList) == 0:
        return None
    return pd.concat(dfList, axis=0, ignore_index=True)

# https://alexandra-zaharia.github.io/posts/how-to-return-a-result-from-a-python-thread/
class extract_thread(Thread):
    """Thread that keeps its target's return value in ``self.dataframe``.

    Construct it like a regular ``Thread`` (``target=``, ``args=``,
    ``kwargs=``), ``start()`` it, ``join()`` it, then read ``dataframe``.
    ``dataframe`` stays ``None`` when no target was given, when the target
    returned ``None``, or when the target raised. ``join`` is inherited
    unchanged and returns ``None``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.dataframe = None  # result of the target, available after join()

    def run(self):
        """Call the target and remember what it returns."""
        try:
            # A Thread may be created without a target; calling None would raise TypeError.
            if self._target is not None:
                self.dataframe = self._target(*self._args, **self._kwargs)
        finally:
            # Same cleanup as Thread.run: drop the references so the target and its
            # arguments are not kept alive by the finished thread object.
            del self._target, self._args, self._kwargs

# taken from https://www.geeksforgeeks.org/python-divide-date-range-to-n-equal-duration/
# https://stackoverflow.com/a/29721341
def split_time(startTime, endTime, minDelta, divSegment):
    """Cut [startTime, endTime] into slices that shrink toward endTime.

    Each step advances the cursor by ``1/divSegment`` of the time still
    remaining, and stops once the remainder is no longer greater than
    ``minDelta``; the final slice always ends at ``endTime``.

    Returns the list of boundaries, beginning with ``startTime`` and ending
    with ``endTime``. The number of boundaries is printed as a side effect.
    """
    boundaries = [startTime]
    cursor = startTime
    remaining = endTime - startTime
    while remaining > minDelta:  # later (more recent) data ends up in shorter slices
        step = remaining / divSegment
        remaining -= step
        cursor += step
        boundaries.append(cursor)
    boundaries.append(endTime)
    print("time %d" % len(boundaries))
    return boundaries


In [None]:
def get_data(startTime=datetime(1568, 1, 1, 0, 0, 0), endTime=datetime(2024, 6, 5)):
    """Download all QuakeML earthquake records in a time window, one thread per slice.

    The window is cut with ``split_time`` and every slice is fetched by its
    own ``extract_thread`` running ``extract_quakeml``. The per-slice frames
    are concatenated and exact duplicate rows are dropped (adjacent slices
    and pages share boundary events). The row count before and after
    de-duplication is printed.

    Parameters
    ----------
    startTime, endTime : datetime, optional
        Bounds of the window. The defaults are the values that used to be
        hard-coded here (1568 was chosen to reach the first QuakeML record).

    Returns
    -------
    pandas.DataFrame
        De-duplicated events with a fresh 0..n-1 index, exactly as parsed (no
        column filtering or type conversion). Empty when no slice produced
        any data.
    """
    timeList = split_time(startTime, endTime, timedelta(days=365/12), 6)

    quakeThreads = []
    for sliceStart, sliceEnd in zip(timeList, timeList[1:]):  # consecutive boundary pairs
        quakeThread = extract_thread(target=extract_quakeml, args=(sliceStart, sliceEnd))
        quakeThreads.append(quakeThread)
        quakeThread.start()

    quakeList = []
    for thread in quakeThreads:
        thread.join()
        # dataframe is None when the slice had no events or its worker failed.
        if thread.dataframe is not None:
            quakeList.append(thread.dataframe)

    if not quakeList:
        return pd.DataFrame()

    quakeDf = pd.concat(quakeList, ignore_index=True)
    print(len(quakeDf.index))
    quakeDf = quakeDf.drop_duplicates() # dates do overlap although miniscule
    print(len(quakeDf.index))
    # reset_index(inplace=True) returns None, so the frame itself was never returned;
    # return a new frame and discard the old index instead of adding it as a column.
    return quakeDf.reset_index(drop=True)
    
# Runs the full download when this cell executes; whatever get_data() returns is bound to `hi`.
hi = get_data()

time 50
timer 0.0014440559898503125
timer 0.0018358289962634444
timer 0.005612892971839756
timer 0.015079768025316298
timer 0.036731676023919135
timer 0.024372494022827595
timer 0.9680604099994525
timer 0.0002481769770383835
timer 1.8405761569738388
timer 0.10714191698934883
timer 1.1278408269863576
timer 1.495743635983672
timer 0.48575681500369683
timer 1.1275144839892164
timer 2.0450863240403123
timer 1.9271240139496513
timer 2.2500533770071343
timer 3.0057003880501725
timer 2.1156404219800606
timer 3.379306143033318
timer 2.3516680260072462
timer 5.228326611046214
timer 4.379973126982804
timer 8.672872475988697
process end: 2018-09-07 23:21:45.239663
timer 5.5808700260240585
timer 3.971298646996729
process end: 2016-02-28 06:45:43.545114
timer 8.639319517998956
process end: 1999-09-28 04:04:16.284215
timer 3.753787084016949
timer 5.867406991019379
process end: 2022-10-28 18:12:03.463394
timer 6.473106631019618
process end: 2021-02-07 10:37:52.013694
timer 13.502980375953484
process 