In [1]:
import pdb
from datetime import datetime, timedelta
from xml.etree import ElementTree as ET

import pandas as pd
import requests

In [2]:
def extract_json(startTime, endTime, limit=20000):
    """Download earthquake events from the USGS event service as GeoJSON.

    Events are requested oldest-first in pages of ``limit`` events. When a
    page comes back full, the next request starts at the time of the last
    event received, and so on until a short (or empty) page is returned.

    Parameters
    ----------
    startTime, endTime : datetime
        Bounds of the query; sent as the ``starttime`` / ``endtime`` query
        parameters and compared with ``<=`` to drive the paging loop.
    limit : int, optional
        Page size sent as the ``limit`` query parameter. Defaults to 20000,
        the value that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        One row per event id with the columns ``id``, ``longitude``,
        ``latitude``, ``depth`` followed by the event properties. An empty
        frame with the same columns is returned when nothing matches.

    Raises
    ------
    requests.HTTPError
        If the service answers with an error status.
    """
    query_url = "https://earthquake.usgs.gov/fdsnws/event/1/query"
    # Property keys in output-column order. Values are looked up by these
    # names, so every column is guaranteed to hold the matching property
    # (previously 'title' and 'type' held each other's values).
    property_columns = ['mag', 'place', 'time', 'updated', 'tz', 'url', 'detail', 'felt', 'cdi',
                        'mmi', 'alert', 'status', 'tsunami', 'sig', 'net', 'code', 'ids', 'sources',
                        'types', 'nst', 'dmin', 'rms', 'gap', 'magType', 'title', 'type']
    columns = ['id', 'longitude', 'latitude', 'depth'] + property_columns
    epoch = datetime(1970, 1, 1)

    newStartTime = startTime
    dfList = []

    while newStartTime <= endTime:
        parameters = {"format": "geojson", "starttime": newStartTime, "endtime": endTime, "limit": limit,
                      "minmagnitude": 0, 'orderby': 'time-asc', 'eventtype': 'earthquake'}
        response = requests.get(query_url, params=parameters)
        response.raise_for_status()  # fail loudly instead of parsing an error page
        features = response.json()['features']
        if not features:  # nothing (more) to read; also avoids features[-1] on an empty list
            break

        rows = []
        for earthquake in features:
            prop = earthquake['properties']
            coor = earthquake['geometry']['coordinates']
            # .get() leaves a missing property as None instead of aborting the whole download
            rows.append([earthquake['id'], coor[0], coor[1], coor[2]]
                        + [prop.get(name) for name in property_columns])
        dfList.append(pd.DataFrame(rows, columns=columns))

        if len(features) < limit:  # the query has reached its last earthquake
            break
        # 'time' is in milliseconds relative to the Unix epoch (UTC) and is negative
        # for events before 1970; plain timedelta arithmetic handles that everywhere,
        # unlike the deprecated datetime.utcfromtimestamp.
        newStartTime = epoch + timedelta(milliseconds=features[-1]['properties']['time'])

    if not dfList:
        return pd.DataFrame(columns=columns)
    # The next page starts at the last event's time, so that boundary event can be
    # returned a second time; keep a single row per event id.
    combined = pd.concat(dfList, ignore_index=True)
    return combined.drop_duplicates(subset='id').reset_index(drop=True)


In [8]:
def extract_quakeml(startTime, endTime, limit=20000):
    """Download earthquake events from the USGS event service as QuakeML.

    Events are requested oldest-first in pages of ``limit`` events. Each
    ``<event>`` element is flattened into one row whose keys are the
    slash-joined tag path (namespace stripped) of every element inside it,
    e.g. ``event/origin/time/value``, and whose values are the element text.
    Events carry different sets of tags, so the resulting columns are the
    union over all events and missing entries are NaN.

    Parameters
    ----------
    startTime, endTime : datetime
        Bounds of the query; sent as the ``starttime`` / ``endtime`` query
        parameters and compared with ``<=`` to drive the paging loop.
    limit : int, optional
        Page size sent as the ``limit`` query parameter. Defaults to 20000,
        the value that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        One row per event returned by the service; an empty frame when
        nothing matches. Because a follow-up page starts at the time of the
        last event already received, that boundary event may appear twice.

    Raises
    ------
    requests.HTTPError
        If the service answers with an error status.
    """
    newStartTime = startTime
    dfList = []

    while newStartTime <= endTime:
        parameters = {"format": "quakeml", "starttime": newStartTime, "endtime": endTime, "limit": limit,
                      "minmagnitude": 0, 'orderby': 'time-asc', 'eventtype': 'earthquake'}
        response = requests.get("https://earthquake.usgs.gov/fdsnws/event/1/query", params=parameters)
        response.raise_for_status()  # fail loudly instead of feeding an error page to the XML parser

        # idea from https://stackoverflow.com/a/76596818
        parser = ET.XMLPullParser(events=("start", "end"))
        parser.feed(response.content)

        rows = []  # list of dicts, not a dict of lists: events don't all have the same tags
        path = []  # tag path from the current <event> down; empty while outside an event
        row = None
        for event, element in parser.read_events():
            key = element.tag[element.tag.rfind("}") + 1:]  # gets rid of namespace
            if key == "event":
                if event == "start":
                    row = {}
                    path = [key]
                else:  # end of an earthquake event
                    rows.append(row)
                    path = []
                continue
            if not path:
                # Wrapper elements and catalogue-level metadata outside any <event>.
                # They are skipped on both start and end so they can never leave
                # stray entries on the path and corrupt later column names.
                continue
            if event == "start":
                path.append(key)
            else:
                row["/".join(path)] = element.text
                path.pop()

        if not rows:  # nothing (more) to read; also avoids rows[-1] on an empty list
            break
        dfList.append(pd.DataFrame(rows))
        if len(rows) < limit:  # reached the end of the query
            break
        # time of the last (most recent) event received: start of the next query
        newStartTime = datetime.strptime(rows[-1]["event/origin/time/value"], "%Y-%m-%dT%H:%M:%S.%fZ")

    if not dfList:
        return pd.DataFrame()
    return pd.concat(dfList, ignore_index=True)
    

In [9]:
def get_data():
    """Fetch the earthquake catalogue from 1899-01-01 to 1970-01-01.

    Returns
    -------
    pandas.DataFrame
        Whatever ``extract_quakeml`` returns for that time window; no
        cleaning or filtering is applied here.
    """
    # TODO: a cleaning pass (dropping unused columns, filtering on event status,
    # converting the time column) was drafted here but never executed because it
    # sat after the return statement; reintroduce it as a separate, tested step.
    return extract_quakeml(datetime(1899, 1, 1, 0, 0), datetime(1970, 1, 1, 0, 0))

# Run the download; as the last expression in the cell, the returned DataFrame is displayed as the cell output.
get_data()

Unnamed: 0,event/description/type,event/description/text,event/description,event/origin/time/value,event/origin/time,event/origin/longitude/value,event/origin/longitude,event/origin/latitude/value,event/origin/latitude,event/origin/originUncertainty/horizontalUncertainty,...,event/magnitude/mag/uncertainty,event/origin/creationInfo/version,event/creationInfo/version,event/origin/quality/usedPhaseCount,event/origin/quality/usedStationCount,event/origin/quality/standardError,event/origin/quality/azimuthalGap,event/origin/quality/minimumDistance,event/origin/quality,event/magnitude/stationCount
0,earthquake name,"Wythe County, Virginia",,1899-02-13T09:30:00.000Z,,-81,,37,,0,...,,,,,,,,,,
1,earthquake name,"West of Trinidad, California",,1899-04-16T13:40:00.000Z,,-125.8,,41,,0,...,,,,,,,,,,
2,earthquake name,"Wabash Valley, near Princeton, Indiana",,1899-04-30T02:05:00.000Z,,-87.7,,38.3,,0,...,,,,,,,,,,
3,earthquake name,"Near San Juan Bautista, California",,1899-04-30T22:41:00.000Z,,-121.6,,36.85,,0,...,,,,,,,,,,
4,earthquake name,"Near San Francisco, California",,1899-06-02T07:19:00.000Z,,-122.5,,37.7,,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39436,earthquake name,"14 km S of Volcano, Hawaii",,1969-12-31T14:05:18.590Z,,-155.2236667,,19.3151667,,810,...,,1,1,16,16,0.12,204,,,1
39437,earthquake name,"43 km WNW of Naze, Japan",,1969-12-31T19:01:56.250Z,,129.075,,28.532,,0,...,0.2,,,,,,,,,
39438,earthquake name,"6km NE of Calexico, CA",,1969-12-31T22:45:21.770Z,,-115.4575,,32.7185,,2810,...,0.19,1,1,7,4,0.55,212,0.6258,,5
39439,earthquake name,"4km S of La Canada Flintridge, CA",,1970-01-01T00:00:00.000Z,,-118.1850357,,34.1645203,,0,...,,6,6,,,,,,,0
