In [1]:
import toyplot
import requests
import pandas as pd

In [2]:
# store the base url as a string variable
baseurl = "http://api.gbif.org/v1/occurrence/search?"


In [3]:
# store a endpoint request as a string variable
search_url = "http://api.gbif.org/v1/occurrence/search?q=Bombus"

In [4]:
# create a Response instance from a request
response = requests.get(search_url)

In [5]:
# check that your request worked (200 = worked; other codes No))
response.status_code

200

In [6]:
# or, run this to check if it worked.
# This would return an error message if it didn't work (else None)
response.raise_for_status()

In [7]:
# first 500 characters of the .text string from GBIF API query
response.text[:500]

'{"offset":0,"limit":20,"endOfRecords":false,"count":3924237,"results":[{"key":1792076088,"datasetKey":"4fa7b334-ce0d-4e88-aaae-2e0c138d049e","publishingOrgKey":"e2e717bf-551a-4917-bdc9-4fa0f342c530","installationKey":"7182d304-b0a2-404b-baba-2086a325c221","hostingOrganizationKey":"e2e717bf-551a-4917-bdc9-4fa0f342c530","publishingCountry":"EC","protocol":"DWC_ARCHIVE","lastCrawled":"2024-09-27T13:35:39.907+00:00","lastParsed":"2025-02-05T01:03:29.507+00:00","crawlId":20,"extensions":{},"basisOfRe'

In [10]:
# or, get results as a dictionary (JSON converted)
rdict = response.json()

# get some quick info on the dictionary keys
list(rdict.keys())

['offset', 'limit', 'endOfRecords', 'count', 'results', 'facets']

In [11]:
## how many records are there for this query
rdict["count"]

3924237

In [12]:
## how many records were returned
rdict["limit"]

20

In [13]:
## starting from which record
rdict["offset"]

0

In [14]:
# here is the first record, it's also a dictionary
rdict["results"][0]

{'key': 1792076088,
 'datasetKey': '4fa7b334-ce0d-4e88-aaae-2e0c138d049e',
 'publishingOrgKey': 'e2e717bf-551a-4917-bdc9-4fa0f342c530',
 'installationKey': '7182d304-b0a2-404b-baba-2086a325c221',
 'hostingOrganizationKey': 'e2e717bf-551a-4917-bdc9-4fa0f342c530',
 'publishingCountry': 'EC',
 'protocol': 'DWC_ARCHIVE',
 'lastCrawled': '2024-09-27T13:35:39.907+00:00',
 'lastParsed': '2025-02-05T01:03:29.507+00:00',
 'crawlId': 20,
 'extensions': {},
 'basisOfRecord': 'HUMAN_OBSERVATION',
 'individualCount': 1,
 'occurrenceStatus': 'PRESENT',
 'taxonKey': 5228583,
 'kingdomKey': 1,
 'phylumKey': 44,
 'classKey': 212,
 'orderKey': 1448,
 'familyKey': 5289,
 'genusKey': 2476913,
 'speciesKey': 5228583,
 'acceptedTaxonKey': 5228583,
 'scientificName': 'Chaetocercus bombus Gould, 1871',
 'acceptedScientificName': 'Chaetocercus bombus Gould, 1871',
 'kingdom': 'Animalia',
 'phylum': 'Chordata',
 'order': 'Apodiformes',
 'family': 'Trochilidae',
 'genus': 'Chaetocercus',
 'species': 'Chaetocercu

In [15]:
# load as a dataframe
sdf = pd.json_normalize(rdict['results'])
sdf.head()

Unnamed: 0,key,datasetKey,publishingOrgKey,installationKey,hostingOrganizationKey,publishingCountry,protocol,lastCrawled,lastParsed,crawlId,...,collectionCode,occurrenceID,gadm.level0.gid,gadm.level0.name,gadm.level1.gid,gadm.level1.name,gadm.level2.gid,gadm.level2.name,gadm.level3.gid,gadm.level3.name
0,1792076088,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,e2e717bf-551a-4917-bdc9-4fa0f342c530,7182d304-b0a2-404b-baba-2086a325c221,e2e717bf-551a-4917-bdc9-4fa0f342c530,EC,DWC_ARCHIVE,2024-09-27T13:35:39.907+00:00,2025-02-05T01:03:29.507+00:00,20,...,EBIRD,URN:catalog:CLO:EBIRD:OBS530014911,ECU,Ecuador,ECU.24_1,Zamora Chinchipe,ECU.24.9_1,Zamora,ECU.24.9.7_1,Zamora
1,3580016382,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,e2e717bf-551a-4917-bdc9-4fa0f342c530,7182d304-b0a2-404b-baba-2086a325c221,e2e717bf-551a-4917-bdc9-4fa0f342c530,EC,DWC_ARCHIVE,2024-09-27T13:35:39.907+00:00,2025-02-05T05:16:27.384+00:00,20,...,EBIRD,URN:catalog:CLO:EBIRD:OBS1291231701,ECU,Ecuador,ECU.24_1,Zamora Chinchipe,ECU.24.8_1,Yantzaza,ECU.24.8.3_1,Yantzaza (Yanzatza)
2,1725223274,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,e2e717bf-551a-4917-bdc9-4fa0f342c530,7182d304-b0a2-404b-baba-2086a325c221,e2e717bf-551a-4917-bdc9-4fa0f342c530,PE,DWC_ARCHIVE,2024-09-27T13:35:39.907+00:00,2025-02-05T04:01:36.067+00:00,20,...,EBIRD,URN:catalog:CLO:EBIRD:OBS388078755,PER,Peru,PER.1_1,Amazonas,PER.1.3_1,Chachapoyas,PER.1.3.12_1,Magdalena
3,4355556963,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,e2e717bf-551a-4917-bdc9-4fa0f342c530,7182d304-b0a2-404b-baba-2086a325c221,e2e717bf-551a-4917-bdc9-4fa0f342c530,PE,DWC_ARCHIVE,2024-09-27T13:35:39.907+00:00,2025-02-05T05:20:08.282+00:00,20,...,EBIRD,URN:catalog:CLO:EBIRD:OBS1339820584,PER,Peru,PER.23_1,San Martín,PER.23.8_1,Rioja,PER.23.8.4_1,Pardo Miguel
4,3560221744,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,e2e717bf-551a-4917-bdc9-4fa0f342c530,7182d304-b0a2-404b-baba-2086a325c221,e2e717bf-551a-4917-bdc9-4fa0f342c530,EC,DWC_ARCHIVE,2024-09-27T13:35:39.907+00:00,2025-02-05T02:28:36.392+00:00,20,...,EBIRD,URN:catalog:CLO:EBIRD:OBS1295620793,ECU,Ecuador,ECU.7_1,El Oro,ECU.7.11_1,Piñas,ECU.7.11.3_1,Moromoro (Cab. En El Vado)


In [16]:
sdf.columns

Index(['key', 'datasetKey', 'publishingOrgKey', 'installationKey',
       'hostingOrganizationKey', 'publishingCountry', 'protocol',
       'lastCrawled', 'lastParsed', 'crawlId', 'basisOfRecord',
       'individualCount', 'occurrenceStatus', 'taxonKey', 'kingdomKey',
       'phylumKey', 'classKey', 'orderKey', 'familyKey', 'genusKey',
       'speciesKey', 'acceptedTaxonKey', 'scientificName',
       'acceptedScientificName', 'kingdom', 'phylum', 'order', 'family',
       'genus', 'species', 'genericName', 'specificEpithet', 'taxonRank',
       'taxonomicStatus', 'iucnRedListCategory', 'decimalLatitude',
       'decimalLongitude', 'continent', 'stateProvince', 'year', 'month',
       'day', 'eventDate', 'startDayOfYear', 'endDayOfYear', 'issues',
       'lastInterpreted', 'license', 'isSequenced', 'identifiers', 'media',
       'facts', 'relations', 'isInCluster', 'recordedBy', 'geodeticDatum',
       'class', 'countryCode', 'recordedByIDs', 'identifiedByIDs',
       'gbifRegion', 'cou

In [17]:
# previously we wrote this request by hand
urlpath = "http://api.gbif.org/v1/occurrence/search?q=Bombus"

In [18]:
requests.get(urlpath).text
#typing in by hand

'{"offset":0,"limit":20,"endOfRecords":false,"count":3924237,"results":[{"key":1792076088,"datasetKey":"4fa7b334-ce0d-4e88-aaae-2e0c138d049e","publishingOrgKey":"e2e717bf-551a-4917-bdc9-4fa0f342c530","installationKey":"7182d304-b0a2-404b-baba-2086a325c221","hostingOrganizationKey":"e2e717bf-551a-4917-bdc9-4fa0f342c530","publishingCountry":"EC","protocol":"DWC_ARCHIVE","lastCrawled":"2024-09-27T13:35:39.907+00:00","lastParsed":"2025-02-05T01:03:29.507+00:00","crawlId":20,"extensions":{},"basisOfRecord":"HUMAN_OBSERVATION","individualCount":1,"occurrenceStatus":"PRESENT","taxonKey":5228583,"kingdomKey":1,"phylumKey":44,"classKey":212,"orderKey":1448,"familyKey":5289,"genusKey":2476913,"speciesKey":5228583,"acceptedTaxonKey":5228583,"scientificName":"Chaetocercus bombus Gould, 1871","acceptedScientificName":"Chaetocercus bombus Gould, 1871","kingdom":"Animalia","phylum":"Chordata","order":"Apodiformes","family":"Trochilidae","genus":"Chaetocercus","species":"Chaetocercus bombus","genericN

In [19]:
#That was not very nice
#format parameters as a dictionary
# here we create the same urlpath using params
response = requests.get(
    url="https://api.gbif.org/v1/occurrence/search/",
    params={"q": "Bombus"}
)

# show url path
print(response.url)


https://api.gbif.org/v1/occurrence/search/?q=Bombus


In [21]:
res = response.json()
sdf = pd.json_normalize(dict['results'])


In [22]:
# get taxonomy info for the genus Bombus
#queries only where bombus is specifically in genus
res = requests.get(
    url="https://api.gbif.org/v1/species/match/",
    params={"genus": "Bombus"},
)
res.json()
#information getting back is one hit - can use genus key value for next search to declutter results
#genus key is 1340278

{'usageKey': 1340278,
 'scientificName': 'Bombus Latreille, 1802',
 'canonicalName': 'Bombus',
 'rank': 'GENUS',
 'status': 'ACCEPTED',
 'confidence': 94,
 'matchType': 'EXACT',
 'kingdom': 'Animalia',
 'phylum': 'Arthropoda',
 'order': 'Hymenoptera',
 'family': 'Apidae',
 'genus': 'Bombus',
 'kingdomKey': 1,
 'phylumKey': 54,
 'classKey': 216,
 'orderKey': 1457,
 'familyKey': 4334,
 'genusKey': 1340278,
 'synonym': False,
 'class': 'Insecta'}

In [23]:
# get taxonomy info for the genus Pedicularis
res = requests.get(
    url="https://api.gbif.org/v1/species/match/",
    params={"genus": "Pedicularis"},
)
res.json()

{'usageKey': 3171670,
 'scientificName': 'Pedicularis L.',
 'canonicalName': 'Pedicularis',
 'rank': 'GENUS',
 'status': 'ACCEPTED',
 'confidence': 95,
 'matchType': 'EXACT',
 'kingdom': 'Plantae',
 'phylum': 'Tracheophyta',
 'order': 'Lamiales',
 'family': 'Orobanchaceae',
 'genus': 'Pedicularis',
 'kingdomKey': 6,
 'phylumKey': 7707728,
 'classKey': 220,
 'orderKey': 408,
 'familyKey': 6651,
 'genusKey': 3171670,
 'synonym': False,
 'class': 'Magnoliopsida'}

In [24]:
# add requirement that the record have coordinate data
res = requests.get(
    url="https://api.gbif.org/v1/occurrence/search/",
    params={
        "genusKey": 1340278, 
        "hasCoordinate": "true",
    }
)
res.url
#hasCoordinate is location data

'https://api.gbif.org/v1/occurrence/search/?genusKey=1340278&hasCoordinate=true'

In [30]:
# request records 0-100
res = requests.get(
    url="https://api.gbif.org/v1/occurrence/search/",
    params={
        "genusKey": 1340278, 
        "hasCoordinate": "true",
        "offset": 100,
        "limit": 2,
    }
)
res.url
res.json()
#by default gives limit 20 - in get call can change offset and limit
#offset 100, limit 2: gives 101 and 102 records
#limit max is 300 - can write a for loop to iterate to get all data
#limits on amount per request to not abuse database and put strain on requesting terrabytes of data

{'offset': 100,
 'limit': 2,
 'endOfRecords': False,
 'count': 3517205,
 'results': [{'key': 5007561247,
   'datasetKey': '50c9509d-22c7-4a22-a47d-8c48425ef4a7',
   'publishingOrgKey': '28eb1a3f-1c15-4a95-931a-4af90ecb574d',
   'installationKey': '997448a8-f762-11e1-a439-00145eb45e9a',
   'hostingOrganizationKey': '28eb1a3f-1c15-4a95-931a-4af90ecb574d',
   'publishingCountry': 'US',
   'protocol': 'DWC_ARCHIVE',
   'lastCrawled': '2025-02-16T04:52:36.717+00:00',
   'lastParsed': '2025-02-16T15:54:27.861+00:00',
   'crawlId': 518,
   'projectId': 'https://www.inaturalist.org/projects/texas-invertebrate-species-of-conservation-need',
   'extensions': {'http://rs.gbif.org/terms/1.0/Multimedia': [{'http://purl.org/dc/terms/created': '2025-01-04T23:12:41Z',
      'http://purl.org/dc/terms/identifier': 'https://inaturalist-open-data.s3.amazonaws.com/photos/461751082/original.jpg',
      'http://purl.org/dc/terms/references': 'https://www.inaturalist.org/photos/461751082',
      'http://purl.

In [32]:
res = requests.get(
    url="https://api.gbif.org/v1/occurrence/search/",
    params={
        "genusKey": 1340278, 
        "year": "1900,1910", 
        #dates of records
        "basisOfRecord": "PRESERVED_SPECIMEN",
        #physical specimen
        "hasCoordinate": "true",
        #has location dara
        "hasGeospatialIssue": "false",
        "country": "US",
        #records in US
    },
)

print(res.json()["count"])
#still 7373 hits in database even though specific

7373


In [33]:
#Tutorial 12.1
vcf_url = "https://raw.githubusercontent.com/isaacovercast/easySFS/refs/heads/master/example_files/wcs_1200.vcf"

In [34]:
vcf_response = requests.get(vcf_url)

In [37]:
vcf_response.status_code

200

In [38]:
vcf_response.text[:500]

'##fileformat=VCFv4.0\n##Tassel=<ID=GenotypeTable,Version=5,Description="Reference allele is not known. The major allele was used as reference allele">\n##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the reference and alternate alleles in the order listed">\n##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth (only filtered reads used for calling)">\n##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype '

In [42]:
pd.read_csv(vcf_response)

ValueError: Invalid file path or buffer object type: <class 'requests.models.Response'>

In [48]:
with open("wcs.vcf", 'w') as outfile:
    outfile.write(res.text)
lines = open('wcs.vcf').readlines()
for idx, line in enumerate(lines):
    if "CHROM" in line:
        print(idx)
#knew CHROM was header by opening url and visualizing vcf

In [49]:
pd.read_csv(vcf_response, header=10, sep ="/t")

ValueError: Invalid file path or buffer object type: <class 'requests.models.Response'>

In [None]:
missing = (df)
#parse out 