In [1]:
import toyplot
import requests
import pandas as pd

In [2]:
# store the base url as a string variable
baseurl = "http://api.gbif.org/v1/occurrence/search?"


In [3]:
# store a endpoint request as a string variable
search_url = "http://api.gbif.org/v1/occurrence/search?q=Bombus"

In [4]:
# create a Response instance from a request
response = requests.get(search_url)

In [5]:
# check that your request worked (200 = worked; other codes No))
response.status_code

200

In [6]:
# or, run this to check if it worked.
# This would return an error message if it didn't work (else None)
response.raise_for_status()

In [7]:
# first 500 characters of the .text string from GBIF API query
response.text[:500]

'{"offset":0,"limit":20,"endOfRecords":false,"count":3928343,"results":[{"key":1792076088,"datasetKey":"4fa7b334-ce0d-4e88-aaae-2e0c138d049e","publishingOrgKey":"e2e717bf-551a-4917-bdc9-4fa0f342c530","installationKey":"7182d304-b0a2-404b-baba-2086a325c221","hostingOrganizationKey":"e2e717bf-551a-4917-bdc9-4fa0f342c530","publishingCountry":"EC","protocol":"DWC_ARCHIVE","lastCrawled":"2024-09-27T13:35:39.907+00:00","lastParsed":"2025-02-05T01:03:29.507+00:00","crawlId":20,"extensions":{},"basisOfRe'

In [8]:
# or, get results as a dictionary (JSON converted)
rdict = response.json()

# get some quick info on the dictionary keys
list(rdict.keys())

['offset', 'limit', 'endOfRecords', 'count', 'results', 'facets']

In [9]:
## how many records are there for this query
rdict["count"]

3928343

In [10]:
## how many records were returned
rdict["limit"]

20

In [11]:
## starting from which record
rdict["offset"]

0

In [12]:
# here is the first record, it's also a dictionary
rdict["results"][0]

{'key': 1792076088,
 'datasetKey': '4fa7b334-ce0d-4e88-aaae-2e0c138d049e',
 'publishingOrgKey': 'e2e717bf-551a-4917-bdc9-4fa0f342c530',
 'installationKey': '7182d304-b0a2-404b-baba-2086a325c221',
 'hostingOrganizationKey': 'e2e717bf-551a-4917-bdc9-4fa0f342c530',
 'publishingCountry': 'EC',
 'protocol': 'DWC_ARCHIVE',
 'lastCrawled': '2024-09-27T13:35:39.907+00:00',
 'lastParsed': '2025-02-05T01:03:29.507+00:00',
 'crawlId': 20,
 'extensions': {},
 'basisOfRecord': 'HUMAN_OBSERVATION',
 'individualCount': 1,
 'occurrenceStatus': 'PRESENT',
 'taxonKey': 5228583,
 'kingdomKey': 1,
 'phylumKey': 44,
 'classKey': 212,
 'orderKey': 1448,
 'familyKey': 5289,
 'genusKey': 2476913,
 'speciesKey': 5228583,
 'acceptedTaxonKey': 5228583,
 'scientificName': 'Chaetocercus bombus Gould, 1871',
 'acceptedScientificName': 'Chaetocercus bombus Gould, 1871',
 'kingdom': 'Animalia',
 'phylum': 'Chordata',
 'order': 'Apodiformes',
 'family': 'Trochilidae',
 'genus': 'Chaetocercus',
 'species': 'Chaetocercu

In [13]:
# load as a dataframe
sdf = pd.json_normalize(rdict['results'])
sdf.head()

Unnamed: 0,key,datasetKey,publishingOrgKey,installationKey,hostingOrganizationKey,publishingCountry,protocol,lastCrawled,lastParsed,crawlId,...,gbifID,occurrenceID,gadm.level0.gid,gadm.level0.name,gadm.level1.gid,gadm.level1.name,gadm.level2.gid,gadm.level2.name,gadm.level3.gid,gadm.level3.name
0,1792076088,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,e2e717bf-551a-4917-bdc9-4fa0f342c530,7182d304-b0a2-404b-baba-2086a325c221,e2e717bf-551a-4917-bdc9-4fa0f342c530,EC,DWC_ARCHIVE,2024-09-27T13:35:39.907+00:00,2025-02-05T01:03:29.507+00:00,20,...,1792076088,URN:catalog:CLO:EBIRD:OBS530014911,ECU,Ecuador,ECU.24_1,Zamora Chinchipe,ECU.24.9_1,Zamora,ECU.24.9.7_1,Zamora
1,3580016382,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,e2e717bf-551a-4917-bdc9-4fa0f342c530,7182d304-b0a2-404b-baba-2086a325c221,e2e717bf-551a-4917-bdc9-4fa0f342c530,EC,DWC_ARCHIVE,2024-09-27T13:35:39.907+00:00,2025-02-05T05:16:27.384+00:00,20,...,3580016382,URN:catalog:CLO:EBIRD:OBS1291231701,ECU,Ecuador,ECU.24_1,Zamora Chinchipe,ECU.24.8_1,Yantzaza,ECU.24.8.3_1,Yantzaza (Yanzatza)
2,1725223274,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,e2e717bf-551a-4917-bdc9-4fa0f342c530,7182d304-b0a2-404b-baba-2086a325c221,e2e717bf-551a-4917-bdc9-4fa0f342c530,PE,DWC_ARCHIVE,2024-09-27T13:35:39.907+00:00,2025-02-05T04:01:36.067+00:00,20,...,1725223274,URN:catalog:CLO:EBIRD:OBS388078755,PER,Peru,PER.1_1,Amazonas,PER.1.3_1,Chachapoyas,PER.1.3.12_1,Magdalena
3,4355556963,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,e2e717bf-551a-4917-bdc9-4fa0f342c530,7182d304-b0a2-404b-baba-2086a325c221,e2e717bf-551a-4917-bdc9-4fa0f342c530,PE,DWC_ARCHIVE,2024-09-27T13:35:39.907+00:00,2025-02-05T05:20:08.282+00:00,20,...,4355556963,URN:catalog:CLO:EBIRD:OBS1339820584,PER,Peru,PER.23_1,San Martín,PER.23.8_1,Rioja,PER.23.8.4_1,Pardo Miguel
4,3560221744,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,e2e717bf-551a-4917-bdc9-4fa0f342c530,7182d304-b0a2-404b-baba-2086a325c221,e2e717bf-551a-4917-bdc9-4fa0f342c530,EC,DWC_ARCHIVE,2024-09-27T13:35:39.907+00:00,2025-02-05T02:28:36.392+00:00,20,...,3560221744,URN:catalog:CLO:EBIRD:OBS1295620793,ECU,Ecuador,ECU.7_1,El Oro,ECU.7.11_1,Piñas,ECU.7.11.3_1,Moromoro (Cab. En El Vado)


In [14]:
sdf.columns

Index(['key', 'datasetKey', 'publishingOrgKey', 'installationKey',
       'hostingOrganizationKey', 'publishingCountry', 'protocol',
       'lastCrawled', 'lastParsed', 'crawlId', 'basisOfRecord',
       'individualCount', 'occurrenceStatus', 'taxonKey', 'kingdomKey',
       'phylumKey', 'classKey', 'orderKey', 'familyKey', 'genusKey',
       'speciesKey', 'acceptedTaxonKey', 'scientificName',
       'acceptedScientificName', 'kingdom', 'phylum', 'order', 'family',
       'genus', 'species', 'genericName', 'specificEpithet', 'taxonRank',
       'taxonomicStatus', 'iucnRedListCategory', 'decimalLatitude',
       'decimalLongitude', 'continent', 'stateProvince', 'year', 'month',
       'day', 'eventDate', 'startDayOfYear', 'endDayOfYear', 'issues',
       'lastInterpreted', 'license', 'isSequenced', 'identifiers', 'media',
       'facts', 'relations', 'isInCluster', 'recordedBy', 'geodeticDatum',
       'class', 'countryCode', 'recordedByIDs', 'identifiedByIDs',
       'gbifRegion', 'cou

In [15]:
# previously we wrote this request by hand
urlpath = "http://api.gbif.org/v1/occurrence/search?q=Bombus"

In [16]:
requests.get(urlpath).text
#typing in by hand

'{"offset":0,"limit":20,"endOfRecords":false,"count":3928343,"results":[{"key":1792076088,"datasetKey":"4fa7b334-ce0d-4e88-aaae-2e0c138d049e","publishingOrgKey":"e2e717bf-551a-4917-bdc9-4fa0f342c530","installationKey":"7182d304-b0a2-404b-baba-2086a325c221","hostingOrganizationKey":"e2e717bf-551a-4917-bdc9-4fa0f342c530","publishingCountry":"EC","protocol":"DWC_ARCHIVE","lastCrawled":"2024-09-27T13:35:39.907+00:00","lastParsed":"2025-02-05T01:03:29.507+00:00","crawlId":20,"extensions":{},"basisOfRecord":"HUMAN_OBSERVATION","individualCount":1,"occurrenceStatus":"PRESENT","taxonKey":5228583,"kingdomKey":1,"phylumKey":44,"classKey":212,"orderKey":1448,"familyKey":5289,"genusKey":2476913,"speciesKey":5228583,"acceptedTaxonKey":5228583,"scientificName":"Chaetocercus bombus Gould, 1871","acceptedScientificName":"Chaetocercus bombus Gould, 1871","kingdom":"Animalia","phylum":"Chordata","order":"Apodiformes","family":"Trochilidae","genus":"Chaetocercus","species":"Chaetocercus bombus","genericN

In [17]:
#That was not very nice
#format parameters as a dictionary
# here we create the same urlpath using params
response = requests.get(
    url="https://api.gbif.org/v1/occurrence/search/",
    params={"q": "Bombus"}
)

# show url path
print(response.url)


https://api.gbif.org/v1/occurrence/search/?q=Bombus


In [18]:
res = response.json()
sdf = pd.json_normalize(dict['results'])


TypeError: 'type' object is not subscriptable

In [19]:
# get taxonomy info for the genus Bombus
#queries only where bombus is specifically in genus
res = requests.get(
    url="https://api.gbif.org/v1/species/match/",
    params={"genus": "Bombus"},
)
res.json()
#information getting back is one hit - can use genus key value for next search to declutter results
#genus key is 1340278

{'usageKey': 1340278,
 'scientificName': 'Bombus Latreille, 1802',
 'canonicalName': 'Bombus',
 'rank': 'GENUS',
 'status': 'ACCEPTED',
 'confidence': 94,
 'matchType': 'EXACT',
 'kingdom': 'Animalia',
 'phylum': 'Arthropoda',
 'order': 'Hymenoptera',
 'family': 'Apidae',
 'genus': 'Bombus',
 'kingdomKey': 1,
 'phylumKey': 54,
 'classKey': 216,
 'orderKey': 1457,
 'familyKey': 4334,
 'genusKey': 1340278,
 'synonym': False,
 'class': 'Insecta'}

In [20]:
# get taxonomy info for the genus Pedicularis
res = requests.get(
    url="https://api.gbif.org/v1/species/match/",
    params={"genus": "Pedicularis"},
)
res.json()

{'usageKey': 3171670,
 'scientificName': 'Pedicularis L.',
 'canonicalName': 'Pedicularis',
 'rank': 'GENUS',
 'status': 'ACCEPTED',
 'confidence': 95,
 'matchType': 'EXACT',
 'kingdom': 'Plantae',
 'phylum': 'Tracheophyta',
 'order': 'Lamiales',
 'family': 'Orobanchaceae',
 'genus': 'Pedicularis',
 'kingdomKey': 6,
 'phylumKey': 7707728,
 'classKey': 220,
 'orderKey': 408,
 'familyKey': 6651,
 'genusKey': 3171670,
 'synonym': False,
 'class': 'Magnoliopsida'}

In [21]:
# add requirement that the record have coordinate data
res = requests.get(
    url="https://api.gbif.org/v1/occurrence/search/",
    params={
        "genusKey": 1340278, 
        "hasCoordinate": "true",
    }
)
res.url
#hasCoordinate is location data

'https://api.gbif.org/v1/occurrence/search/?genusKey=1340278&hasCoordinate=true'

In [22]:
# request records 0-100
res = requests.get(
    url="https://api.gbif.org/v1/occurrence/search/",
    params={
        "genusKey": 1340278, 
        "hasCoordinate": "true",
        "offset": 100,
        "limit": 2,
    }
)
res.url
res.json()
#by default gives limit 20 - in get call can change offset and limit
#offset 100, limit 2: gives 101 and 102 records
#limit max is 300 - can write a for loop to iterate to get all data
#limits on amount per request to not abuse database and put strain on requesting terrabytes of data

{'offset': 100,
 'limit': 2,
 'endOfRecords': False,
 'count': 3521306,
 'results': [{'key': 5007561247,
   'datasetKey': '50c9509d-22c7-4a22-a47d-8c48425ef4a7',
   'publishingOrgKey': '28eb1a3f-1c15-4a95-931a-4af90ecb574d',
   'installationKey': '997448a8-f762-11e1-a439-00145eb45e9a',
   'hostingOrganizationKey': '28eb1a3f-1c15-4a95-931a-4af90ecb574d',
   'publishingCountry': 'US',
   'protocol': 'DWC_ARCHIVE',
   'lastCrawled': '2025-02-16T04:52:36.717+00:00',
   'lastParsed': '2025-02-16T15:54:27.861+00:00',
   'crawlId': 518,
   'projectId': 'https://www.inaturalist.org/projects/texas-invertebrate-species-of-conservation-need',
   'extensions': {'http://rs.gbif.org/terms/1.0/Multimedia': [{'http://purl.org/dc/terms/created': '2025-01-04T23:12:41Z',
      'http://purl.org/dc/terms/type': 'StillImage',
      'http://purl.org/dc/terms/creator': 'juliaschaubert',
      'http://purl.org/dc/terms/rightsHolder': 'juliaschaubert',
      'http://purl.org/dc/terms/publisher': 'iNaturalist',


In [23]:
res = requests.get(
    url="https://api.gbif.org/v1/occurrence/search/",
    params={
        "genusKey": 1340278, 
        "year": "1900,1910", 
        #dates of records
        "basisOfRecord": "PRESERVED_SPECIMEN",
        #physical specimen
        "hasCoordinate": "true",
        #has location dara
        "hasGeospatialIssue": "false",
        "country": "US",
        #records in US
    },
)

print(res.json()["count"])
#still 7373 hits in database even though specific

7373


In [24]:
def get_single_batch(genusKey, year, offset=0, limit=20):
    """
    Returns a GBIF REST query with records between offset
    and offset + limit in JSON format. The genusKey and 
    year interval can be changed.
    """
    res = requests.get(
        url="https://api.gbif.org/v1/occurrence/search/",
        params={
            "genusKey": genusKey,
            "year": year,
            "offset": offset,
            "limit": limit,
            "hasCoordinate": "true",
            "country": "US",
        }
    )
    return res.json()

In [25]:
# test single batch function
jdata = get_single_batch(
    genusKey=3171670,
    year="1990,2020",
    offset=0, 
    limit=20
)

# how many results were fetched?
print(len(jdata["results"]))

20


In [26]:
# did we reach the end of the records?
jdata["endOfRecords"]

False

In [27]:
def get_all_records(genusKey, year):
    """
    Iterate requests over incremental offset positions until
    all records have been fetched. When the last record has
    been fetched the key 'endOfRecords' will be 'true'. Takes
    the API params as a dictionary. Returns result as a list
    of dictionaries.
    """
    # for storing results
    alldata = []

    # continue until we call 'break'
    offset = 0
    while 1:

        # get JSON data for a batch 
        jdata = get_single_batch(genusKey, year, offset, 300)

        # increment counter by 300 (the max limit)
        offset += 300

        # add this batch of data to the growing list
        alldata.extend(jdata["results"])

        # stop when end of record is reached
        if jdata["endOfRecords"]:
            print(f'Done. Found {len(alldata)} records')
            break

        # print a dot on each rep to show progress
        print('.', end='')

    return alldata

In [28]:
# call function to search over all offset values until end. 
# THIS MAY TAKE A FEW MINUTES TO RUN
jdata = get_all_records(1340278, "1900,1902")

....Done. Found 1223 records


In [29]:
# convert to a data frame
df = pd.json_normalize(jdata)

In [30]:
# keys (columns) in the dataframe (there are many!)
list(df.columns)

['key',
 'datasetKey',
 'publishingOrgKey',
 'networkKeys',
 'installationKey',
 'hostingOrganizationKey',
 'publishingCountry',
 'protocol',
 'lastCrawled',
 'lastParsed',
 'crawlId',
 'basisOfRecord',
 'individualCount',
 'occurrenceStatus',
 'lifeStage',
 'taxonKey',
 'kingdomKey',
 'phylumKey',
 'classKey',
 'orderKey',
 'familyKey',
 'genusKey',
 'speciesKey',
 'acceptedTaxonKey',
 'scientificName',
 'acceptedScientificName',
 'kingdom',
 'phylum',
 'order',
 'family',
 'genus',
 'species',
 'genericName',
 'specificEpithet',
 'taxonRank',
 'taxonomicStatus',
 'iucnRedListCategory',
 'decimalLatitude',
 'decimalLongitude',
 'continent',
 'stateProvince',
 'higherGeography',
 'year',
 'month',
 'day',
 'eventDate',
 'startDayOfYear',
 'endDayOfYear',
 'issues',
 'modified',
 'lastInterpreted',
 'license',
 'isSequenced',
 'identifiers',
 'media',
 'facts',
 'relations',
 'institutionKey',
 'collectionKey',
 'isInCluster',
 'datasetName',
 'recordedBy',
 'preparations',
 'geodeticDa

In [31]:
# view just the columns we're interested in for now.
sdf = df[["species", "year", "decimalLatitude", "decimalLongitude"]]
sdf.head()

Unnamed: 0,species,year,decimalLatitude,decimalLongitude
0,Bombus pensylvanicus,1902,39.98,-82.98
1,Bombus vosnesenskii,1902,38.18741,-122.5208
2,Bombus vosnesenskii,1902,37.3996,-121.7997
3,Bombus rufocinctus,1902,37.397444,-121.802593
4,Bombus californicus,1902,38.50222,-122.26528


In [32]:
# how many records?
sdf.shape

(1223, 4)

In [33]:
# which unique species?
print(sdf.species.unique())

['Bombus pensylvanicus' 'Bombus vosnesenskii' 'Bombus rufocinctus'
 'Bombus californicus' 'Bombus melanopygus' 'Bombus impatiens'
 'Bombus centralis' 'Bombus appositus' 'Bombus occidentalis'
 'Bombus terricola' 'Bombus griseocollis' 'Bombus vagans' 'Bombus huntii'
 'Bombus fervidus' 'Bombus insularis' 'Bombus bifarius'
 'Bombus bimaculatus' 'Bombus auricomus' 'Bombus perplexus'
 'Bombus variabilis' 'Bombus affinis' 'Bombus cockerelli'
 'Bombus citrinus' 'Bombus nevadensis' 'Bombus morrisoni' nan
 'Bombus sonorus' 'Bombus ternarius' 'Bombus sitkensis' 'Bombus neoboreus'
 'Bombus mixtus' 'Bombus suckleyi' 'Bombus ashtoni' 'Bombus fraternus'
 'Bombus frigidus' 'Bombus flavifrons' 'Bombus fernaldae'
 'Bombus sylvicola' 'Bombus patagiatus' 'Bombus lapponicus'
 'Bombus balteatus' 'Bombus borealis' 'Bombus crotchii']


In [34]:
# plot the number of each species in order (hover over bars for names)
sp_counts = df.species.value_counts()
toyplot.bars(sp_counts, height=350, title=sp_counts.index);

In [None]:
#functions from above
def get_single_batch(genusKey, year, offset=0, limit=20):
    """
    Returns a GBIF REST query with records between offset
    and offset + limit in JSON format. The genusKey and 
    year interval can be changed.
    """
    res = requests.get(
        url="https://api.gbif.org/v1/occurrence/search/",
        params={
            "genusKey": genusKey,
            "year": year,
            "offset": offset,
            "limit": limit,
            "hasCoordinate": "true",
            "country": "US",
        }
    )
    return res.json()

def get_all_records(genusKey, year):
    """
    Iterate requests over incremental offset positions until
    all records have been fetched. When the last record has
    been fetched the key 'endOfRecords' will be 'true'. Takes
    the API params as a dictionary. Returns result as a list
    of dictionaries.
    """
    # for storing results
    alldata = []

    # continue until we call 'break'
    offset = 0
    while 1:

        # get JSON data for a batch 
        jdata = get_single_batch(genusKey, year, offset, 300)

        # increment counter by 300 (the max limit)
        offset += 300

        # add this batch of data to the growing list
        alldata.extend(jdata["results"])

        # stop when end of record is reached
        if jdata["endOfRecords"]:
            print(f'Done. Found {len(alldata)} records')
            break

        # print a dot on each rep to show progress
        print('.', end='')

    return alldata

In [43]:
# my class Records

class Records:
    def __init__(self, genusKey=None, year=None):

        # store input params
        self.genusKey = genusKey
        self.year = year

        # will be used to store output results
        self.df = []
        self.json = []

    def get_single_batch(self, offset=0, limit=20):
        "returns JSON result for a small batch query"
        res = requests.get(
        url="https://api.gbif.org/v1/occurrence/search/",
        params={
            "genusKey": self.genusKey,
            "year": self.year,
            "offset": offset,
            "limit": limit,
            "hasCoordinate": "true",
            "country": "US",
            }
        )
        return res.json()

    def get_all_records(self):
        "stores result for all records to self.json and self.df"

        # continue until we call 'break'
        offset = 0
        while 1:

            # get JSON data for a batch 
            jdata = get_single_batch(self.genusKey, self.year, offset, 300)

            # increment counter by 300 (the max limit)
            offset += 300

            # add this batch of data to the growing list
            self.json.extend(jdata["results"])

            # stop when end of record is reached
            if jdata["endOfRecords"]:
                print(f'Done. Found {len(self.json)} records')
                break

            # print a dot on each rep to show progress
            print('.', end='')
            
        self.df = pd.json_normalize(self.json)

        return self.json
        return self.df
    


In [44]:
#Testing class Records
# create instance by entering query and a range of years as integers
rec = Records(genusKey=1340278, year="1980,1985")


In [45]:
# show a small result
print(rec.get_single_batch(offset=0, limit=10))

{'offset': 0, 'limit': 10, 'endOfRecords': False, 'count': 13590, 'results': [{'key': 658682722, 'datasetKey': '10e44c48-0839-4a20-86d5-f0e23ae2e366', 'publishingOrgKey': '1e26a630-7203-11dc-a0d8-b8a03c50a862', 'installationKey': '394c0f8e-8b5e-4fbc-b942-568a4a267032', 'hostingOrganizationKey': 'c3ad790a-d426-4ac1-8e32-da61f81f0117', 'publishingCountry': 'US', 'protocol': 'EML', 'lastCrawled': '2025-01-18T02:06:33.991+00:00', 'lastParsed': '2025-01-31T22:08:48.876+00:00', 'crawlId': 267, 'extensions': {}, 'basisOfRecord': 'PRESERVED_SPECIMEN', 'occurrenceStatus': 'PRESENT', 'taxonKey': 1340518, 'kingdomKey': 1, 'phylumKey': 54, 'classKey': 216, 'orderKey': 1457, 'familyKey': 4334, 'genusKey': 1340278, 'speciesKey': 1340518, 'acceptedTaxonKey': 1340518, 'scientificName': 'Bombus melanopygus Nylander, 1848', 'acceptedScientificName': 'Bombus melanopygus Nylander, 1848', 'kingdom': 'Animalia', 'phylum': 'Arthropoda', 'order': 'Hymenoptera', 'family': 'Apidae', 'genus': 'Bombus', 'species'

In [46]:
# get all records
rec.get_all_records()

.............................................Done. Found 13590 records


[{'key': 658682722,
  'datasetKey': '10e44c48-0839-4a20-86d5-f0e23ae2e366',
  'publishingOrgKey': '1e26a630-7203-11dc-a0d8-b8a03c50a862',
  'installationKey': '394c0f8e-8b5e-4fbc-b942-568a4a267032',
  'hostingOrganizationKey': 'c3ad790a-d426-4ac1-8e32-da61f81f0117',
  'publishingCountry': 'US',
  'protocol': 'EML',
  'lastCrawled': '2025-01-18T02:06:33.991+00:00',
  'lastParsed': '2025-01-31T22:08:48.876+00:00',
  'crawlId': 267,
  'extensions': {},
  'basisOfRecord': 'PRESERVED_SPECIMEN',
  'occurrenceStatus': 'PRESENT',
  'taxonKey': 1340518,
  'kingdomKey': 1,
  'phylumKey': 54,
  'classKey': 216,
  'orderKey': 1457,
  'familyKey': 4334,
  'genusKey': 1340278,
  'speciesKey': 1340518,
  'acceptedTaxonKey': 1340518,
  'scientificName': 'Bombus melanopygus Nylander, 1848',
  'acceptedScientificName': 'Bombus melanopygus Nylander, 1848',
  'kingdom': 'Animalia',
  'phylum': 'Arthropoda',
  'order': 'Hymenoptera',
  'family': 'Apidae',
  'genus': 'Bombus',
  'species': 'Bombus melanopyg

In [47]:
# access all of the returned records as a dataframe 
# (here asking for the shape to see how many records there are)
rec.df.shape

(13590, 172)

In [48]:
import records

ModuleNotFoundError: No module named 'records'

In [33]:
#Tutorial 12.1
vcf_url = "https://raw.githubusercontent.com/isaacovercast/easySFS/refs/heads/master/example_files/wcs_1200.vcf"

In [34]:
vcf_response = requests.get(vcf_url)

In [37]:
vcf_response.status_code

200

In [38]:
vcf_response.text[:500]

'##fileformat=VCFv4.0\n##Tassel=<ID=GenotypeTable,Version=5,Description="Reference allele is not known. The major allele was used as reference allele">\n##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the reference and alternate alleles in the order listed">\n##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth (only filtered reads used for calling)">\n##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype '

In [42]:
pd.read_csv(vcf_response)

ValueError: Invalid file path or buffer object type: <class 'requests.models.Response'>

In [48]:
with open("wcs.vcf", 'w') as outfile:
    outfile.write(res.text)
lines = open('wcs.vcf').readlines()
for idx, line in enumerate(lines):
    if "CHROM" in line:
        print(idx)
#knew CHROM was header by opening url and visualizing vcf

In [49]:
pd.read_csv(vcf_response, header=10, sep ="/t")

ValueError: Invalid file path or buffer object type: <class 'requests.models.Response'>

In [None]:
missing = (df)

In [None]:
#Tutorial 13.0

In [1]:
#This function is supposed to return a list of all values less than a given integer value. 
#The function optionally takes a max_value, and it requires to pass in a list of numbers. 
#Your goal is to get this function to work correctly. 
#This code block has two errors, which you can solve consecutively.

In [2]:
def get_smaller(my_list, max_value='5'):
    """Return a list of all values smaller than max_value"""
    for element in my_list:
        low = []
        if element < max_value:
            low.append(element)
    return low

my_list = [5, 2, 12, 7, 3, 8]
get_smaller(my_list=my_list)

TypeError: '<' not supported between instances of 'int' and 'str'

In [5]:
#removed quotes around 5 str  -> int
#low[] needed to be called before iteration over list
def get_smaller(my_list, max_value= 5):
    """Return a list of all values smaller than max_value"""
    low = []
    for element in my_list:
        if element < max_value:
            low.append(element)
    return low

my_list = [5, 2, 12, 7, 3, 8]
get_smaller(my_list=my_list)

[2, 3]

In [7]:
def fizzbuzz(max_num):
    "This method implements FizzBuzz"
    three_mul = 'fizz'
    five_mul = 'buzz'
    num1 = 3
    num2 = 5 

    # Google for 'range in python' to see what it does
    for i in range(1,max_num):
        # % or modulo division gives you the remainder 
        if i%num1==0 and i%num2==0:
            print(i,three_mul+five_mul)
        elif i%num1=0:
            print(i,three_mul)
        elif i%num2==0:
            print(i,five_mul)
fizzbuzz()

TypeError: fizzbuzz() missing 1 required positional argument: 'max_num'

In [11]:
#needed two == for first elif
#needed to set max_num

def fizzbuzz(max_num):
    "This method implements FizzBuzz"
    three_mul = 'fizz'
    five_mul = 'buzz'
    num1 = 3
    num2 = 5 

    # Google for 'range in python' to see what it does
    for i in range(1,max_num):
        # % or modulo division gives you the remainder 
        if i%num1==0 and i%num2==0:
            print(i,three_mul+five_mul)
        elif i%num1==0:
            print(i,three_mul)
        elif i%num2==0:
            print(i,five_mul)
fizzbuzz(max_num=16)

3 fizz
5 buzz
6 fizz
9 fizz
10 buzz
12 fizz
15 fizzbuzz


In [15]:
#This function takes one argument, numbers, which should be a list of numbers. 
#The function should sum all positive numbers in this list. Why doesn't it work?
def sum_positive(numbers):
    total = 0
    for num in numbers:
        if num > 0:
            total + num
    return total

sum_positive([1,2,3,4,5,6,7])

0

In [16]:
# += needed to add
# %2 ==0 for only positive numbers
def sum_positive(numbers):
    total = 0
    for num in numbers:
        if num%2 == 0:
            total += num
    return total

sum_positive([1,2,3,4,5,6,7])

12