# German Mammals GBIF

Using the Data from GBIF to create the list of German Mammals

In [7]:
import os
import time
import zipfile
from getpass import getpass

import pandas as pd
# we refer to the occurrence module as occ
from pygbif import occurrences as occ
from pygbif import species as species


Finding the keys for the different taxon levels can be a little tricky if you do not know where or how to look.

For example, here we want to find mammals in Germany. We can check the documentation of occ.search() by following this link: https://pygbif.readthedocs.io/en/latest/docs/usecases.html

There we find that our class key is an integer: "classKey – [int] Class classification key". But we do not have a list of classes and their corresponding integers, which is confusing (if such a list exists, please send me the link). If you are like me, you will try entering a string like 'mammalia' anyway, only to get a traceback. What you can do instead is go to the GBIF page for the taxon of interest and pull the key from its URL. For mammals that page is https://www.gbif.org/species/359, so classKey = 359.

occ.search() allows us to specify several parameters, including country. Here the documentation is fairly straightforward: "country – [str] The 2-letter country code (as per ISO-3166-1) of the country in which the occurrence was recorded. See here http://en.wikipedia.org/wiki/ISO_3166-1_alpha-2"

We can visit the link and find that the two letter string for Germany is 'DE', or that Samoa is 'WS'. 

There are several other arguments that are important:
1. limit-which gives the number of returned records the default is 300 and the limit is 1000.
2. offset-which indicates where to start from
3. q - allows search with a word or phrase

Alright, now let's try it by looking at the first 10 mammal records.

In [8]:
# Peek at the first 10 occurrence records for class 359 (Mammalia) in Germany
occ.search(
    classKey=359,
    country="DE",
    limit=10,
)

{'offset': 0,
 'limit': 10,
 'endOfRecords': False,
 'count': 835559,
 'results': [{'key': 5004507484,
   'datasetKey': 'aa6c5ee6-d4d7-4a65-a04f-379cffbf4842',
   'publishingOrgKey': '2754e9c0-0e43-4f65-968a-6f16b9c378ce',
   'installationKey': 'dcceb601-2fb0-49dc-9cd2-7c00056f2b2c',
   'hostingOrganizationKey': '2754e9c0-0e43-4f65-968a-6f16b9c378ce',
   'publishingCountry': 'DE',
   'protocol': 'BIOCASE',
   'lastCrawled': '2025-11-17T11:37:19.101+00:00',
   'lastParsed': '2025-11-17T11:52:41.317+00:00',
   'crawlId': 368,
   'extensions': {},
   'basisOfRecord': 'HUMAN_OBSERVATION',
   'occurrenceStatus': 'PRESENT',
   'classifications': {'7ddf754f-d193-4cc9-b351-99906754a03b': {'usage': {'key': 'RQPW',
      'name': 'Castor fiber Linnaeus, 1758',
      'rank': 'SPECIES',
      'code': 'ZOOLOGICAL',
      'authorship': 'Linnaeus, 1758',
      'genericName': 'Castor',
      'specificEpithet': 'fiber',
      'formattedName': '<i>Castor</i> <i>fiber</i> Linnaeus, 1758'},
     'acceptedU

The above output is not particularly readable, nor is it in the table format I would like for a list of species, we also have no idea how many records exist. 

To find the number of records, let's use occ.count(). Note that it has no classKey argument; instead we use taxonKey.

In [9]:
# Total number of occurrence records for taxon 359 recorded in Germany
occ.count(
    taxonKey=359,
    country="DE",
)

835559

835,559 records is a lot. 

In [12]:
# Same count, restricted to records flagged as georeferenced
occ.count(
    taxonKey=359,
    country="DE",
    isGeoreferenced=True,
)

757967

We can see there are fewer records if we require the records to be georeferenced.

Let's see if we can download the data and then simplify the output.

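An interesting quirk of the occ.download() method is that the filters are not passed as keyword arguments. Instead they are passed as a list of query strings of the form 'key operator value', for example 'taxonKey = 359'.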

In [None]:
# GBIF credentials.
# Values already present in the environment are used as-is; anything missing
# is asked for interactively, so that no real password has to be typed into
# (and saved with) the notebook.
if not os.environ.get("GBIF_USER"):
    os.environ["GBIF_USER"] = input("GBIF username: ")
if not os.environ.get("GBIF_PWD"):
    os.environ["GBIF_PWD"] = getpass("GBIF password: ")  # input is not echoed
if not os.environ.get("GBIF_EMAIL"):
    os.environ["GBIF_EMAIL"] = input("GBIF e-mail: ")

# Create download: each filter is a 'key operator value' query string
download_key = occ.download(
    [
        'taxonKey = 359',
        'country = DE',
        'hasCoordinate = true'
    ],
    format="DWCA",  # or "SIMPLE_CSV", "SPECIES_LIST"
    user=os.environ["GBIF_USER"],
    pwd=os.environ["GBIF_PWD"],
    email=os.environ["GBIF_EMAIL"]
)



In [None]:
# Information about the download
print(download_key)
# The value returned by occ.download() is indexed with [0] to obtain the key
# that occ.download_meta() expects. The isinstance guard keeps this cell
# idempotent: once download_key is already the key string, re-running the cell
# must not index into the string and reduce it to its first character.
if not isinstance(download_key, str):
    download_key = download_key[0]
print(download_key)
occ.download_meta(download_key)

In [None]:
# Track the download's progress: poll GBIF until the request reaches a state
# it will not leave again, then report the outcome.
POLL_SECONDS = 60  # check every 1 minute
# FILE_ERASED is included so that polling an old, already-deleted download
# cannot loop forever (NOTE: status names per the GBIF download API).
TERMINAL_STATES = {"SUCCEEDED", "FAILED", "CANCELLED", "KILLED", "FILE_ERASED"}

while True:
    meta = occ.download_meta(download_key)
    status = meta["status"]
    print(f"Current GBIF status: {status}")

    if status in TERMINAL_STATES:
        break

    time.sleep(POLL_SECONDS)  # needs `import time` from the imports cell

if status == "SUCCEEDED":
    print("\n✅ GBIF download is finished!\n")
    print("\a")  # terminal bell / sound
else:
    print(f"\n⚠️ GBIF download ended with status: {status}\n")
    print("\a")  # still ping you

835,559 records is a lot. 

Because our request succeeded ('status': 'SUCCEEDED'), we can use occ.download_get() to download the zip file.

In [None]:
# Set `path` to the folder where you want to store the data.
# The folder is created up front in case it does not exist yet, so the
# download is not lost to a missing-directory error.
os.makedirs("outputs", exist_ok=True)
download = occ.download_get(
    download_key,
    path="outputs"
)

The download worked so now we have a zipfile

In [None]:
# `download` is not a plain path string: it is indexed by key below, and the
# location of the zip file is stored under "path".
print(download)        # the whole returned object
print(type(download))  # shows the container type
zip_path = download["path"]
print(zip_path)        # the actual path of the downloaded zip file


In [None]:
# Unpack every member of the downloaded archive into its own folder
outpath = "outputs/gbif_data"
with zipfile.ZipFile(zip_path, mode="r") as archive:
    archive.extractall(path=outpath)

print(f"Extracted to directory {outpath}")


In [None]:
# Read the occurrence data (tab-separated) from the folder the archive was
# extracted to, "outputs/gbif_data", rather than from a "gbif_data" folder
# next to the notebook, which is never created.
df = pd.read_csv("outputs/gbif_data/occurrence.txt", sep="\t")
df.head()



Check the documentation on this page to find the column names and their meaning:
https://dwc.tdwg.org/terms/

In [None]:
# Columns to keep: taxonomy, names, then coordinates
selectedColumns = (
    ["order", "family", "genus", "specificEpithet"]
    + ["scientificName", "vernacularName"]
    + ["decimalLatitude", "decimalLongitude"]
)
dfv1 = df.loc[:, selectedColumns]
dfv1.head()

In [None]:
# Keep only the rows with a complete species name (genus AND specific epithet)
# and build the binomial "Genus epithet" in a new `species` column.
# dropna + assign return a new frame, so no column is written onto a filtered
# slice (which is what triggers pandas' SettingWithCopyWarning). After the
# dropna neither part can be missing, so no fillna is needed.
dfv1 = (
    dfv1
    .dropna(subset=["genus", "specificEpithet"])
    .assign(species=lambda d: d["genus"] + " " + d["specificEpithet"])
)

dfv1.head()

The above is super nice but we only want the unique species 

In [None]:
def _join_unique_names(names):
    """Return the distinct non-null values of `names`, sorted and comma-separated."""
    distinct = set(names.dropna())
    return ', '.join(sorted(distinct))


# One row per (order, family, scientificName); the vernacular names of all
# records in a group are collapsed into a single string.
grouping_keys = ['order', 'family', 'scientificName']
species_list = dfv1.groupby(grouping_keys, as_index=False).agg(
    {'vernacularName': _join_unique_names}
)
species_list.head()

Unnamed: 0,order,family,scientificName,vernacularName
0,Afrosoricida,Tenrecidae,"Echinops telfairi Martin, 1838",
1,Afrosoricida,Tenrecidae,"Tenrec ecaudatus (Schreber, 1778)",
2,Artiodactyla,Anoplotheriidae,"Anoplotherium Cuvier, 1804",
3,Artiodactyla,Anoplotheriidae,"Dacrytherium ovinum (Owen, 1857)",
4,Artiodactyla,Anoplotheriidae,"Diplobune Rütimeyer, 1862",


In [None]:
# Unique binomial species names.
# The `species` column lives on dfv1; species_list is grouped by order, family
# and scientificName and has no `species` column, so it cannot be read there.
species_only = dfv1['species'].drop_duplicates().sort_values()

species_count = len(species_only)
print(f"There are {species_count} species in the list of mammals from germany")

# Last expression of the cell, so the preview is actually displayed
species_only.head()

In [None]:
# Save the grouped species table as CSV, leaving out the row index
species_list.to_csv(
    path_or_buf="MammalsOfGermany.csv",
    index=False,
)