# Occurrence data for Invertebrate Paleo Group Project, Spring 2021, AMNH RGGS

## Load libraries.

Some of these imports are vestigial, left over from other projects
(for example, matplotlib isn't necessary, but I'm leaving it in out of habit).

In [37]:
import os
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
%matplotlib inline

# PBDB Queries

Load PBDB occurrences from Asia and North America, from 520 to 500 Ma (million years ago). These URLs are from the
"Download Records" page on paleobiodb.org.

The CSV files are cached in the filenames specified below. You can force downloading by either deleting the CSV files
or by setting FORCE_DOWNLOAD below to True.

In [38]:
# Set to True to re-download both CSV files even when cached copies exist.
FORCE_DOWNLOAD = False

asia_520_500_pbdb_url = (
    "http://paleobiodb.org/data1.2/occs/list.csv?datainfo&rowcount&max_ma=520&min_ma=500&cc=ASI&"
    "lithology=siliciclastic,mixed,carbonate,evaporite,organic,chemical,volcanic,metasedimentary,"
    "metamorphic,other,unknown&envtype=terr,marine,carbonate,silicic,unknown,lacust,fluvial,karst,"
    "terrother,marginal,reef,stshallow,stdeep,offshore,slope,marindet&show=full,genus,subgenus,strat,env,ref"
)
asia_csv_filename = "pbdb_asia_raw_520_500.csv"

na_520_500_pbdb_url = (
    "http://paleobiodb.org/data1.2/occs/list.csv?datainfo&rowcount&max_ma=520&min_ma=500&cc=NOA&"
    "lithology=siliciclastic,mixed,carbonate,evaporite,organic,chemical,volcanic,metasedimentary,"
    "metamorphic,other,unknown&envtype=terr,marine,carbonate,silicic,unknown,lacust,fluvial,karst,"
    "terrother,marginal,reef,stshallow,stdeep,offshore,slope,marindet&show=full,genus,subgenus,strat,env,ref"
)
na_csv_filename = "pbdb_na_raw_520_500.csv"


def _download_if_needed(url, filename, force=False):
    """Download `url` into `filename` unless a cached copy is already on disk.

    Args:
        url: Address to fetch with an HTTP GET (redirects are followed).
        filename: Local path used as the cache for the response body.
        force: When true, download even if `filename` already exists.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            check runs before the file is opened, so a failed request never
            creates or overwrites the cache file.
    """
    if os.path.exists(filename) and not force:
        return
    response = requests.get(url, allow_redirects=True)
    # Without this check an error page would be saved as the "CSV" and then
    # reused on every later run because the cache file exists.
    response.raise_for_status()
    # The context manager closes the handle (and flushes the data) promptly.
    with open(filename, 'wb') as outfile:
        outfile.write(response.content)


_download_if_needed(asia_520_500_pbdb_url, asia_csv_filename, force=FORCE_DOWNLOAD)
_download_if_needed(na_520_500_pbdb_url, na_csv_filename, force=FORCE_DOWNLOAD)

# Parsing options shared by both cached downloads.
# NOTE(review): skiprows=21 presumably skips the metadata preamble that precedes
# the column header in these downloads -- confirm the count if the query changes.
pbdb_read_kwargs = dict(low_memory=False, encoding='latin1', skiprows=21)

# Asia download -> rows whose formation is 'Kaili'.
pbdb_asia_occs_df = pd.read_csv(asia_csv_filename, **pbdb_read_kwargs)
kaili_occs_df = pbdb_asia_occs_df.loc[lambda occs: occs['formation'] == 'Kaili']

# North America download -> rows whose formation is 'Poleta' or 'Burgess Shale'.
pbdb_na_occs_df = pd.read_csv(na_csv_filename, **pbdb_read_kwargs)
poleta_occs_df = pbdb_na_occs_df.loc[lambda occs: occs['formation'] == 'Poleta']
burgess_occs_df = pbdb_na_occs_df.loc[lambda occs: occs['formation'] == 'Burgess Shale']

There are lots of columns. Like, *lots*. We will not use most of these, but here are some of them.

In [39]:
# Bare last expression so the notebook displays the frame's column Index.
kaili_occs_df.columns

Index(['occurrence_no', 'record_type', 'reid_no', 'flags', 'collection_no',
       'identified_name', 'identified_rank', 'identified_no', 'difference',
       'accepted_name',
       ...
       'ecospace_comments', 'composition', 'architecture', 'thickness',
       'reinforcement', 'genus.1', 'formation.1', 'stratgroup.1', 'member.1',
       'primary_reference'],
      dtype='object', length=123)

Now we want a list of the distinct taxa in each formation. The taxon name is conveniently stored in the "identified_name" column, so collecting the unique values of that column collapses all occurrences of the same taxon into a single entry.

In [40]:
# Distinct identified names per formation: building a set from the column
# collapses repeated occurrences of the same name into one entry.
kaili_id_name_set = set(kaili_occs_df['identified_name'])
burgess_id_name_set = set(burgess_occs_df['identified_name'])
poleta_id_name_set = set(poleta_occs_df['identified_name'])

# Report the number of distinct names in each formation.
print(f'Kaili taxa: {len(kaili_id_name_set)}')
print(f'Poleta taxa: {len(poleta_id_name_set)}')
print(f'Burgess Shale taxa: {len(burgess_id_name_set)}')

Kaili taxa: 130
Poleta taxa: 45
Burgess Shale taxa: 279


Now let's dump the taxonomic info for every one of these into a CSV file.

In [50]:
# Write one row of higher taxonomy per distinct Kaili identified name.
taxonomy_columns = ["phylum", "class", "order", "family", "genus", "identified_name"]

# drop_duplicates keeps the first occurrence of each identified_name, which is
# the same row the per-taxon `.iloc[0]` lookups used to pick, but it does so in
# a single pass instead of re-filtering the whole frame five times per taxon.
# Rows come out in first-appearance order, so the file is the same on every run.
kaili_taxa_df = kaili_occs_df.drop_duplicates(subset="identified_name")[taxonomy_columns]

# to_csv writes the same header line as before, separates fields with a plain
# "," (the old rows used ", " and so did not match the header), quotes any
# value that itself contains a comma, and leaves missing ranks as empty fields.
kaili_taxa_df.to_csv("kaili_taxa.csv", index=False)

In [10]:
# Map each "Genus species" binomial to the index labels of its Kaili occurrences.
# The mapping is built here, as the Poleta and Burgess Shale cells do for their
# own frames, so this cell no longer depends on a name that no earlier cell defines.
# NOTE(review): the '*_name' columns read below are not the ones the
# kaili_taxa.csv cell reads ('class', 'order', 'genus', ...) -- confirm they
# exist in the current download.
kaili_genus_species_dict = defaultdict(list)
for row_label, genus_name, species_name in zip(
    kaili_occs_df.index,
    kaili_occs_df['occurrence.genus_name'],
    kaili_occs_df['occurrence.species_name'],
):
    kaili_genus_species_dict[f'{genus_name} {species_name}'].append(row_label)

with open("kaili_sp.csv", "w") as outfile:
    print("class, order, genus, species", file=outfile)
    for row_labels in kaili_genus_species_dict.values():
        # The first occurrence of each binomial supplies its taxonomy.
        first_occurrence = kaili_occs_df.loc[row_labels[0]]
        cl = first_occurrence['class_name']
        order = first_occurrence['order_name']
        genus = first_occurrence['occurrence.genus_name']
        species = first_occurrence['occurrence.species_name']
        print(f'{cl}, {order}, {genus}, {species}', file=outfile)

Poleta:

In [11]:
# Map each "Genus species" binomial to the index labels of its Poleta occurrences.
# NOTE(review): the '*_name' columns read here are not the ones the
# kaili_taxa.csv cell reads ('class', 'order', 'genus', ...) -- confirm they
# exist in the current download.
poleta_genus_species_dict = defaultdict(list)
for row_label, genus_name, species_name in zip(
    poleta_occs_df.index,
    poleta_occs_df['occurrence.genus_name'],
    poleta_occs_df['occurrence.species_name'],
):
    poleta_genus_species_dict[f'{genus_name} {species_name}'].append(row_label)

with open("poleta_sp.csv", "w") as outfile:
    print("class, order, genus, species", file=outfile)
    for row_labels in poleta_genus_species_dict.values():
        # The first occurrence of each binomial supplies its taxonomy.
        first_occurrence = poleta_occs_df.loc[row_labels[0]]
        class_value = first_occurrence['class_name']
        order_value = first_occurrence['order_name']
        genus_value = first_occurrence['occurrence.genus_name']
        species_value = first_occurrence['occurrence.species_name']
        print(f'{class_value}, {order_value}, {genus_value}, {species_value}', file=outfile)


Burgess Shale:

In [12]:
# Map each "Genus species" binomial to the index labels of its Burgess Shale occurrences.
# NOTE(review): the '*_name' columns read here are not the ones the
# kaili_taxa.csv cell reads ('class', 'order', 'genus', ...) -- confirm they
# exist in the current download.
burgess_genus_species_dict = defaultdict(list)
for row_label, genus_name, species_name in zip(
    burgess_occs_df.index,
    burgess_occs_df['occurrence.genus_name'],
    burgess_occs_df['occurrence.species_name'],
):
    burgess_genus_species_dict[f'{genus_name} {species_name}'].append(row_label)

with open("burgess_sp.csv", "w") as outfile:
    print("class, order, genus, species", file=outfile)
    for row_labels in burgess_genus_species_dict.values():
        # The first occurrence of each binomial supplies its taxonomy.
        first_occurrence = burgess_occs_df.loc[row_labels[0]]
        class_value = first_occurrence['class_name']
        order_value = first_occurrence['order_name']
        genus_value = first_occurrence['occurrence.genus_name']
        species_value = first_occurrence['occurrence.species_name']
        print(f'{class_value}, {order_value}, {genus_value}, {species_value}', file=outfile)

In [13]:
# Display the single occurrence record whose index label is 8680.
# NOTE(review): the label is hard-coded; presumably it was a row known to be in
# the Kaili subset -- confirm it still exists after re-downloading the data.
kaili_occs_df.loc[8680]

collection_no                      79769
source_database                  PaleoDB
collection.authorizer           A. Hendy
class_name                   Eocrinoidea
class_extant                          no
order_name                       Gogiida
order_extant                         NaN
family_name                   Eocrinidae
family_extant                        NaN
genus_extant                         NaN
species_extant                        no
occurrence.genus_reso                NaN
occurrence.genus_name       Sinoeocrinus
occurrence.subgenus_reso             NaN
occurrence.subgenus_name             NaN
occurrence.species_reso              NaN
occurrence.species_name              lui
original.genus_reso                  NaN
original.genus_name                  NaN
original.subgenus_reso               NaN
original.subgenus_name               NaN
original.species_reso                NaN
original.species_name                NaN
occurrence.authorizer           A. Hendy
occurrence.enter