# Sherwin Carlquist Collection - record retrieval

This script retrieves public records of the Sherwin Carlquist Collection (SJCC) on the Portal to Texas History. Various fields and values are extracted and saved to a CSV file. This CSV file can be uploaded to a collection in Symbiota using the Extended Data Importer (see https://biokic.github.io/symbiota-docs/coll_manager/upload/links/).

In [4]:
# Run this cell if you are using a cloud notebook such as Google Colab
!pip install sickle

In [12]:
import re

from sickle import Sickle
import pandas as pd

# Static placeholder values intended for relationship records. Neither value
# is written to the output rows in the cells shown in this notebook yet.
accordingTo = 'TBD'
basisOfRecord = 'TBD'  # Not sure how/if we'll use this

# When True, only PTH records that carry a relation are exported.
# Change to False if you want all PTH records, even if there are no
# relationships indicated.
relationships_only = True

# Loose URL matcher: optional http(s) scheme and "www.", host-like text, a dot,
# a 2-6 lowercase-letter suffix, then any trailing path/query characters.
url_pattern = re.compile(r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)")
# Captures the digits that follow "occid=" into the named group "occid".
occid_pattern = re.compile(r'occid=(?P<occid>\d+)')
# RSA catalogNumber pattern: the literal "RSA" followed by one or more digits.
# TODO adapt RSA regex to work with wood and other catalog number formats
rsa_catnum_pattern = re.compile(r'RSA\d+')

In [13]:
# Create an OAI-PMH client for the Portal to Texas History endpoint and request
# Dublin Core (oai_dc) records restricted to the SJCC collection set.
# `records` is consumed by iterating over it in the harvest loop.
sickle = Sickle('https://texashistory.unt.edu/oai')
records = sickle.ListRecords(metadataPrefix='oai_dc', set='collection:SJCC')

In [14]:
# Walk every harvested record, pull out the fields needed for the Symbiota
# upload, and collect one dict per exported record in `data`.
#
# Reads:  records, relationships_only, url_pattern, occid_pattern,
#         rsa_catnum_pattern (all defined in earlier cells).
# Writes: data (list of row dicts), rec_count, relation_count.
rec_count = 0
relation_count = 0

data = []
for record in records:
    rec_count += 1

    # Reset every relation-derived field for each record. Without this, a
    # record with no relation (or a relation with no URL / catalog number)
    # would either raise NameError or silently inherit the values parsed from
    # the previous record.
    catalogNumber = None
    occid = None
    relation_url = None
    relation_type = None

    # The ARK is the part of the OAI identifier after the first colon.
    rec_identifier = record.header.identifier
    ark = rec_identifier.split(':')[1]
    # PTH URL
    resourceUrl = 'https://texashistory.unt.edu/' + ark

    # Metadata values are treated as lists of strings; a missing field is None.
    title = record.metadata.get('title')
    date = record.metadata.get('date')
    # Fall back to an empty list so records without identifiers do not crash
    # the identifier loop below.
    identifiers = record.metadata.get('identifier') or []
    relation = record.metadata.get('relation')

    if relation:
        relation_count += 1
        # Only the first relation statement of a record is parsed.
        relation_string = relation[0]

        # extract RSA catalog number
        rsa_catnum_match = rsa_catnum_pattern.search(relation_string)
        if rsa_catnum_match:
            catalogNumber = rsa_catnum_match[0]

        # extract relation URL and, when present in it, the OCCID
        url_match = url_pattern.search(relation_string)
        if url_match:
            relation_url = url_match[0]
            occid_match = occid_pattern.search(relation_url)
            if occid_match:
                occid = occid_match.group('occid')

        # Classify the relation from the wording of the statement.
        if 'same individual' in relation_string:
            relation_type = 'specimen'
        elif 'population' in relation_string:
            relation_type = 'population'
        else:
            relation_type = 'undefined'

    # The local control number is stored as a "local-cont-no: <value>"
    # identifier; if several match, the last one wins.
    brit_id = None
    for identifier_value in identifiers:
        if 'local-cont-no' in identifier_value:
            brit_id_kv = identifier_value.split(':')
            brit_id = brit_id_kv[1].strip()

    # TODO - add accordingTo when that value is determined
    # occurrenceID isn't available in the URL, only OCCID.
    # May need to extract catalog number from relation string
    # Export the record when it has a relation, or unconditionally when the
    # relationships-only filter is switched off.
    if relation or not relationships_only:
        data.append({
            'catalogNumber': catalogNumber,
            'occid': occid,
            'objectID': brit_id,
            'resourceUrl': resourceUrl,
            'ark': ark,
            'title': title,
            'date': date,
            'relation': relation,
            'relation_url': relation_url,
            'relation_type': relation_type,
        })

print('rec_count', rec_count)
print('relation_count', relation_count)


rec_count 2963
relation_count 103


In [16]:
import os

# The output name reflects whether the export was limited to records that
# carry a relation.
filename = 'sjcc_relations.csv' if relationships_only else 'sjcc_all.csv'

# Turn the collected row dicts into a table and write it without the index column.
df = pd.DataFrame(data)
df.to_csv(filename, index=False)
print('Results saved to:', filename)

# A set COLAB_RELEASE_TAG environment variable is used as the signal that the
# notebook is running in Google Colab.
if os.getenv("COLAB_RELEASE_TAG"):
    location_note = 'File output is stored in the Colab filesystem accessible in the File pane to the left'
else:
    location_note = 'File output stored in the same directory as this notebook'
print(location_note)

Results saved to: sjcc_relations.csv
File output stored in the same directory as this notebook
