# Sherwin Carlquist Collection - record retrieval

This script retrieves public records of the Sherwin Carlquist Collection (SJCC) on the Portal to Texas History. Various fields and values are extracted and saved to a CSV file.

In [1]:
import re

from sickle import Sickle
import xml.etree.ElementTree as ET
import pprint
import pandas as pd

# OAI-PMH client pointed at the Portal to Texas History endpoint.
sickle = Sickle('https://texashistory.unt.edu/oai')
# Alternative request without a set filter (not scoped to the SJCC collection);
# kept for reference only:
#records = sickle.ListRecords(metadataPrefix='oai_dc')

In [2]:
# Other metadata formats and the record counts observed for each.
# Initially some counts were different, but that changed after a few queries.
#   untl      - 2159
#   oai_dc    - 2157
#   untl_raw  - 2159
#   untl_dpla - 2159
#   rdf       - 2159

# Request the records of the 'collection:SJCC' set in the oai_dc metadata format.
records = sickle.ListRecords(metadataPrefix='oai_dc', set='collection:SJCC')

In [3]:
# Presumably intended to restrict the export to records that have a relation.
# NOTE(review): no visible cell reads this flag, so it currently has no effect —
# confirm whether filtering was intended before relying on it.
relationships_only = True

In [4]:
# Matches a URL embedded in free text; the scheme and "www." prefix are optional.
url_pattern = re.compile(r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)")


def _parse_relation(relation):
    """Return ``(relation_url, relation_type)`` for a record's relation values.

    Only the first relation value is inspected.

    Args:
        relation: List of relation strings from the record metadata, or
            None/empty when the record has no relation.

    Returns:
        ``(None, None)`` when there is no relation. Otherwise a tuple of the
        first URL found in the text (or None) and one of 'specimen',
        'population' or 'undefined', chosen by keyword.
    """
    if not relation:
        return None, None
    # An empty XML element can surface as None; treat it as empty text
    # instead of letting the regex search raise TypeError.
    relation_string = relation[0] or ''
    url_match = url_pattern.search(relation_string)
    relation_url = url_match[0] if url_match else None
    if 'same individual' in relation_string:
        relation_type = 'specimen'
    elif 'population' in relation_string:
        relation_type = 'population'
    else:
        relation_type = 'undefined'
    return relation_url, relation_type


def _extract_brit_id(identifiers):
    """Return the value of the 'local-cont-no' identifier, or None.

    Args:
        identifiers: List of identifier strings from the record metadata
            (entries look like ``'<label>: <value>'``), or None when the
            record has no identifier field.

    Returns:
        The stripped text after the first colon of the last identifier that
        contains 'local-cont-no'; None when no such identifier exists.
    """
    brit_id = None
    for ident in identifiers or []:
        if not ident or 'local-cont-no' not in ident:
            continue
        # partition keeps any further colons inside the value and does not
        # raise when the separator is missing.
        _, sep, value = ident.partition(':')
        if sep:
            brit_id = value.strip()
    return brit_id


rec_count = 0
relation_count = 0
data = []
for record in records:
    rec_count += 1
    rec_identifier = record.header.identifier
    # Assumes the OAI identifier has the form '<prefix>:<ark path>' — TODO confirm.
    ark = rec_identifier.split(':')[1]
    pth_url = 'https://texashistory.unt.edu/' + ark

    fields = record.metadata
    relation = fields.get('relation')
    if relation:
        relation_count += 1
    relation_url, relation_type = _parse_relation(relation)

    # Simple-detail row. Further fields (subject, description, type, format,
    # identifier) are available via fields.get(...) if a richer export is needed.
    data.append({
        'brit_id': _extract_brit_id(fields.get('identifier')),
        'pth_url': pth_url,
        'ark': ark,
        'title': fields.get('title'),
        'date': fields.get('date'),
        'relation': relation,
        'relation_url': relation_url,
        'relation_type': relation_type,
    })

# convert list to dataframe
df = pd.DataFrame(data)

print('rec_count', rec_count)
print('relation_count', relation_count)

rec_count 2738
relation_count 100


In [5]:
# Save the extracted rows to sjcc_all.csv (relative path), without the index column.
df.to_csv('sjcc_all.csv', index=False)