# Process LCSH
Code to extract Library of Congress Subject Headings (LCSH) from the [Library of Congress](https://id.loc.gov/authorities/subjects.html) bulk download of MADS/RDF JSONLD files. Data was downloaded on May 22nd, 2024. 

The file *'subjects.madsrdf.jsonld'* must be saved in this folder before running the code.

### Imports

In [1]:
import json
import pickle as pk

### Extract LCSH

In [22]:
# Slice offsets used to cut identifiers out of the JSON-LD records.
_LINE_ID_OFFSET = 22  # presumably len('/authorities/subjects/') -- confirm against the data
_URI_ID_OFFSET = 39   # presumably len('http://id.loc.gov/authorities/subjects/') -- confirm
_TYPE_NS_OFFSET = 8   # len('madsrdf:')


def _node_ids(nodes, offset=0):
    """Return the '@id' (sliced from *offset*) of one node or a list of nodes.

    The source data stores a single linked node as a dict and several linked
    nodes as a list of dicts; both shapes are normalised to a list of ids.
    """
    if not isinstance(nodes, list):
        nodes = [nodes]
    return [node['@id'][offset:] for node in nodes]


def _parse_entry(entry):
    """Extract the heading fields from one parsed JSON-LD line.

    Returns a ``(term_id, fields)`` tuple. ``fields`` is ``None`` when no
    heading could be found for the entry.
    """
    term_id = entry['@id'][_LINE_ID_OFFSET:]
    heading = lcc = year_new = year_rev = year_dep = kind = None
    deletion_note = bt = nt = syns = former_head = lang = None
    alt_terms = {}

    for record in entry['@graph']:
        types = record['@type']
        if record['@id'][_URI_ID_OFFSET:] == term_id:
            if 'madsrdf:Authority' in types:
                # Current subject heading
                label = record['madsrdf:authoritativeLabel']
                heading = label['@value']
                lang = label['@language']
                kind = [t[_TYPE_NS_OFFSET:] for t in types
                        if t[_TYPE_NS_OFFSET:] != 'Authority'][0]
                if 'madsrdf:hasBroaderAuthority' in record:
                    # Broader terms
                    bt = _node_ids(record['madsrdf:hasBroaderAuthority'],
                                   _URI_ID_OFFSET)
                if 'madsrdf:hasNarrowerAuthority' in record:
                    # Narrower terms
                    nt = _node_ids(record['madsrdf:hasNarrowerAuthority'],
                                   _URI_ID_OFFSET)
                if 'madsrdf:hasVariant' in record:
                    # Synonyms of a term (ids resolved to labels after the loop)
                    syns = _node_ids(record['madsrdf:hasVariant'])
                if 'madsrdf:hasEarlierEstablishedForm' in record:
                    # Former headings (ids resolved to labels after the loop)
                    former_head = _node_ids(
                        record['madsrdf:hasEarlierEstablishedForm'])
            elif 'madsrdf:DeprecatedAuthority' in types:
                # Deprecated heading: marked with a leading underscore
                label = record['madsrdf:variantLabel']
                heading = '_' + label['@value']
                lang = label['@language']
                kind = [t[_TYPE_NS_OFFSET:] for t in types
                        if t[_TYPE_NS_OFFSET:] != 'DeprecatedAuthority'
                        and t[_TYPE_NS_OFFSET:] != 'Variant'][0]
                if 'madsrdf:deletionNote' in record:
                    # Reason for deletion
                    deletion_note = record['madsrdf:deletionNote']
            else:
                # This shouldn't happen: stop scanning this entry's graph
                break
        # Associated Library of Congress Classification, if any
        if 'lcc:ClassNumber' in types:
            lcc = record['madsrdf:code']
        # Date information, keyed by record status
        if 'ri:RecordInfo' in types:
            status = record['ri:recordStatus']
            if status == 'new':
                year_new = record['ri:recordChangeDate']['@value']
            elif status == 'revised':
                year_rev = record['ri:recordChangeDate']['@value']
            elif status == 'deprecated':
                year_dep = record['ri:recordChangeDate']['@value']
        # Collect potential variants of a term (blank nodes)
        if '_:n' in record['@id'] and 'madsrdf:Variant' in types:
            alt_terms[record['@id']] = record['madsrdf:variantLabel']['@value']

    # Replace variant ids with their labels. Ids that are not variant blank
    # nodes in this graph are kept as-is instead of raising KeyError.
    if former_head is not None:
        former_head = [alt_terms.get(i, i) for i in former_head]
    if syns is not None:
        syns = [alt_terms.get(i, i) for i in syns]

    if heading is None:  # Should never be None, but just in case
        return term_id, None
    return term_id, {'heading': heading,
                     'lang': lang,
                     'formerHead': former_head,
                     'lcc': lcc,
                     'type': kind,
                     'yearNew': year_new,
                     'yearRev': year_rev,
                     'yearDep': year_dep,
                     'bt': bt,
                     'nt': nt,
                     'synonyms': syns,
                     'deleteNote': deletion_note}


def extractLCSH(file):
    """Extract subject headings from a line-delimited MADS/RDF JSON-LD file.

    Each non-blank line of *file* is parsed as one JSON document with an
    '@id' and an '@graph' list. Progress is printed every 50,000 records and
    a total is printed at the end.

    Args:
        file: Path of the JSON-LD file to read (decoded as UTF-8).

    Returns:
        A dict mapping each term id to a dict with the keys 'heading',
        'lang', 'formerHead', 'lcc', 'type', 'yearNew', 'yearRev',
        'yearDep', 'bt', 'nt', 'synonyms' and 'deleteNote'. Fields that are
        absent from the record are ``None``. Deprecated headings are
        prefixed with an underscore.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks a field this parser requires.
    """
    lcsh = {}
    processed = 0
    # The context manager closes the file; explicit UTF-8 keeps non-ASCII
    # headings intact regardless of the platform's default encoding.
    with open(file, encoding='utf-8') as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue  # tolerate blank lines / trailing newline padding
            if processed % 50000 == 0:
                print(f'{processed} records processed')
            processed += 1
            term_id, fields = _parse_entry(json.loads(raw_line))
            if fields is not None:
                lcsh[term_id] = fields
    # A dedicated counter (not the loop index) keeps this valid for empty files.
    print(f'----------------\n{processed} records processed!')
    return lcsh
    

In [23]:
# Parse the bulk download in this folder into a dict keyed by term id.
lcsh = extractLCSH("subjects.madsrdf.jsonld")

0 records processed
50000 records processed
100000 records processed
150000 records processed
200000 records processed
250000 records processed
300000 records processed
350000 records processed
400000 records processed
450000 records processed
500000 records processed
----------------
515970 records processed!


### Example Data

In [19]:
# Display the extracted record for term id 'sh00000203'.
lcsh['sh00000203']

{'heading': 'Arakelov theory',
 'lang': 'en',
 'formerHead': None,
 'lcc': 'QA242.6',
 'type': 'Topic',
 'yearNew': '2000-01-10T00:00:00',
 'yearRev': '2000-04-03T10:42:54',
 'yearDep': None,
 'bt': ['sh87002041'],
 'nt': None,
 'synonyms': ['Arakelov geometry'],
 'deleteNote': None}

In [20]:
# Display the extracted record for term id 'sh87002041'.
lcsh['sh87002041']

{'heading': 'Arithmetical algebraic geometry',
 'lang': 'en',
 'formerHead': None,
 'lcc': 'QA242.5-QA242.6',
 'type': 'Topic',
 'yearNew': '1987-04-02T00:00:00',
 'yearRev': '2000-02-28T14:10:20',
 'yearDep': None,
 'bt': ['sh85093222'],
 'nt': ['sh00000203', 'sh2001008362', 'sh93007485', 'sh94001868'],
 'synonyms': ['Geometry, Diophantine',
  'Geometry, Arithmetical algebraic',
  'Diophantine geometry',
  'Arithmetic algebraic geometry',
  'Algebraic geometry, Arithmetical'],
 'deleteNote': None}

### Save Data

In [24]:
# Serialise the extracted headings to 'lcsh.pickle' in this folder, using the
# newest pickle protocol this interpreter supports.
with open('lcsh.pickle', 'wb') as pickle_file:
    pk.dump(
        lcsh,
        pickle_file,
        protocol=pk.HIGHEST_PROTOCOL,
    )