In [35]:
# Open rism data file and only save important info

# To use by yourself:
# 1. Download the RISM data (all MarcXML) from https://opac.rism.info/index.php?id=8&L=1 
# 2. Save this file and update the RISM_DIR variable to point to its directory
# 3. Update the RISM_FILE variable with its filename

import os
import re
import json
from collections import defaultdict

CUR_DIR = os.getcwd()
RISM_DIR = os.path.join(CUR_DIR, '../hamr_2018/rismAllMARCXML')
RISM_FILE = 'rism_170316.xml'
RISM_INSTRONLY = 'rism_instronly.xml'
RISM_JSON = 'rism_instronly.json'

In [22]:
# First run a little test before processing the entire file

with open(os.path.join(RISM_DIR, RISM_FILE)) as myfile:
    head = [next(myfile) for x in range(500)]

    in_240 = False
    in_594 = False
    for line in head:       
        # print record delimiters
        if 'record' in line:
            print(line.strip())

        # print identifier. You can generate a link from this: https://opac.rism.info/id/rismid/000051649
        if 'tag="001"' in line:
            print(line.strip())

        # print instrumentation if in instrumentation tag of uniform_title (240)  
        if 'code="m"' in line and in_240:
            print(line.strip())

        # print instrumentation if in instrumentation tag of instrumentation detail (594, RISM-specific?)  
        if 'code="a"' in line and in_594:
            print(line.strip())    

        # set instrumentation tag flag when encountered    
        if 'tag="240"' in line:
            in_240 = True

        # unset instrumentation tag when tag active and field closed
        if '</datafield>' in line and in_240:
            in_240 = False

        # set instrumentation tag flag when encountered    
        if 'tag="594"' in line:
            in_594 = True

        # unset instrumentation tag when tag active and field closed
        if '</datafield>' in line and in_594:
            in_594 = False    

<record>
<controlfield tag="001">000051650</controlfield>
<subfield code="m">S, strings, cor (2)</subfield>
<subfield code="a">S, vl (2), b, cor (2)</subfield>
</record><record>
<controlfield tag="001">000051651</controlfield>
<subfield code="m">S, orch</subfield>
<subfield code="a">S, vl (2), vla, b, ob, fag, cor</subfield>
</record><record>
<controlfield tag="001">000051652</controlfield>
<subfield code="m">S, orch</subfield>
<subfield code="a">S, vl (2), vla solo, vla, b, ob (2), cl (2), cor (2)</subfield>
</record><record>
<controlfield tag="001">000051653</controlfield>
<subfield code="m">V (2), orch</subfield>
<subfield code="a">S, B, vl (2), vla, b, fag, cor (2), tr (2)</subfield>
</record><record>
<controlfield tag="001">000051654</controlfield>
<subfield code="m">S, strings</subfield>


In [23]:
# now process the entire RISM file. Might take a while!

with open(os.path.join(RISM_DIR, RISM_INSTRONLY), 'w') as outfile:
    with open(os.path.join(RISM_DIR, RISM_FILE)) as myfile:
        # head = [next(myfile) for x in range(500)]
        # to process part of file, uncomment line above and replace 'myfile' with 'head' below
        in_240 = False
        in_594 = False
        for line in myfile: #head:      
            # print record delimiters
            if 'record' in line:
                outfile.write(line)

            # print identifier. You can generate a link from this: https://opac.rism.info/id/rismid/000051649
            if 'tag="001"' in line:
                outfile.write(line)

            # print instrumentation if in instrumentation tag of uniform_title (240)  
            if 'code="m"' in line and in_240:
                outfile.write(line)

            # print instrumentation if in instrumentation tag of instrumentation detail (594, RISM-specific?)  
            if 'code="a"' in line and in_594:
                outfile.write(line)    

            # set instrumentation tag flag when encountered    
            if 'tag="240"' in line:
                in_240 = True

            # unset instrumentation tag when tag active and field closed
            if '</datafield>' in line and in_240:
                in_240 = False

            # set instrumentation tag flag when encountered    
            if 'tag="594"' in line:
                in_594 = True

            # unset instrumentation tag when tag active and field closed
            if '</datafield>' in line and in_594:
                in_594 = False   
            
# check that the outfile was written as it should
with open(os.path.join(RISM_DIR, RISM_INSTRONLY)) as myfile:
    head = [next(myfile) for x in range(10)]
    
    for line in head:
        print(line.strip())

<record>
<controlfield tag="001">000051650</controlfield>
<subfield code="m">S, strings, cor (2)</subfield>
<subfield code="a">S, vl (2), b, cor (2)</subfield>
</record><record>
<controlfield tag="001">000051651</controlfield>
<subfield code="m">S, orch</subfield>
<subfield code="a">S, vl (2), vla, b, ob, fag, cor</subfield>
</record><record>
<controlfield tag="001">000051652</controlfield>


In [31]:
# function to clean up the vocabulary
# parameter withnumber: to keep the count of performers in the instrument name or not
# parameter maxlength: maximum length of an instrument string

def cleanvocab(mystring, withnumber=True, maxlength=20):
    
    if not withnumber:
        # remove everything after a '('
        mystring = mystring.split('(', 1)[0]
        
        # remove the word 'solo' --> TODO later: replace by (1) ?
        mystring = re.sub('solo', '', mystring)
        
        # remove any numbers in the string
        mystring = re.sub('\d+', '', mystring)
    
        # remove any invalid characters
        mystring = re.sub('[)\?\'":\[\]\*\+]', '', mystring)
    
    else:
        # replace the word 'solo' by (1) ?
        mystring = re.sub('solo', '(1)', mystring)
        
        # remove any numbers in the beginning of the string
        # TODO deal with this later
        # mystring = re.sub('\d+', '', mystring)
    
        # remove any invalid characters
        mystring = re.sub('[\?\'":\[\]\*\+]', '', mystring)
    
    # remove the word 'ad lib.'
    mystring = re.sub('ad lib.', '', mystring)
    
    # remove the word 'ad lib'
    mystring = re.sub('ad lib', '', mystring)
 
    # remove additional whitespace between words
    mystring = re.sub('\s\s', ' ', mystring)
    
    # if the string starts with an invalid character, just delete it
    if len(mystring) > 0:
        if mystring[0] == '.' or mystring[0] == '-':
            mystring = ''
        
    if len(mystring) >= maxlength:
        mystring = ''

    return mystring.strip()
    
    
# TODO translation of shorthand notations to full name, or the other way round    

In [32]:
# get some stats out of this data. Create vocabularies and dictionaries.

# whether we want to include numbers (of performers) in the instrument name
INCLUDE_PERF = True
# max nr of characters for an instrument name. Advice: add 4 more if INCLUDE_PERF=True
MAX_INSTRLEN = 24 
# how many times does an instrument need to occur before we include it?
MIN_OCCURRENCE = 5 

with open(os.path.join(RISM_DIR, RISM_INSTRONLY)) as myfile:
    #head = [next(myfile) for x in range(100)]
    # to process part of file, uncomment line above and replace 'myfile' with 'head' below
    
    totalfilecount = 0
    currentid = ''
    instr_summ = ''
    instr_full = ''
    
    vocab_summ = set()
    vocab_full = set()
    
    rindex_summ = defaultdict(list)
    rindex_full = defaultdict(list)
    
    count_summ = defaultdict(int)
    count_full = defaultdict(int)
    
    for line in myfile: #head:
        # look up the ID
        found_001 = re.match('.+>(\d+)</controlfield>', line)
        if found_001:
            currentid = found_001[1]
            totalfilecount += 1
            continue     # skip over rest of loop
        
        # look up instrumentation summary
        found_240 = re.match('.+code="m">(.+)</.*', line)
        if found_240:
            instr_summ = found_240[1]
            #print(instr_summ)
            
            # split up the instrumentation summary and put it in a set --> create vocab
            instruments = [cleanvocab(x.strip(), INCLUDE_PERF, MAX_INSTRLEN) for x in instr_summ.split(',')]      # CleanVocab happens here
            vocab_summ.update(instruments)
            
            # create reverse index and count occurrences
            for i in instruments:
                rindex_summ[i].append(currentid)
                count_summ[i] += 1
                
            # skip over rest of loop
            continue    
            
        # look up full instrumentation    
        found_594 = re.match('.+code="a">(.+)</.*', line)
        if found_594:
            instr_full = found_594[1]
            #print(instr_full)
            
            # split up the instrumentation summary and put it in a set --> create vocab
            instruments = [cleanvocab(x.strip(), INCLUDE_PERF, MAX_INSTRLEN) for x in instr_full.split(',')]      # CleanVocab happens here
            vocab_full.update(instruments)
            
            # create reverse index and count occurrences
            for i in instruments:
                rindex_full[i].append(currentid)
                count_full[i] += 1
                
            # skip over rest of loop
            continue 
    
    # delete instrumentations that only occur less than x times
    summ_todel = [ k for k,v in count_summ.items() if v < MIN_OCCURRENCE ]
    for item in summ_todel:
        vocab_summ.remove(item)
        del rindex_summ[item]
        del count_summ[item]
        
    full_todel = [ k for k,v in count_full.items() if v < MIN_OCCURRENCE ]
    for item in full_todel:
        vocab_full.remove(item)
        del rindex_full[item]
        del count_full[item]    
    
    
    print(sorted(vocab_summ), '\n\n', sorted(vocab_full), '\n\n')
    # print(rindex_full)

    print(sorted(count_summ), '\n\n', sorted(count_full), '\n\n')
    
    # turns out there's no such thing as a trumpet in F# ?
    
    print("Dataset contains ", totalfilecount, " scores with ", len(vocab_summ), " summary instruments and ", len(vocab_full), " detailed instruments occurring at least ", MIN_OCCURRENCE, " times.")
     

['', '(pf)', '0', '0 - Hfe. - Str.', '0 - Pk.', '0 - Pk. - Str.', '0 - klTr. - Harm.', '1', '1 - 4 Pk.', '1 - Pk.', '1 - Pk. - Hfe. - Str.', '1 - Pk. - Str.', '1 Fg.', '1 Ob.', '1 Paar Sch. - Hfe.', '10', '10 V', '11 V', '11 i', '12', '12 V', '12 i', '13 V', '14', '14 V', '15 V', '16', '16 V', '16 Vl. II', '2', '2 (2. auch Bkl.)', '2 (2. auch EHr.)', '2 (2. auch Picc.)', '2 - 2', '2 - 3', '2 - 4', '2 A', '2 B', '2 Bhr.', '2 Cel. - Str. (16 Vl. I', '2 Coro', '2 Dessus', '2 Kb.)', '2 Kl.', '2 Paar Kast.', '2 S', '2 T', '2 T (2 S)', '2 Tamb.', '2 Treble', '2 Trp.', '2 V', '2 a-vla', '2 arp', '2 bc', '2 cemb', '2 cl', '2 cl (2 fl)', '2 cl (2 ob)', '2 cl.picc', '2 clav', '2 clno', '2 clno (2 cor)', '2 cnto', '2 cnto (2 vl)', '2 cor', '2 cor (2 clno)', '2 cor di bassetto', '2 dessus', '2 fag', '2 fag (2 fl)', '2 fag (2 vlc)', '2 fifre', '2 fl', '2 fl (2 musette)', '2 fl (2 ob)', '2 fl (2 vielle)', '2 fl (2 vl)', '2 fl dolce', '2 fl.picc', '2 flageolet', '2 guit', '2 i', '2 lituus', '2 lituus

In [27]:
# up next: write the data in a log-file format for logstash file plugin?
# or differently, in some sort of json format?

# --> using python elasticsearch bindings, save directly in elasticsearch. 
# Then see if there's no easy way to create a form on top of that.

In [None]:
# convert all records to JSON

def makejson(field001, field240, field594):
    
    if not field001:
        return
    if not field240 and not field594:
        return
    
    #TODO check field240 and field594 are arrays?
    #TODO check what happens when data is empty
    myobj = {
        "id": field001
    }
    if field240:
        myobj['instr_summ'] = field240
    if field594:
        myobj['instr_full'] = field594
        
    return myobj

In [42]:
# Convert to JSON and dump to JSON file
# This may take a minute or 2!

jsonarray = []

with open(os.path.join(RISM_DIR, RISM_INSTRONLY)) as myfile:
    # head = [next(myfile) for x in range(100)]
    # to process part of file, uncomment line above and replace 'myfile' with 'head' below

    currentid = ''
    instr_summ = ''
    instr_summ_arr = []
    instr_full = ''
    instr_full_arr = []

    for line in myfile:

        # look up the ID
        found_001 = re.match('.+>(\d+)</controlfield>', line)
        if found_001:
            # write the currently stored data to file, if it exists
            myjsonobject = makejson(currentid, instr_summ_arr, instr_full_arr)
            if myjsonobject:
                jsonarray.append( myjsonobject )
            # start next record
            currentid = found_001[1]
            instr_summ = ''       # reset instrumentation fields
            instr_summ_arr = []
            instr_full = ''
            instr_full_arr = []
            continue     # skip over rest of loop

        # look up instrumentation summary
        found_240 = re.match('.+code="m">(.+)</.*', line)
        if found_240:
            instr_summ = found_240[1]

            # split up the instrumentation summary 
            instr_summ_arr = [cleanvocab(x.strip(), INCLUDE_PERF, MAX_INSTRLEN) for x in instr_summ.split(',')]      # CleanVocab happens here

            # skip over rest of loop
            continue    

        # look up full instrumentation    
        found_594 = re.match('.+code="a">(.+)</.*', line)
        if found_594:
            instr_full = found_594[1]
            #print(instr_full)

            # split up the instrumentation summary and put it in a set --> create vocab
            instr_full_arr = [cleanvocab(x.strip(), INCLUDE_PERF, MAX_INSTRLEN) for x in instr_full.split(',')]      # CleanVocab happens here

            # skip over rest of loop
            continue 

    # add the very last record
    myjsonobject = makejson(currentid, instr_summ_arr, instr_full_arr)
    if myjsonobject:
        jsonarray.append( myjsonobject )

        
# write the array of JSON objects to file 
with open(os.path.join(RISM_DIR, RISM_JSON), 'w') as outfile:
    json.dump(jsonarray, outfile)