In [48]:
# open rism data file and only save important info

import os
import re
from collections import defaultdict

CUR_DIR = os.getcwd()
RISM_DIR = os.path.join(CUR_DIR, 'rismAllMARCXML')
RISM_FILE = 'rism_170316.xml'
RISM_INSTRONLY = 'rism_instronly.xml'

In [14]:
# First run a little test before processing the entire file

with open(os.path.join(RISM_DIR, RISM_FILE)) as myfile:
    head = [next(myfile) for x in range(500)]

    in_240 = False
    in_594 = False
    for line in head:       
        # print record delimiters
        if 'record' in line:
            print(line.strip())

        # print identifier. You can generate a link from this: https://opac.rism.info/id/rismid/000051649
        if 'tag="001"' in line:
            print(line.strip())

        # print instrumentation if in instrumentation tag of uniform_title (240)  
        if 'code="m"' in line and in_240:
            print(line.strip())

        # print instrumentation if in instrumentation tag of instrumentation detail (594, RISM-specific?)  
        if 'code="a"' in line and in_594:
            print(line.strip())    

        # set instrumentation tag flag when encountered    
        if 'tag="240"' in line:
            in_240 = True

        # unset instrumentation tag when tag active and field closed
        if '</datafield>' in line and in_240:
            in_240 = False

        # set instrumentation tag flag when encountered    
        if 'tag="594"' in line:
            in_594 = True

        # unset instrumentation tag when tag active and field closed
        if '</datafield>' in line and in_594:
            in_594 = False    

<record>
<controlfield tag="001">000051650</controlfield>
<subfield code="m">S, strings, cor (2)</subfield>
<subfield code="a">S, vl (2), b, cor (2)</subfield>
</record><record>
<controlfield tag="001">000051651</controlfield>
<subfield code="m">S, orch</subfield>
<subfield code="a">S, vl (2), vla, b, ob, fag, cor</subfield>
</record><record>
<controlfield tag="001">000051652</controlfield>
<subfield code="m">S, orch</subfield>
<subfield code="a">S, vl (2), vla solo, vla, b, ob (2), cl (2), cor (2)</subfield>
</record><record>
<controlfield tag="001">000051653</controlfield>
<subfield code="m">V (2), orch</subfield>
<subfield code="a">S, B, vl (2), vla, b, fag, cor (2), tr (2)</subfield>
</record><record>
<controlfield tag="001">000051654</controlfield>
<subfield code="m">S, strings</subfield>


In [24]:
# now process the entire RISM file. Might take a while!

with open(os.path.join(RISM_DIR, RISM_INSTRONLY), 'w') as outfile:
    with open(os.path.join(RISM_DIR, RISM_FILE)) as myfile:
        # head = [next(myfile) for x in range(500)]
        # to process part of file, uncomment line above and replace 'myfile' with 'head' below
        in_240 = False
        in_594 = False
        for line in myfile:       
            # print record delimiters
            if 'record' in line:
                outfile.write(line)

            # print identifier. You can generate a link from this: https://opac.rism.info/id/rismid/000051649
            if 'tag="001"' in line:
                outfile.write(line)

            # print instrumentation if in instrumentation tag of uniform_title (240)  
            if 'code="m"' in line and in_240:
                outfile.write(line)

            # print instrumentation if in instrumentation tag of instrumentation detail (594, RISM-specific?)  
            if 'code="a"' in line and in_594:
                outfile.write(line)    

            # set instrumentation tag flag when encountered    
            if 'tag="240"' in line:
                in_240 = True

            # unset instrumentation tag when tag active and field closed
            if '</datafield>' in line and in_240:
                in_240 = False

            # set instrumentation tag flag when encountered    
            if 'tag="594"' in line:
                in_594 = True

            # unset instrumentation tag when tag active and field closed
            if '</datafield>' in line and in_594:
                in_594 = False   
            
# check that the outfile was written as it should
with open(os.path.join(RISM_DIR, RISM_INSTRONLY)) as myfile:
    head = [next(myfile) for x in range(10)]
    
    for line in head:
        print(line.strip())

<record>
<controlfield tag="001">000051650</controlfield>
<subfield code="m">S, strings, cor (2)</subfield>
<subfield code="a">S, vl (2), b, cor (2)</subfield>
</record><record>
<controlfield tag="001">000051651</controlfield>
<subfield code="m">S, orch</subfield>
<subfield code="a">S, vl (2), vla, b, ob, fag, cor</subfield>
</record><record>
<controlfield tag="001">000051652</controlfield>


In [97]:
# function to clean up the vocabulary
def cleanvocab(mystring):
    # remove everything after a ()
    mystring = mystring.split('(', 1)[0]
    
    # remove any numbers in the string
    mystring = re.sub('\d+', '', mystring)
    
    # remove any invalid characters
    mystring = re.sub('[)\?":\[\]]', '', mystring)

    # remove the word 'solo' --> TODO later: replace by (1) ?
    mystring = re.sub('solo', '', mystring)
    
    # remove the word 'ad lib.'
    mystring = re.sub('ad lib.', '', mystring)
    
    # remove the word 'ad lib'
    mystring = re.sub('ad lib', '', mystring)
 
    # remove additional whitespace between words
    mystring = re.sub('\s\s', ' ', mystring)
    
    # if the string starts with an invalid character, just delete it
    if len(mystring) > 0:
        if mystring[0] == '.' or mystring[0] == '-':
            mystring = ''
        
    if len(mystring) >= 20:
        mystring = ''

    return mystring.strip()
    

In [98]:
# get some stats out of this data. Create vocabularies and dictionaries.

with open(os.path.join(RISM_DIR, RISM_INSTRONLY)) as myfile:
    #head = [next(myfile) for x in range(100)]
    
    totalfilecount = 0
    currentid = ''
    instr_summ = ''
    instr_full = ''
    
    vocab_summ = set()
    vocab_full = set()
    
    rindex_summ = defaultdict(list)
    rindex_full = defaultdict(list)
    
    count_summ = defaultdict(int)
    count_full = defaultdict(int)
    
    for line in myfile: #head:
        # look up the ID
        found_001 = re.match('.+>(\d+)</controlfield>', line)
        if found_001:
            currentid = found_001[1]
            totalfilecount += 1
            continue     # skip over rest of loop
        
        # look up instrumentation summary
        found_240 = re.match('.+code="m">(.+)</.*', line)
        if found_240:
            instr_summ = found_240[1]
            #print(instr_summ)
            
            # split up the instrumentation summary and put it in a set --> create vocab
            instruments = [cleanvocab(x.strip()) for x in instr_summ.split(',')]      # CleanVocab happens here
            vocab_summ.update(instruments)
            
            # create reverse index and count occurrences
            for i in instruments:
                rindex_summ[i].append(currentid)
                count_summ[i] += 1
                
            # skip over rest of loop
            continue    
            
        # look up full instrumentation    
        found_594 = re.match('.+code="a">(.+)</.*', line)
        if found_594:
            instr_full = found_594[1]
            #print(instr_full)
            
            # split up the instrumentation summary and put it in a set --> create vocab
            instruments = [cleanvocab(x.strip()) for x in instr_full.split(',')]      # CleanVocab happens here
            vocab_full.update(instruments)
            
            # create reverse index and count occurrences
            for i in instruments:
                rindex_full[i].append(currentid)
                count_full[i] += 1
                
            # skip over rest of loop
            continue 
    
    # delete instrumentations that only occur less than x times
    MIN_OCCURRENCE = 5
    summ_todel = [ k for k,v in count_summ.items() if v < MIN_OCCURRENCE ]
    for item in summ_todel:
        vocab_summ.remove(item)
        del rindex_summ[item]
        del count_summ[item]
        
    full_todel = [ k for k,v in count_full.items() if v < MIN_OCCURRENCE ]
    for item in full_todel:
        vocab_full.remove(item)
        del rindex_full[item]
        del count_full[item]    
    
    
    print(sorted(vocab_summ), '\n\n', sorted(vocab_full), '\n\n')
    # print(rindex_full)

    print(sorted(count_summ), '\n\n', sorted(count_full), '\n\n')
    
    # turns out there's no such thing as a trumpet in F# ?
    
    print("Dataset contains ", totalfilecount, " scores with ", len(vocab_summ), " summary instruments and ", len(vocab_full), " detailed instruments occurring at least ", MIN_OCCURRENCE, " times.")
     

['', '-', '- Bühnenmusik Ob.', '- Hfe. - Str.', '- Pk.', '- Pk. - Str.', '- Str.', '- klTr. - Harm.', 'A', 'A echo', 'Alphr.', 'B', 'B -', 'B - Klv.', 'B / Chor', 'B echo', 'B-Bariton', 'B; Coro II S', 'Bar. - Orch.', 'Bariton', 'Basse-contre', 'Basse-taille', 'Baß-Pos.', 'Bck.', 'Bck. - Hfe.', 'Bck. - Hfe. - Str.', 'Bck. - Str.', 'Bhr.', 'Bkl.', 'Btrp.', 'Cel.', 'Cel. - Str.', 'Cemb.', 'Cemb. - Str.', 'Chartophon', 'Chor', 'Choro', 'Contra-A', 'Contra-T', 'Coro', 'Coro A', 'Coro B', 'Coro I S', 'Coro S', 'Coro SATB', 'Coro T', 'Coro di fanciulli', 'Coro doppio', 'Coro fanciulli', 'Coro feminile', 'Coro femminile', 'Coro maschile', 'Coro orch', 'Coro pf', 'Coro unisono', 'Coro virile', 'Dessus', 'Donnermaschine', 'Dudelsäcke', 'EHr.', 'Fg.', 'Gemeinde', 'Glasharmonika', 'Glsp.', 'Harfenuhr', 'Harm.', 'Haute-contre', 'Hautecontre', 'Hph.', 'Hr.', 'I S -', 'Infimus', 'Kast.', 'Kb.', 'Kfg.', 'Kfg. -', 'Kl.', 'Klv.', 'Klv. - Str.', 'Medius', 'Mezzo-S', 'Ob.', 'Orch', 'Orchester', 'Org.', '

In [99]:
# see https://elk-docker.readthedocs.io/ for ELK stack on docker
# then: sudo docker run -p 5601:5601 -p 9200:9200 -p 5044:5044 -it --name elk sebp/elk
# then: http://localhost:5601
# to login: on the host, first check name of container: sudo docker ps
# then: sudo docker exec -it elk /bin/bash      (elk is the name here)



In [None]:
# up next: write the data in a log-file format for logstash file plugin?
# or differently, in some sort of json format?

# --> using python elasticsearch bindings, save directly in elasticsearch. 
# Then see if there's no easy way to create a form on top of that.