In [2]:
# Given a document, we use spaCy NER to extract mentions of entities.
# input: plain text
# output: mention dictionary, mapping each page to a list of (mention, start, end, label) tuples
# CAVEAT: for the purpose of entity linking this is not the best option, as it only considers Named Entities and
# relies on the performance of the tool used (in this case spaCy NER). Nevertheless, it provides a solution for
# quick-and-dirty experiments.

import spacy, de_core_news_sm, os, time, json
# Project root directory. Defaults to the original location, but can be overridden
# through the CM_ROOT environment variable so the notebook also runs on other machines.
# os.path.join(..., "") guarantees a trailing separator, which the rest of the
# notebook relies on because it builds paths by string concatenation (root + '...').
root = os.path.join(os.environ.get("CM_ROOT", "/home/rovera/cm/"), "")
# Directory holding the input data (contains the 'pages/' folder read further down).
root_data = root+'data/'

In [3]:
# Load the German model package (de_core_news_sm) once, in its own cell, so the
# resulting `nlp` object can be reused for every page without reloading.
nlp = de_core_news_sm.load() # load model for german

In [9]:
# Run NER over every page of every journal and collect the detected mentions.
# structure: {journal: {page name without extension: [(text, start_char, end_char, label), ...]}}
mentions = {}
processed = 0
start = time.time()

pages_dir = os.path.join(root_data, 'pages')

# os.listdir returns entries in arbitrary order; sorting makes both the progress
# log and the key order of the JSON output reproducible across runs.
for journal in sorted(os.listdir(pages_dir)):
    journal_dir = os.path.join(pages_dir, journal)
    if not os.path.isdir(journal_dir):
        # Stray files next to the journal folders would make the inner listdir fail.
        continue

    print('Processing', journal)
    mentions[journal] = {}

    for page in sorted(os.listdir(journal_dir)):
        page_path = os.path.join(journal_dir, page)
        if not os.path.isfile(page_path):
            # Sub-directories cannot be read as a page; skip them.
            continue

        # Key is the file name up to its first dot.
        page_as_key = page.split('.')[0]

        # Use a context manager so the handle is closed right away instead of
        # leaking one open file per page.
        with open(page_path, "r", encoding="utf-8") as infile:
            text = infile.read()

        doc = nlp(text)

        # One (surface text, start offset, end offset, entity label) tuple per entity.
        mentions[journal][page_as_key] = [
            (ent.text, ent.start_char, ent.end_char, ent.label_)
            for ent in doc.ents
        ]

        processed += 1

out_path = root+'output/mention_detection/detected_through_NER.json'
# Create the output folder if it does not exist yet, so the results of a long
# run are not lost to a missing directory.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, "w", encoding="utf-8") as outfile:
    json.dump(mentions, outfile,  ensure_ascii=False, indent=2)

end = time.time()
elapsed = end - start
# Guard against a zero elapsed time (e.g. nothing to process on a coarse clock).
files_per_second = round(processed/elapsed) if elapsed > 0 else 0
print("Processed", processed, "files in ", round(elapsed, 1), "seconds, that is, approx",
      files_per_second, "files per second.")

Processing 9620162
Processing 3139318
Processing 2566634
Processed 1001 files in  120.0 seconds, that is, approx 8 files per second.
