In [1]:
import os, json, pickle, pprint, csv

In [2]:
# Load the tab-separated page metadata into a dict keyed by the first column
# (the page id); the remaining columns are kept as a list.
# newline='' lets the csv module handle line endings itself, as its docs recommend.
with open('/data/cm/data/metadata/CM_Seiten_Metadaten.csv', 'r', newline='') as infile:
    # csv.reader yields [] for blank lines; skip them because they carry no key.
    metadata = {row[0]: row[1:] for row in csv.reader(infile, delimiter="\t") if row}

In [3]:
# Sanity check: show the metadata row stored for one known page id.
sample_page_id = '2710121'
print(metadata[sample_page_id])

['|2710055||2710056||2710057||2710121|', '2710055', '2710057', 'Metadaten', 'journal issue', 'Der neue Anfang', '8 (1835)', 'Heft 1 (1.1.1919)', '', '1919-01-01', '17', '', '(null)']


In [4]:
# Load the per-journal metadata (a dict keyed by journal id).
# A context manager is used so the file handle is closed right after parsing.
with open("/data/cm/data/journal_metadata/journal_metadata_title_lang.json", 'r') as infile:
    journal_metadata = json.load(infile)

In [5]:
# Peek at a single (journal id, metadata) pair to see the structure.
for journal_id, journal_info in journal_metadata.items():
    print(journal_id, journal_info, sep='\n')
    break

10112841
{'title': 'Beilage in Wiedergutmachungsfragen', 'lang': 'Deutsch'}


In [6]:
# Map journal id -> short title: the text before the first ':' with
# trailing spaces removed.
id_to_journal = {}
for journal_id, journal_info in journal_metadata.items():
    short_title = journal_info['title'].partition(':')[0]
    id_to_journal[journal_id] = short_title.rstrip(' ')

# Reverse lookup (short title -> journal id). If several ids share the same
# short title, the one iterated last wins.
journal_to_id = {}
for journal_id, short_title in id_to_journal.items():
    journal_to_id[short_title] = journal_id

In [7]:
# Load the entity mentions (a sequence of dicts, one per mention).
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# A context manager is used so the file handle is closed right after loading.
with open("cm_tagme_resource_reference_data_05_03.pickle", 'rb') as infile:
    cm_entities = pickle.load(infile)

In [8]:
# Inspect the first mention record to see which fields are available.
first_mention = cm_entities[0]
print(first_mention)

{'resource': 'http://data.judaicalink.org/data/dbpedia/Franz_Mehring', 'ref': 'http://data.judaicalink.org/data/cm-tagme/1000000', 'spot': 'Franz Mehring', 'start': 1269, 'end': 1282, 'link_prob': 0.7148148417472839, 'rho': 0.4602566361427307, 'journal_id': '2710055', 'page_id': '2710055-2710056-2710057--019-2710121'}


In [9]:
# Group entity mentions by resource URI, then by (journal id, journal title).
# Resulting structure:
#   occ_by_journal[uri][(j_id, j_title)] = {'data': [records], 'first': 0, 'last': 0}
# 'first' and 'last' are only initialised to 0 here.
occ_by_journal = {}
for mention in cm_entities:
    uri = mention['resource']  # e.g. 'http://data.judaicalink.org/data/dbpedia/Franz_Mehring'
    journals = occ_by_journal.setdefault(uri, {})

    j_id = mention['journal_id']  # e.g. '2710055'
    j_title = id_to_journal[j_id]
    if j_title == '':
        # Journals without a usable title are not recorded.
        continue

    # The journal bucket is created even if the page later turns out to have
    # no metadata, so a journal may end up with an empty 'data' list.
    bucket = journals.setdefault((j_id, j_title), {'data': [], 'first': 0, 'last': 0})

    # e.g. '2710055-2710056-2710057--019-2710121' -> '2710121'
    page_id = mention['page_id'].split('--')[1].split('-')[-1]

    # Only a missing metadata row is a reason to skip the mention. Looking it
    # up with .get() keeps a missing field on the mention itself from being
    # silently mistaken for "no metadata".
    meta = metadata.get(page_id)
    if meta is None:
        continue

    date = meta[9]
    try:
        year = int(date.split('-')[0])
    except ValueError:
        year = -1  # sentinel: date missing or not parseable

    bucket['data'].append({
        'p_id': page_id,
        'spot': mention['spot'],
        'start': mention['start'],
        'end': mention['end'],
        'p_link': page_id,  # currently identical to the page id (no link prefix is prepended)
        'date': date,
        'year': year
    })
    

In [10]:
# Add first/last mention year for each journal.
# Mentions whose date could not be parsed carry the sentinel year -1; they are
# ignored here so the sentinel is never reported as the first mention year.
# Journals with no usable year keep 0 for both fields.
for uri, val in occ_by_journal.items():
    for journal, entries in val.items():
        years = [item['year'] for item in entries['data'] if item['year'] >= 0]
        entries['first'] = min(years, default=0)
        entries['last'] = max(years, default=0)

In [11]:
# Order each journal's mentions by year, ascending. The sort is stable, so
# mentions from the same year keep their original relative order.
for journals in occ_by_journal.values():
    for entry in journals.values():
        entry['data'] = sorted(entry['data'], key=lambda record: record['year'])

In [12]:
# Spot check: show the first journal entry recorded for one resource.
sample_uri = 'http://data.judaicalink.org/data/dbpedia/Hans_Frank'
if sample_uri in occ_by_journal:
    for journal_key, journal_entry in occ_by_journal[sample_uri].items():
        print(journal_key)
        pprint.pprint(journal_entry)
        break

('2446951', 'Der Israelit')
{'data': [{'date': '1922-09-14',
           'end': 248,
           'p_id': '2527233',
           'p_link': '2527233',
           'spot': 'Hans Frank',
           'start': 238,
           'year': 1922},
          {'date': '1923-05-31',
           'end': 9739,
           'p_id': '2527837',
           'p_link': '2527837',
           'spot': 'Hans Frank',
           'start': 9729,
           'year': 1923},
          {'date': '1933-08-10',
           'end': 2613,
           'p_id': '2537580',
           'p_link': '2537580',
           'spot': 'Hans Frank',
           'start': 2603,
           'year': 1933},
          {'date': '1935-05-09',
           'end': 163,
           'p_id': '2539134',
           'p_link': '2539134',
           'spot': 'Hans Frank',
           'start': 153,
           'year': 1935}],
 'first': 1922,
 'last': 1935}


In [13]:
# Persist the grouped mentions. The output directory is created if it does not
# exist yet, and the context manager ensures the file is flushed and closed.
os.makedirs('./output', exist_ok=True)
with open('./output/occ_by_journal_detail.pickle', 'wb') as outfile:
    pickle.dump(occ_by_journal, outfile)