In [1]:
# Compute entity cooccurrence in Compact Memory
import os, json, pickle, pprint, math
import urllib.parse

In [2]:
# Load the entity mentions and the journal id <-> title lookup tables.
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
# Every file is opened in a `with` block so its handle is closed right after loading.
with open("/home/rovera/cm/scripts/generators/generator_all_entities/cm_tagme_resource_reference_data_06_04.pickle", 'rb') as entities_file:
    cm_entities = pickle.load(entities_file)
# alternative dataset:
#cm_entities = pickle.load(open("/home/rovera/cm/scripts/generators/generator_all_entities/cm_tagme_resource_reference_data_05_03.pickle", 'rb'))
with open('/home/rovera/cm/scripts/cooccurrence/input/id_to_journal.json', 'r', encoding="utf-8") as id_to_journal_file:
    id_to_journal = json.load(id_to_journal_file)
with open('/home/rovera/cm/scripts/cooccurrence/input/journal_to_id.json', 'r', encoding="utf-8") as journal_to_id_file:
    journal_to_id = json.load(journal_to_id_file)

In [3]:
# Unquote percent-encoded URLs (only the general dataset carries 'wiki' and 'dbpedia').
# Records that identify the entity via 'resource' may lack these fields, so absent
# fields are skipped rather than raising KeyError.
for mention in cm_entities:
    for url_field in ('wiki', 'dbpedia'):
        if url_field in mention:
            mention[url_field] = urllib.parse.unquote(mention[url_field])

In [4]:
# preview the first three mention records to inspect which fields they carry
pprint.pprint(cm_entities[0:3])

[{'dbpedia': 'http://dbpedia.org/resource/Gustav_Wyneken',
  'end': 1730,
  'journal_id': '2710055',
  'link_prob': 1,
  'page_id': '2710055-2710056-2710060--083-2710159',
  'rho': 0.5213567614555359,
  'spot': 'Gustav Wyneken',
  'start': 1716,
  'wiki': 'https://de.wikipedia.org/wiki/Gustav_Wyneken'},
 {'dbpedia': 'http://dbpedia.org/resource/Ständigkeit',
  'end': 77,
  'journal_id': '2710055',
  'link_prob': 1,
  'page_id': '2710055-2710056-2710058--048-2710100',
  'rho': 0.5,
  'spot': 'ständig',
  'start': 70,
  'wiki': 'https://de.wikipedia.org/wiki/Ständigkeit'},
 {'dbpedia': 'http://dbpedia.org/resource/Über_Gewißheit',
  'end': 3897,
  'journal_id': '2710055',
  'link_prob': 1,
  'page_id': '2710055-2710056-2710058--048-2710100',
  'rho': 0.5,
  'spot': 'Gewißheit',
  'start': 3888,
  'wiki': 'https://de.wikipedia.org/wiki/Über_Gewißheit'}]


In [6]:
# show the first mention record of the loaded dataset
print(cm_entities[0])

{'resource': 'http://data.judaicalink.org/data/dbpedia/Franz_Mehring', 'ref': 'http://data.judaicalink.org/data/cm-tagme/1000000', 'spot': 'Franz Mehring', 'start': 1269, 'end': 1282, 'link_prob': 0.7148148417472839, 'rho': 0.4602566361427307, 'journal_id': '2710055', 'page_id': '2710055-2710056-2710057--019-2710121'}


In [28]:
def get_uri_name_mapping(cm_mentions, general=False):
    """Build lookup tables between resource URIs and readable entity names.

    The name of a URI is its last '/'-separated segment with every underscore
    replaced by a space.

    Args:
        cm_mentions: iterable of mention dicts; each must contain the key
            'wiki' (when ``general`` is true) or 'resource' (otherwise).
        general: selects which field identifies the resource.

    Returns:
        Tuple ``(uri_to_name, name_to_uri)`` of dicts. When several URIs
        produce the same name, ``name_to_uri`` keeps the URI inserted last.

    Raises:
        KeyError: if a mention lacks the selected field.
    """
    # field in the mention records which identifies the resource
    field = 'wiki' if general else 'resource'
    # Iterate over the argument: the previous body read the notebook-level
    # `cm_entities` and silently ignored `cm_mentions`.
    uri_to_name = {}
    for mention in cm_mentions:
        uri = mention[field]
        uri_to_name[uri] = uri.split('/')[-1].replace('_', ' ')
    name_to_uri = {name: uri for uri, name in uri_to_name.items()}
    return uri_to_name, name_to_uri

def group_by_page(cm_entities, general=False):
    """Collect the distinct mentions found on every page.

    Args:
        cm_entities: iterable of mention dicts with the keys 'page_id',
            'start', 'end' and 'wiki' (``general`` true) or 'resource'.
        general: selects which field identifies the resource.

    Returns:
        dict mapping each page id to a set of ``(resource, start, end)``
        tuples, with start and end converted to int.
    """
    resource_key = 'wiki' if general else 'resource'
    pages = {}
    for record in cm_entities:
        # create the page's set on first sight, then register the mention
        page_mentions = pages.setdefault(record['page_id'], set())
        page_mentions.add((record[resource_key], int(record['start']), int(record['end'])))
    return pages


def group_by_journal(cm_entities, general=False):
    """Count, for every resource, its mentions in each journal.

    Args:
        cm_entities: iterable of mention dicts with the keys 'journal_id'
            and 'wiki' (``general`` true) or 'resource'.
        general: selects which field identifies the resource.

    Returns:
        Nested dict ``{resource: {journal_id: number_of_mentions}}``.
    """
    resource_key = 'wiki' if general else 'resource'
    counts = {}
    for record in cm_entities:
        per_journal = counts.setdefault(record[resource_key], {})
        journal = record['journal_id']
        per_journal[journal] = per_journal.get(journal, 0) + 1
    return counts

def get_occ_by_journal(resource_uri, occ_by_journal, id_to_journal, return_data=False):
    """Rank the journals of one resource by decreasing mention count.

    Args:
        resource_uri: key of the resource in ``occ_by_journal``.
        occ_by_journal: nested dict ``{resource: {journal_id: count}}``.
        id_to_journal: dict used to turn a journal id into the text printed.
        return_data: when equal to False (the default) the ranking is
            printed, one "<journal> <count>" line per journal, and nothing
            is returned; otherwise the ranking is returned.

    Returns:
        List of ``(journal_id, count)`` tuples, highest count first, or None
        when the ranking is printed.
    """
    ranking = list(occ_by_journal[resource_uri].items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    # kept as an equality test against False so non-bool arguments behave as before
    if not (return_data == False):
        return ranking
    for journal_id, count in ranking:
        print(id_to_journal[journal_id], count)

def get_dataset_info(ents_by_page, entity_count):
    """Print a one-line summary of the dataset.

    Args:
        ents_by_page: dict mapping a page id to the collection of mentions
            found on that page; the mention total is the sum of the
            collection sizes.
        entity_count: dict keyed by entity; only its size is used (number of
            unique entities).

    Returns:
        None. The summary is written to standard output.
    """
    pages = len(ents_by_page)
    unique_entities = len(entity_count)
    mention_count = sum(len(mentions) for mentions in ents_by_page.values())
    # An empty dataset has no pages: report 0 mentions/page instead of
    # failing with ZeroDivisionError.
    avg = mention_count / pages if pages else 0
    print("Found {} entity mentions in {} pages ({} mentions/page), {} unique entities.".format(mention_count, pages, avg, unique_entities))

def count_entity_occurrence(cm_entities, general=False):
    """Count how many mentions each resource has.

    Args:
        cm_entities: iterable of mention dicts containing the key 'wiki'
            (``general`` true) or 'resource' (otherwise).
        general: selects which field identifies the resource.

    Returns:
        dict mapping each resource to its number of mentions.
    """
    resource_key = 'wiki' if general else 'resource'
    totals = {}
    for record in cm_entities:
        resource = record[resource_key]
        totals[resource] = totals.get(resource, 0) + 1
    return totals
        

def get_cooccurrence(ents_by_page, offset=400):
    """Count how often two different resources are mentioned close together.

    For every page, each ordered pair of mentions that refer to different
    resources and whose start positions differ by less than ``offset`` adds
    one to the pair's count.

    Args:
        ents_by_page: dict mapping a page id to a collection of mention
            tuples whose first item is the resource and second item the
            start position.
        offset: exclusive upper bound on the distance between start
            positions.

    Returns:
        Nested dict ``{resource: {other_resource: count}}``. Every resource
        that is mentioned gets an entry, possibly an empty one.
    """
    counts = {}
    for mentions in ents_by_page.values():
        for mention in mentions:
            resource, start = mention[0], mention[1]
            neighbours = counts.setdefault(resource, {})
            for other in mentions:
                other_resource = other[0]
                # skip mentions of the same resource and mentions that are too far away
                if other_resource == resource or abs(other[1] - start) >= offset:
                    continue
                neighbours[other_resource] = neighbours.get(other_resource, 0) + 1
    return counts

def get_top_10_cooccurrents(resource_uri, cooc_dict):
    """Print the (at most) ten resources co-occurring most often with one resource.

    Args:
        resource_uri: resource to look up.
        cooc_dict: nested dict ``{resource: {other_resource: count}}``.

    Returns:
        None. Each result is printed as an ``(other_resource, count)`` tuple,
        highest count first; a notice is printed when the resource is unknown.
    """
    if resource_uri not in cooc_dict:
        print("The resource {} is not available in the dataset.".format(resource_uri))
        return
    ranking = sorted(cooc_dict[resource_uri].items(), key=lambda pair: pair[1], reverse=True)
    for pair in ranking[:10]:
        print(pair)
            

def get_pmi_correlation(entity_count, cooc):
    """Compute pointwise mutual information for every co-occurring pair.

    With N the sum of all values in ``entity_count``, the score of a pair is
    ``log((cooc/N) / ((count_a/N) * (count_b/N)))`` using the natural
    logarithm.

    Args:
        entity_count: dict mapping each resource to its mention count.
        cooc: nested dict ``{resource: {other_resource: cooccurrence_count}}``.

    Returns:
        Nested dict ``{resource: {other_resource: pmi}}`` with one (possibly
        empty) entry per key of ``entity_count``.
    """
    total = sum(entity_count.values())
    scores = {resource: {} for resource in entity_count}
    for resource, neighbours in cooc.items():
        for other, joint in neighbours.items():
            count_a = entity_count[resource]
            count_b = entity_count[other]
            p_joint = joint / total
            p_independent = (count_a / total) * (count_b / total)
            scores[resource][other] = math.log(p_joint / p_independent)
    return scores

def get_custom_correlation(entity_count, cooc):
    """Score every co-occurring pair with a custom correlation measure.

    The score is the co-occurrence count divided by the mean of the two
    resources' mention counts.

    Args:
        entity_count: dict mapping each resource to its mention count.
        cooc: nested dict ``{resource: {other_resource: cooccurrence_count}}``.

    Returns:
        Nested dict ``{resource: {other_resource: score}}`` with one
        (possibly empty) entry per key of ``entity_count``.
    """
    scores = {resource: {} for resource in entity_count}
    for resource, neighbours in cooc.items():
        for other, joint in neighbours.items():
            count_a = entity_count[resource]
            count_b = entity_count[other]
            mean_count = (count_a + count_b) / 2
            # basic solution: co-occurrences relative to the mean mention count
            # alternative that was tried ("cool solution"):
            #   ((joint/count_a)*(joint/count_b))/(joint/min(count_a, count_b))
            scores[resource][other] = joint / mean_count
    return scores

def get_top_10_custom_correlated(resource_uri, custom_corr):
    """Print the (at most) ten resources with the highest custom correlation.

    Args:
        resource_uri: resource to look up.
        custom_corr: nested dict ``{resource: {other_resource: score}}``.

    Returns:
        None. Each result is printed as an ``(other_resource, score)`` tuple,
        highest score first; a notice is printed when the resource is unknown.
    """
    # Same guard as get_top_10_cooccurrents: an unknown resource yields a
    # readable notice instead of a bare KeyError.
    if resource_uri not in custom_corr:
        print("The resource {} is not available in the dataset.".format(resource_uri))
        return
    ranking = sorted(custom_corr[resource_uri].items(), key=lambda pair: pair[1], reverse=True)
    for pair in ranking[:10]:
        print(pair)
    
def get_top_10_correlated(resource_uri, pmi_corr):
    """Print the (at most) ten resources with the highest PMI score.

    Args:
        resource_uri: resource to look up.
        pmi_corr: nested dict ``{resource: {other_resource: pmi}}``.

    Returns:
        None. Each result is printed as an ``(other_resource, pmi)`` tuple,
        highest score first; a notice is printed when the resource is unknown.
    """
    # Same guard as get_top_10_cooccurrents: querying a URI that is not a
    # key (e.g. a URI from the other dataset flavour) yields a readable
    # notice instead of a bare KeyError.
    if resource_uri not in pmi_corr:
        print("The resource {} is not available in the dataset.".format(resource_uri))
        return
    ranking = sorted(pmi_corr[resource_uri].items(), key=lambda pair: pair[1], reverse=True)
    for pair in ranking[:10]:
        print(pair)

In [4]:
# create mapping uri <--> entity name for each unique entity in the extracted dataset
# (general=False: entities are identified by their 'resource' field)
uri_to_name, name_to_uri = get_uri_name_mapping(cm_entities, False)

In [5]:
# number of distinct entity names in the mapping
len(name_to_uri)

6054

In [29]:
# count mentions per entity
entity_count = count_entity_occurrence(cm_entities)

# group entity mentions by page
ents_by_page = group_by_page(cm_entities)

# print dataset information
get_dataset_info(ents_by_page, entity_count)

# compute absolute cooccurrence; the offset of 300 characters passed here
# overrides the function's default of 400
cooc_dict_abs = get_cooccurrence(ents_by_page, offset=300)

Found 1528126 entity mentions in 465854 pages (3.280268066819218 mentions/page), 6054 unique entities.


In [32]:
# compute correlation between entities using pointwise mutual information
# (natural logarithm; only pairs with a non-zero cooccurrence get a score)
pmi_corr_dict = get_pmi_correlation(entity_count, cooc_dict_abs)

In [31]:
# compute custom correlation between entities using custom measure
custom_corr_dict = get_custom_correlation(entity_count, cooc_dict_abs)
# persist the result; the `with` block closes (and thereby flushes) the file
# as soon as the dump is complete
with open('/home/rovera/cm/scripts/cooccurrence/output/entity_correlation.pickle', 'wb') as out_file:
    pickle.dump(custom_corr_dict, out_file)

In [8]:
# query for absolute cooccurrence: prints up to 10 (entity, count) pairs, highest count first
# example: http://data.judaicalink.org/data/dbpedia/Stefan_Zweig
get_top_10_cooccurrents('http://data.judaicalink.org/data/dbpedia/Rosa_Luxemburg', cooc_dict_abs)

('http://data.judaicalink.org/data/dbpedia/Walther_Rathenau', 13)
('http://data.judaicalink.org/data/dbpedia/Gustav_Landauer', 12)
('http://data.judaicalink.org/data/dbpedia/Moses_Hess', 8)
('http://data.judaicalink.org/data/dbpedia/Eduard_Bernstein', 7)
('http://data.judaicalink.org/data/dbpedia/Ludwig_Bamberger', 5)
('http://data.judaicalink.org/data/dbpedia/Wladimir_Iljitsch_Lenin', 5)
('http://data.judaicalink.org/data/dbpedia/Gabriel_Riesser', 4)
('http://data.judaicalink.org/data/dbpedia/Hugo_Preuß', 4)
('http://data.judaicalink.org/data/dbpedia/Wilhelm_II._(Deutsches_Reich)', 4)
('http://data.judaicalink.org/data/dbpedia/Johann_Jacoby', 3)


In [35]:
# query for pmi correlation: prints up to 10 (entity, pmi) pairs, highest pmi first
# NOTE(review): this is a wiki-style URI; it is only a key of pmi_corr_dict when the
# counts were built with general=True (the cells above use the default, 'resource') - confirm
get_top_10_correlated('https://de.wikipedia.org/wiki/Gustav_Wyneken', pmi_corr_dict)

('https://de.wikipedia.org/wiki/Bernhard_Uffrecht', 11.58757935365817)
('https://de.wikipedia.org/wiki/Berthold_Otto', 10.894432173098224)
('https://de.wikipedia.org/wiki/Quers', 10.894432173098224)
('https://de.wikipedia.org/wiki/Georg_Stammler', 10.894432173098224)
('https://de.wikipedia.org/wiki/Arthur_Drey', 10.894432173098224)
('https://de.wikipedia.org/wiki/Landsgemeinde', 10.894432173098224)
('https://de.wikipedia.org/wiki/Georg_Bondi_Verlag', 10.48896706499006)
('https://de.wikipedia.org/wiki/Rudolf_Pannwitz', 10.48896706499006)
('https://de.wikipedia.org/wiki/Knud_Ahlborn', 10.48896706499006)
('https://de.wikipedia.org/wiki/Franz_Pfemfert', 10.334816385162801)


In [26]:
# query for custom correlation: prints up to 10 (entity, score) pairs, highest score first
get_top_10_custom_correlated('http://data.judaicalink.org/data/dbpedia/Hannah_Arendt', custom_corr_dict)

('http://data.judaicalink.org/data/dbpedia/Adolf_Leschnitzer', 0.03076923076923077)
('http://data.judaicalink.org/data/dbpedia/Gideon_Hausner', 0.02127659574468085)
('http://data.judaicalink.org/data/dbpedia/Salo_W._Baron', 0.015384615384615387)
('http://data.judaicalink.org/data/dbpedia/Martin_Heidegger', 0.015384615384615385)
('http://data.judaicalink.org/data/dbpedia/Walter_Benjamin', 0.015384615384615385)
('http://data.judaicalink.org/data/dbpedia/Michael_Löwy', 0.015384615384615385)
('http://data.judaicalink.org/data/dbpedia/Ze’ev_Levy', 0.015384615384615385)
('http://data.judaicalink.org/data/dbpedia/Raul_Hilberg', 0.015384615384615385)
('http://data.judaicalink.org/data/dbpedia/Manès_Sperber', 0.015384615384615385)
('http://data.judaicalink.org/data/dbpedia/Grigori_Isaakowitsch_Barenblatt', 0.015384615384615385)


In [16]:
# group entities by journal: {entity: {journal_id: number of mentions}}
ents_by_journal = group_by_journal(cm_entities, False)
# persist the result; the `with` block closes (and thereby flushes) the file
# as soon as the dump is complete
with open('/home/rovera/cm/scripts/cooccurrence/output/entities_by_journal.pickle', 'wb') as out_file:
    pickle.dump(ents_by_journal, out_file)

In [17]:
# query by journal: prints, for one entity, each journal with its mention count,
# highest count first
get_occ_by_journal('http://data.judaicalink.org/data/dbpedia/Stefan_Zweig', ents_by_journal, id_to_journal)

Jüdische Rundschau  214
Central-Verein-Zeitung  130
Jüdische Wochenschau 89
Jüdisch-liberale Zeitung / Vereinigung für das Liberale Judentum 78
Boletín informativo de la Sociedad Cultural Israelita B'ne Jisroel 71
Die Stimme  49
Bayerische israelitische Gemeindezeitung  44
Gemeindeblatt der Israelitischen Religionsgemeinde zu Leipzig  41
Wiener Morgenzeitung 38
Gemeindeblatt der Israelitischen Gemeinde Frankfurt am Main 29
Der Orden Bne Briss  26
Die Wahrheit  25
Der Morgen  22
Die Stimme  21
Das jüdische Echo  21
Der Israelit  20
Die neue Welt  19
Gemeinde-Zeitung für die israelitischen Gemeinden Württembergs 18
Der Jugendbund  16
Gemeindeblatt der Israelitischen Religionsgemeinde Dresden  16
Der junge Jude 14
Jeschurun  14
Jüdische allgemeine Zeitung 14
Israelitischer Jugendfreund  13
Die Welt  12
Breslauer jüdisches Gemeindeblatt  11
Jüdisches Volksblatt 11
Neue jüdische Presse  10
Jüdische Zeitung  9
Jüdische Schulzeitung  9
Das Echo  9
Menorah  8
Allgemeine Zeitung des Judenthums 

In [18]:
# For every entity, store its (journal_id, count) pairs sorted by decreasing count.
occurrence_by_journal = {}
for entity in entity_count:
    occurrence_by_journal[entity] = get_occ_by_journal(entity, ents_by_journal, id_to_journal, return_data=True)
# persist the result; the `with` block closes (and thereby flushes) the file
# as soon as the dump is complete
with open('/home/rovera/cm/scripts/cooccurrence/output/occurrence_by_journal.pickle', 'wb') as out_file:
    pickle.dump(occurrence_by_journal, out_file)