In [1]:
import os, json, pickle, pprint

In [78]:
class Cooccurrence():
    
    def __init__(self):
        self.raw_data = None
        self.ents_by_page = None
        self.abs_cooc = None
        self.pmi_cooc = None
    
    def get_stats(self):
        if self.ents_by_page == None:
            print("Please first group entities by page.")
        else:
            pages = len(self.ents_by_page)
            mentions_n = sum(len(v) for v in self.ents_by_page.values())
            avg = mentions_n/len(self.ents_by_page)
            print("Found {} mentions in {} pages ({} mentions/page).".format(mentions_n, pages, avg))
    
    def load_input_data(self, datapath):
        print("Loading input...", end=" ")
        self.raw_data = pickle.load(open(datapath, 'rb'))
        print("Done!")
    
    def group_by_page(self, verbose=True):
        if self.raw_data == None:
            print("Maybe you forgot to load the data?!")
        else:
            self.ents_by_page = {}
            for mention in self.raw_data:
                if mention['page_id'] not in self.ents_by_page:
                    self.ents_by_page[mention['page_id']] = set([])
                self.ents_by_page[mention['page_id']].add((mention['resource'], int(mention['start']), int(mention['end'])))
            if verbose:
                self.get_stats()
            
    def get_abs_cooccurrence(self, offset, verbose=True):
        if self.ents_by_page == None:
            print("Please first group entities by page.")
        else:
            self.abs_cooc = {}
            for pool in self.ents_by_page.values():
                for entity in pool:
                    if entity[0] not in self.abs_cooc:
                        self.abs_cooc[entity[0]] = {}
                    for cooccurrent in pool:
                        if cooccurrent[0] != entity[0] and abs(cooccurrent[1]-entity[1]) < offset:
                            if cooccurrent[0] not in self.abs_cooc[entity[0]]:
                                self.abs_cooc[entity[0]][cooccurrent[0]] = 0
                            self.abs_cooc[entity[0]][cooccurrent[0]] += 1
            if verbose:
                print("Computed absolute entity cooccurrence.")
                
    def get_top_10_cooccurrents(self, resource_uri):
        if resource_uri not in self.abs_cooc:
            print("This entity is not in the dataset :(")
        else:
            sorted_cooccurrents = sorted([(k,v) for k,v in self.abs_cooc[resource_uri].items()], key=lambda x:x[1], reverse=True)
            pprint.pprint(sorted_cooccurrents)
            #for i in range(10):
            #    if i <= len(sorted_cooccurrents):
            #        print(sorted_cooccurrents[i])

In [79]:
co = Cooccurrence()
co.load_input_data('cm_tagme_resource_reference_data_05_03.pickle')

Loading input... Done!


In [80]:
co.group_by_page()

Found 1528126 mentions in 465854 pages (3.280268066819218 mentions/page).


In [81]:
co.get_abs_cooccurrence(400)

Computed absolute entity cooccurrence.


In [85]:
co.get_top_10_cooccurrents('http://data.judaicalink.org/data/dbpedia/Arnold_Schönberg')

[('http://data.judaicalink.org/data/dbpedia/Gustav_Mahler', 19),
 ('http://data.judaicalink.org/data/dbpedia/Darius_Milhaud', 10),
 ('http://data.judaicalink.org/data/dbpedia/Israel', 6),
 ('http://data.judaicalink.org/data/dbpedia/Max_Brod', 4),
 ('http://data.judaicalink.org/data/dbpedia/Alban_Berg', 4),
 ('http://data.judaicalink.org/data/dbpedia/Breslau', 3),
 ('http://data.judaicalink.org/data/dbpedia/Sigmund_Freud', 3),
 ('http://data.judaicalink.org/data/dbpedia/Buenos_Aires', 3),
 ('http://data.judaicalink.org/data/dbpedia/Bertha_Badt-Strauss', 2),
 ('http://data.judaicalink.org/data/dbpedia/Ismar_Elbogen', 2),
 ('http://data.judaicalink.org/data/dbpedia/Karl_Wolfskehl', 2),
 ('http://data.judaicalink.org/data/dbpedia/Heinrich_Berl', 2),
 ('http://data.judaicalink.org/data/dbpedia/Budapest', 2),
 ('http://data.judaicalink.org/data/dbpedia/Tunis', 2),
 ('http://data.judaicalink.org/data/dbpedia/Los_Angeles', 2),
 ('http://data.judaicalink.org/data/dbpedia/Henri_Bergson', 2),
 ('