# Perform coreference resolution

In [1]:
import json

def read_text(filename):
    raw_text = ''
    with open(filename) as file:
        for line in file:
            raw_text += line
    return raw_text

def write_text(text, filename):
    with open(filename, 'w') as file:
        for line in text:
            file.write(line)
            
def read_json(filename):
    with open(filename) as file:
        data = json.load(file)
    return data

def write_json(data, filename):
    with open(filename, 'w') as file:
        json.dump(data, file)

## Manually define canonicals and replace in text

In [2]:
def replace_predefined_canonicals(triples, canonicals):
    for sentence in triples:
        for e in range(len(triples[sentence])):
            for word in canonicals:
                if triples[sentence][e]['subject'] in canonicals[word]:
                    triples[sentence][e]['subject'] = word
                for obj in range(len(triples[sentence][e]['object'])):
                    if triples[sentence][e]['object'][obj] in canonicals[word]:
                        triples[sentence][e]['object'][obj] = word


## Group canonicals and replace with one element of group

In [3]:
class DSU:
    def __init__(self, array):
# Code for DSU
        self.parent = [i for i in range(len(array))]
        self.size = [1 for i in range(len(array))]
    def find(self, x):
        p = x
        while p != self.parent[p]:
            p = self.parent[p]
        temp = x
        while temp != self.parent[temp]:
            t = self.parent[temp]
            self.parent[temp] = p
            temp = t
        return p 

    def combine(self, x, y):
        x = self.find(x)
        y = self.find(y)
        if x == y:
            return
        if self.size[x] > self.size[y]:
            self.parent[y] = x
            self.size[x] += self.size[y]
        else:
            self.parent[x] = y
            self.size[y] += self.size[x]

# similar if word overlap is greater than 50%
def similar(entity1, entity2):
    words1 = list(set(entity1.split()))
    words2 = list(set(entity2.split()))
    common = 0
    for word in words1:
        common += words2.count(word)
    return (2 * common > 0.5 * (len(words1) + len(words2)))


def cluster_similar_words(triples):
    # All entities
    entities = set([])
    for sentence in triples:
        for extraction in triples[sentence]:
            entities.add(extraction['subject'])
            for obj in extraction['object']:
                entities.add(obj)
    entities = list(entities)

    dsu = DSU(entities)

    # combine similar words
    for e1 in range(len(entities)):
        for e2 in range(len(entities)):
            if similar(entities[e1], entities[e2]):
                dsu.combine(e1, e2)

    parent_of_word = {}
    for e in range(len(entities)):
        parent_of_word[entities[e]] = entities[dsu.find(e)]

    # replace entity by parent entity
    for sentence in triples:
        for e in range(len(triples[sentence])):
            triples[sentence][e]['subject'] = parent_of_word[triples[sentence][e]['subject']]
        for obj in range(len(triples[sentence][e]['object'])):
            triples[sentence][e]['object'][obj] = parent_of_word[triples[sentence][e]['object'][obj]]
    
    groups = {}
    for word in parent_of_word:
        if parent_of_word[word] not in groups:
            groups[parent_of_word[word]] = set([])
        groups[parent_of_word[word]].add(word)
    for word in groups:
        if len(groups[word]) == 1:
            continue
        print(word)
        print(groups[word])
    
    for e in range(len(entities)):
        entities[e] = parent_of_word[entities[e]]
    print('original entities:', len(entities))
    entities = list(set(entities))
    print('reduced entities:', len(entities))

## Create relation "similar to" between canonicals

In [4]:
# mark entity 1 as similar to the entity 2
def get_similar_to_edges(triples):
    # All entities
    entities = set([])
    for sentence in triples:
        for extraction in triples[sentence]:
            entities.add(extraction['subject'])
            for obj in extraction['object']:
                entities.add(obj)
    entities = list(entities)
    
    similar_to_edges = []
    for e1 in range(len(entities)):
        for e2 in range(len(entities)):
            if e1 == e2:
                continue
            if similar(entities[e1], entities[e2]):
                similar_to_edges.append([entities[e1], 'similar to', entities[e2]])
    
    return similar_to_edges

# Canonicalisation using CESI

In [13]:
from nltk.stem import WordNetLemmatizer
import requests

API_ENDPOINT = "https://www.wikidata.org/w/api.php"

def prepare_cesi_triples(triples, filename):
    wordnet_lemmatizer = WordNetLemmatizer()
    cesi_triples = []
    id_ = 0
    count_linked = 0
    count_unlinked = 0
    for sentence in triples:
        for extraction in triples[sentence]:
            for sub in extraction['subject']:
                for rel in extraction['relation']:
                    for obj in extraction['object']:
                        # Get true link of the subject and object from wikidata
                        sub_true_link = requests.get(API_ENDPOINT, params = {
                                    'action': 'wbsearchentities',
                                    'format': 'json',
                                    'language': 'en',
                                    'search': sub
                                }).json()['search']
                        if len(sub_true_link) > 0:
                            sub_true_link = sub_true_link[0]['id']
                            count_linked += 1
                        else:
                            sub_true_link = None
                            count_unlinked += 1
                        ob_true_link = requests.get(API_ENDPOINT, params = {
                                    'action': 'wbsearchentities',
                                    'format': 'json',
                                    'language': 'en',
                                    'search': obj
                                }).json()['search']
                        if len(ob_true_link) > 0:
                            ob_true_link = ob_true_link[0]['id']
                            count_linked += 1
                        else:
                            ob_true_link = None
                            count_unlinked += 1
                        print('sub:', sub, sub_true_link, 'obj:', obj, ob_true_link)
                        triple = {
                            '_id': id_,
                            'triple': [
                                sub, 
                                rel, 
                                obj
                            ],
                            'triple_norm': [
                                ' '.join([wordnet_lemmatizer.lemmatize(word) for word in sub.split()]), 
                                ' '.join([wordnet_lemmatizer.lemmatize(word) for word in rel.split()]), 
                                ' '.join([wordnet_lemmatizer.lemmatize(word) for word in obj.split()]), 
                            ],
                            'true_link': {
                                'subject': sub_true_link,
                                'object': ob_true_link

                            },
                            'src_sentences': [sentence],
                            'entity_linking': {
            #                     'subject': entity_links[extraction['subject']],
            #                     'object': entity_links[extraction['object']],
                            },
                            'kbp_info': []
                        }
                        id_ += 1
                        cesi_triples.append(triple)
    write_text('\n'.join([json.dumps(triple) for triple in cesi_triples]), filename)
    print('linked:', count_linked, 'unlinked:', count_unlinked)

In [14]:
canonicals = {
    'hostel student': ['resident student', 'resident students', 'hostel student', 'hostel students', 'hostel resident', 'hostel residents', 'hosteller', 'hostellers'],
    'student': ['student', 'students'],
    'instructor': ['instructor', 'professor', 'faculty'],
    'campus': ['campus', 'on campus', 'in campus', 'inside campus', 'iiitd', 'in iiitd', 'inside iiitd', 'iiitd campus', 'in iiitd campus', 'inside iiitd campus', 'in college', 'inside college']
}



triples = read_json('../data/ollie_triples.json')
cluster_similar_words(triples)
# write_json(triples, '../data/ollie_canonicalised_2_triples.json')

# triples = read_json('../data/ollie_triples.json')
# similar_edges = get_similar_to_edges(triples)
# print(similar_edges[:10])

iiitd campus
{'vehicles entering iiitd campus', 'campus', 'visitors entering campus', 'iiitd campus'}
neha dhiman
{'neha dhiman', 'neha'}
faculty and staff and students using pathway and faculty visitors
{'faculty or staff and students using their vehicles', 'faculty and staff and students using pathway and faculty visitors'}
gate
{'main gate', 'gate'}
such complaints
{'such complaints', 'complaints'}
visitor
{'visitor register', 'entry', 'visitor', 'visitor entry'}
food delivery boys vehicle
{'food delivery boys', 'delivery of courier', 'food delivery boys vehicle', 'courier boys vehicle', 'courier boys', 'delivery of food'}
visitors vehicles
{'visitors vehicles', 'vehicles'}
vehicle reported by faculty or officers or sc or security for violation of above guidelines
{'vehicle reported by faculty or officers or sc or security for violation of above guidelines', 'faculty or officers or sc or security'}
facade glass
{'facade glass', 'glass'}
gate no 3
{'no 3', 'gate no 3'}
helpdesk
{'hel

In [15]:
triples = read_json('../data/my_extractions.json')

words = set([])
relations = set([])
for sentence in triples:
    for triple in triples[sentence]:
        for sub in triple['subject']:
            words.add(sub)
        for obj in triple['object']:
            words.add(obj)
        for rel in triple['relation']:
            relations.append(rel)
        for m in triple['modifiers1']:
            words.add(m['m_obj'])
            relations.add(m['m_rel'])
words = '. \n'.join(words)
write_text(words, '../data/my_entities.txt')


In [17]:
# run java -Xmx16g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,entitylink -file ../data/my_entities.txt
# entity_links = read_json('../data/entity_links.json')
prepare_cesi_triples(triples, '../data/my_cesi/my_cesi_triples')
# # replace_predefined_canonicals(triples, canonicals)
# write_json(triples, '../data/ollie_canonicalised_1_triples.json')

sub: Visitor entry None obj: 8 am Q41618176
sub: Visitors Q1076940 obj: access Q80689
sub: All visitors None obj: CCTV surveillance Q101069887
sub: number plate Q22706 obj: camera Q15328
sub: Gate No 3 None obj: open Q2735683
sub: The entry Q5452326 obj: Faculty Q180958
sub: The entry Q5452326 obj: staff Q703534
sub: The entry Q5452326 obj: Students Q48282
sub: The vehicles Q100476087 obj: Gate No 1 None
sub: The Faculty Q373267 obj: IIITD Campus None
sub: their visitors None obj: Gate No 3 None
sub: The vehicles Q100476087 obj: The vehicles Q100476087
sub: IIITD Campus None obj: No Horn Zone None
sub: IIITD Campus None obj: maximum speed Q1077350
sub: hr Q224 obj: permissible Q4376583
sub: All Faculty None obj: sticker Q2872553
sub: staff Q703534 obj: sticker Q2872553
sub: students Q48282 obj: sticker Q2872553
sub: All visitors vehicles None obj: necessary registration None
sub: Any vehicle None obj: Faculty Q180958
sub: Any vehicle None obj: Officers Q4340308
sub: Any vehicle None ob

FileNotFoundError: [Errno 2] No such file or directory: '../data/my_cesi/my_cesi_triples'