In [1]:
from helper import *
from tqdm import tqdm

In [7]:
def create_doc_tree(filename):
    """Build a paragraph hierarchy for ``<filename>.docx`` from font sizes.

    Every paragraph with non-empty text becomes a node, numbered from 1 in
    document order. A node's parent is the closest preceding node whose font
    size is strictly larger, so bigger text acts as a heading for the smaller
    text that follows it.

    Args:
        filename: Path of the document without the ``.docx`` extension.

    Returns:
        A ``(text, adj_list, parent)`` tuple of dicts keyed by node id:
        ``text`` holds ``preprocess_paragraph(para.text)``, ``adj_list`` the
        list of child ids, and ``parent`` the parent id (``None`` for roots).
    """
    # NOTE(review): Document is presumably python-docx's Document, pulled in
    # through ``from helper import *`` -- confirm.
    document = Document(filename + '.docx')

    text = {}
    adj_list = {}
    parent = {}
    # Stack of (font size, node id) for the currently open chain of ancestors.
    ancestors = []
    node_id = 0
    for para in document.paragraphs:
        if not para.text:
            continue
        node_id += 1

        # Effective size: the style's size (0 when unset), raised to the
        # largest explicit run size. The division by 12700 presumably converts
        # EMU to points -- TODO confirm against python-docx.
        style_size = para.style.font.size
        size = style_size / 12700 if style_size is not None else 0
        run_sizes = [run.font.size / 12700 for run in para.runs if run.font.size]
        size = max([size] + run_sizes)

        # Close every open node that is not strictly larger than this one.
        while ancestors and ancestors[-1][0] <= size:
            ancestors.pop()

        owner = ancestors[-1][1] if ancestors else None
        if owner is not None:
            adj_list[owner].append(node_id)
        parent[node_id] = owner

        ancestors.append((size, node_id))
        adj_list[node_id] = []
        text[node_id] = preprocess_paragraph(para.text)
    return text, adj_list, parent

def create_tree(document):
    """Build a hierarchy from the ``'content'`` entries of a parsed document.

    Each entry becomes a node, numbered from 1 in order. ``element[0]`` is the
    node text and ``element[1]`` is the value used for nesting: a node's
    parent is the closest preceding node whose value is strictly smaller.

    Args:
        document: Mapping with a ``'content'`` iterable of indexable elements.

    Returns:
        A ``(text, adj_list, parent)`` tuple of dicts keyed by node id:
        ``text`` holds ``preprocess_paragraph(element[0])``, ``adj_list`` the
        list of child ids, and ``parent`` the parent id (``None`` for roots).
    """
    text, adj_list, parent = {}, {}, {}
    # Stack of (rank, node id) for the currently open chain of ancestors.
    open_nodes = []
    for node_id, element in enumerate(document['content'], start=1):
        # NOTE(review): presumably a heading level where a smaller value means
        # a more prominent heading -- confirm. create_doc_tree compares font
        # sizes the opposite way round (larger value is the parent).
        rank = element[1]

        # Close every open node whose rank is not strictly smaller.
        while open_nodes and open_nodes[-1][0] >= rank:
            open_nodes.pop()

        owner = open_nodes[-1][1] if open_nodes else None
        if owner is not None:
            adj_list[owner].append(node_id)
        parent[node_id] = owner

        open_nodes.append((rank, node_id))
        adj_list[node_id] = []
        text[node_id] = preprocess_paragraph(element[0])
    return text, adj_list, parent

def _get_or_assign_id(key, value, key_to_id, id_to_value, next_id):
    """Return ``(id for key, next free id)``, registering ``value`` if key is new."""
    if key not in key_to_id:
        key_to_id[key] = next_id
        id_to_value[next_id] = value
        next_id += 1
    return key_to_id[key], next_id


def create_kg(documents):
    """Build the vertices and edges of a knowledge graph from documents.

    Pipeline: documents -> topics and paragraphs (from the document tree) ->
    sentences -> extractions (or keyword entities when a sentence yields no
    extraction) -> entities and relations.

    Ids are allocated from fixed bases per vertex type: documents from 0,
    topics 10000, paragraphs 20000, sentences 30000, extractions 40000 and
    entities 50000. Ranges overlap if one type exceeds 10000 vertices.

    Args:
        documents: Iterable of dicts. Each is read for ``'format'``
            (``'html'`` or ``'docx'``), ``'name'`` and ``'link'``; html
            documents must have a ``'body'`` key and are parsed with
            ``create_tree``; docx documents are parsed with
            ``create_doc_tree(doc['file'])``. Other documents, and repeated
            names, are skipped.

    Returns:
        ``(vertices, triples)``. ``vertices`` maps ``'documents'``,
        ``'topics'``, ``'paragraphs'``, ``'sentences'``, ``'extractions'`` and
        ``'entities'`` to lists of vertex dicts. ``triples`` maps ``'main'``,
        ``'subjects'``, ``'modifiers'``, ``'subject_modifiers'`` and
        ``'relations'`` to lists of edge tuples.

    Side effects:
        Prints one summary line: the next free id of every type, the number
        of sentences with at least one extraction, and that number as a
        percentage of all sentences.
    """
    subject_edges = set()
    modifier_edges = set()
    subject_modifier_edges = set()
    relation_edges = []
    other_edges = set()

    # Next free id per vertex type. Loops below use their own variable names
    # so these counters are never overwritten.
    did = 0
    tid = 10000
    pid = 20000
    sid = 30000
    eid = 40000
    xid = 50000
    count = 0
    eid_to_text = {}
    sid_to_text = {}
    text_to_sid = {}
    pid_to_text = {}
    text_to_pid = {}
    tid_to_text = {}
    text_to_tid = {}
    did_to_text = {}
    text_to_did = {}
    xid_to_text = {}
    text_to_xid = {}

    # for each document get the paragraphs and topics
    for doc in tqdm(documents):
        is_html = doc['format'] == 'html' and 'body' in doc
        if not is_html and doc['format'] != 'docx':
            continue
        if doc['name'] in text_to_did:
            continue
        doc_id = did
        text_to_did[doc['name']] = doc_id
        did_to_text[doc_id] = doc
        did += 1

        # read document in python
        if is_html:
            text, adj_list, parent = create_tree(doc)
        else:
            text, adj_list, parent = create_doc_tree(doc['file'])

        # Nodes with children are topics; leaf nodes are paragraphs.
        for node in adj_list:
            is_topic = bool(adj_list[node])
            if is_topic:
                node_id, tid = _get_or_assign_id(
                    text[node], text[node], text_to_tid, tid_to_text, tid)
            else:
                node_id, pid = _get_or_assign_id(
                    text[node], text[node], text_to_pid, pid_to_text, pid)

            if not parent[node]:
                # NOTE(review): a parentless paragraph gets no 'from_document'
                # edge either; kept as in the original -- confirm intended.
                continue
            parent_text = text[parent[node]]
            parent_id, tid = _get_or_assign_id(
                parent_text, parent_text, text_to_tid, tid_to_text, tid)
            other_edges.add((node_id, 'about_concept', parent_id))
            if not is_topic:
                other_edges.add((node_id, 'from_document', doc_id))

    # convert topics to lower case and attach their keywords and tags
    for topic_id in tqdm(tid_to_text):
        lowered = tid_to_text[topic_id].lower()
        # Materialised once so it can be traversed for both fields below.
        keywords = list(find_keywords(lowered))
        tid_to_text[topic_id] = {
            'text': lowered,
            'keywords': [k[0] for k in keywords],
            'tags': list({tag for k in keywords for tag in k[1]}),
        }

    # for each paragraph get sentences
    for para_id in tqdm(pid_to_text):
        for sentence_text in split_into_sentences(pid_to_text[para_id]):
            sentence = {
                'text': sentence_text,
                'stemmed_tokens': get_stemmed_sentence_tokens(sentence_text),
            }
            sent_id, sid = _get_or_assign_id(
                sentence_text, sentence, text_to_sid, sid_to_text, sid)
            other_edges.add((para_id, 'contains_sentence', sent_id))

    # for each sentence get extractions, falling back to keyword entities
    for sent_id in tqdm(sid_to_text):
        sentence_text = sid_to_text[sent_id]['text']
        extractions = extract(sentence_text)
        if extractions:
            count += 1
            for extraction in extractions:
                eid_to_text[eid] = extraction
                other_edges.add((sent_id, 'contains_extraction', eid))
                eid += 1
        else:
            for keyword in find_keywords(sentence_text):
                ent_id, xid = _get_or_assign_id(
                    keyword[0], keyword, text_to_xid, xid_to_text, xid)
                other_edges.add((sent_id, 'about_entity', ent_id))

    # canonicalise the obtained extractions
    canonicalise(eid_to_text)

    # for each extraction create entities and relations
    for ext_id in tqdm(eid_to_text):
        ext = eid_to_text[ext_id]

        subj_id, xid = _get_or_assign_id(
            ext['subject'][0], ext['subject'], text_to_xid, xid_to_text, xid)
        subject_edges.add((ext_id, 'subject', subj_id))

        obj_id, xid = _get_or_assign_id(
            ext['object'][0], ext['object'], text_to_xid, xid_to_text, xid)
        relation_edges.append(
            (ext_id, ext['relation'], obj_id, list(ext['rel_synsets'])))

        for modifier in ext['modifiers']:
            mod_id, xid = _get_or_assign_id(
                modifier['m_obj'][0], modifier['m_obj'],
                text_to_xid, xid_to_text, xid)
            modifier_edges.add((ext_id, modifier['m_rel'], mod_id))

        for subject_modifier in ext['subject_modifiers']:
            mod_id, xid = _get_or_assign_id(
                subject_modifier['m_obj'][0], subject_modifier['m_obj'],
                text_to_xid, xid_to_text, xid)
            subject_modifier_edges.add(
                (ext_id, subject_modifier['m_rel'], mod_id))

    # Share of sentences with at least one extraction, as a percentage with
    # two decimals; 0.0 when there are no sentences at all.
    n_sentences = len(sid_to_text)
    percentage = round(10000 * count / n_sentences) / 100 if n_sentences else 0.0
    print(did, tid, pid, sid, eid, xid, count, percentage)

    vertices = {
        'documents': [
            {'id': k, 'text': d['name'], 'source': d['link']}
            for k, d in did_to_text.items()
        ],
        'topics': [
            {'id': k, 'text': t['text'], 'keywords': t['keywords'], 'tags': t['tags']}
            for k, t in tid_to_text.items()
        ],
        'paragraphs': [{'id': k, 'text': p} for k, p in pid_to_text.items()],
        'sentences': [
            {'id': k, 'text': s['text'], 'stemmed_tokens': s['stemmed_tokens']}
            for k, s in sid_to_text.items()
        ],
        'extractions': [{'id': k, 'body': e} for k, e in eid_to_text.items()],
        'entities': [
            {'id': k, 'text': x[0], 'tags': x[1], 'tokens': x[2]}
            for k, x in xid_to_text.items()
        ],
    }

    triples = {
        'main': list(other_edges),
        'subjects': list(subject_edges),
        'modifiers': list(modifier_edges),
        'subject_modifiers': list(subject_modifier_edges),
        'relations': list(relation_edges),
    }
    return vertices, triples

In [8]:
# Build the knowledge graph from the first 16 entries of the scraped website
# content plus the UG Regulations .docx file.
triples = None
vertices = None
# NOTE(review): doc_names is not referenced by the visible cells; kept in case
# another cell uses it.
doc_names = [
    "UG Regulations",
    "Academic Dishonesty Policy | IIIT-Delhi",
    "Evaluation Policy | IIIT-Delhi",
    "Placement Procedure & Policies | IIIT-Delhi",
    "Green Policy | IIIT-Delhi",
    "Privacy Policy | IIIT-Delhi",
    "Student Conduct Policy | IIIT-Delhi",
    "Refund / Cancellation Policy | IIIT-Delhi",
    "Hostel Policies | IIIT-Delhi",
    "Allocation Policies | IIIT-Delhi",
    "Internships @ IIIT-D | IIIT-Delhi",
    "Disciplinary Action | IIIT-Delhi",
    "Facility Management Services | IIIT-Delhi",
    "B.Tech. Fee Waiver | IIIT-Delhi",
]
documents = read_json('../data/files/iiit_website_content.json')[:16] + [
    {
        'name': 'UG Regulations',
        'file': './../data/files/UG-Regulations',
        'format': 'docx',
        'link': 'https://www.iiitd.ac.in/sites/default/files/docs/education/2019/2019-July-UG-Regulations.pdf',
    }
]
vertices, triples = create_kg(documents)
# Legend for the summary line create_kg prints (eight values, xid included).
print('did, tid, pid, sid, eid, xid, count, percentage')

100%|██████████| 17/17 [00:11<00:00,  1.52it/s]
100%|██████████| 126/126 [00:09<00:00, 12.63it/s]
100%|██████████| 295/295 [00:10<00:00, 28.49it/s]
100%|██████████| 562/562 [00:44<00:00, 12.58it/s]
ERROR: wish
ERROR: maintain
ERROR: describe
ERROR: take
ERROR: read
ERROR: understand
ERROR: ask
ERROR: say
ERROR: say
ERROR: use
ERROR: use
ERROR: allow
ERROR: verbatim
ERROR: submit
ERROR: assume
ERROR: expect
ERROR: use
ERROR: require
ERROR: involve
ERROR: should
ERROR: clarify
ERROR: carry
ERROR: award
ERROR: award
ERROR: carry
ERROR: use
ERROR: award
ERROR: convert
ERROR: mean
ERROR: replace
ERROR: reflect
ERROR: reflect
ERROR: count
ERROR: indicate
ERROR: send
ERROR: fill
ERROR: fill
ERROR: date
ERROR: provide
ERROR: allot
ERROR: participate
ERROR: confirm
ERROR: confirm
ERROR: should
ERROR: do
ERROR: should
ERROR: do
ERROR: maintain
ERROR: provide
ERROR: convert
ERROR: convert
ERROR: handle
ERROR: handle
ERROR: handle
ERROR: create
ERROR: create
ERROR: endanger
ERROR: endanger
ERROR: 

In [9]:
# Save the graph built above (vertices plus edge triples) as a single JSON file.
write_json(
    {'vertices': vertices, 'edges': triples},
    '../neo4j/iiit_website_graph.json',
)