# Imports

In [3]:
import spacy
import re
import json

nlp = spacy.load('en_core_web_md')

def read_text(filename):
    raw_text = ''
    with open(filename) as file:
        for line in file:
            raw_text += line
    return raw_text

def write_text(text, filename):
    with open(filename, 'w') as file:
        for line in text:
            file.write(line)
            
def read_json(filename):
    with open(filename) as file:
        data = json.load(file)
    return data

def write_json(data, filename):
    with open(filename, 'w') as file:
        json.dump(data, file)

# Extract subjects, relations and objects using regex

## Regex      
```
(?P<sentence>
    (?:
        (?P<modifiers0>
            ((?:A+(?:[C,]+A+)*)(?:N+(?:[C,]+N+)*)?)*
        ),
    )?(?P<subject>
        N+(?:CN+)*
    )[,]?(?P<modifiers1>
        ((?:A+(?:CA+)*)(?:N+(?:CN+)*))*
    )(?P<relation>
        [VA]+(?:C[VA]+)*
    )(?P<object>
        N+(?:CN+)*
    )?(?P<modifiers2>
        ((?:A+(?:CA+)*)(?:N+(?:CN+)*))*
    )
)
```

In [3]:
sentence = '(?:(?:((?:A+(?:[C,]+A+)*)(?:N+(?:[C,]+N+)*)?)*),)?(?:(?:N+(?:[C,]+N+)*)(?:((?:A+(?:[C,]+A+)*)(?:N+(?:[C,]+N+)*))*)(?:[VA]*V+[VA]*(?:[C,]+[VA]*V+[VA]*)*)(?:N+(?:[C,]+N+)*)?(?:((?:A+(?:[C,]+A+)*)(?:N+(?:[C,]+N+)*))*))'
sentence_with_groups = '(?P<sentence>(?:(?P<modifiers0>((?:A+(?:[C,]+A+)*)(?:N+(?:[C,]+N+)*)?)*),)?(?P<subject>N+(?:[C,]+N+)*)(?P<modifiers1>((?:A+(?:[C,]+A+)*)(?:N+(?:[C,]+N+)*))*)(?P<relation>[VA]*V+[VA]*(?:[C,]+[VA]*V+[VA]*)*)(?P<object>N+(?:[C,]+N+)*)?(?P<modifiers2>((?:A+(?:[C,]+A+)*)(?:N+(?:[C,]+N+)*))*))'
re_sentences = re.compile('(' + sentence + ')([C,]+' + sentence + ')*' )
re_sentence = re.compile(sentence_with_groups)
re_modifiers = re.compile('(?P<modifier>(?P<m_rel>A+(?:[C,]+A+)*)(?P<m_obj>N+(?:[C,]+N+)*))(?P<remaining>(?:A+(?:[C,]+A+)*N+(?:[C,]+N+)*)*)')
re_compound = re.compile('(?P<first>[NVA]+)(?P<remaining>(?:[C,]+[NVA]+)*)')

def split_into_sentences(text):
    dont_split = set(['mr', 'ms', 'mrs', 'etc', 'dr', 'no'])
    sentences = []
    sentence = []
    for c in range(len(text)):
        if text[c] == '.' and c + 2 < len(text) and text[c + 2].lower() != text[c + 2] and ''.join(sentence).split()[-1].lower() not in dont_split:
                sentences.append(''.join(sentence))
                sentence = []
        else:
            sentence.append(text[c])
    sentences.append(''.join(sentence))
    return sentences

def get_sentence_structure(sentence):
    pos_ind = {
        'NOUN': 'N', 
        'PROPN': 'N', 
        'ADJ': 'N', 
        'DET': 'N', 
        'NUM': 'N', 
        'PART': 'N', 
        'PRON': 'N',
        'AUX': 'V', 
        'VERB': 'V', 
        'ADV': 'V',
        'ADP': 'A',
        'CCONJ': 'C', 
        'SCONJ': 'C',
        'COMMA': ',',
        'PUNCT': '',
        'SYM': '',
        'SPACE': '',
        'X': '',
        'INTJ': ''
    }
    sent = nlp(sentence)
    
    pos_tags = []
    tokens = []
    for i in range(len(sent)):
        if ',' in sent[i].text and sent[i].pos_ == 'PUNCT':
            pos_tags.append(pos_ind['COMMA'])
            tokens.append(sent[i].text)

            continue
        if sent[i].pos_ == 'VERB' and sent[i].text[-3:] == 'ing' and i > 0 and sent[i - 1].pos_ != 'AUX':
            pos_tags.append(pos_ind['ADP'])
            tokens.append(sent[i].text)
            continue
        if sent[i].pos_ == 'VERB' and sent[i].text[-2:] == 'ed' and (i == 0 or sent[i - 1].pos_ != 'AUX') and i + 1 < len(sent) and sent[i + 1].pos_ == 'ADP':
            pos_tags.append(pos_ind['ADP'])
            tokens.append(sent[i].text)
            continue
        if sent[i].text.lower() == 'because' and i < len(sent) and sent[i + 1].text.lower() == 'of':
            pos_tags.append('A')
            tokens.append(sent[i].text)
            continue
#         print(sent[i].text, sent[i].pos_)
        if pos_ind[sent[i].pos_]:
            pos_tags.append(pos_ind[sent[i].pos_])
            tokens.append(sent[i].text)
    
    tokens = [tokens[i] for i in range(len(tokens)) if pos_tags[i] not in ['C', ',']]
    sentence_structure = []
    i = 0
    while i < len(pos_tags):
        if not pos_tags[i] in [pos_ind['COMMA'], pos_ind['CCONJ'], pos_ind['SCONJ']] or i + 1 < len(pos_tags) and not pos_tags[i + 1] in [pos_ind['COMMA'], pos_ind['CCONJ'], pos_ind['SCONJ']]:
            sentence_structure.append(pos_tags[i])
        i += 1
    sentence_structure = ''.join(sentence_structure)
    return sentence_structure, tokens

    
def extract(sentence):
    if 'such as' in sentence.lower():
        return []
    
    sentence_structure, tokens = get_sentence_structure(sentence)
    extractions= find_match(sentence_structure)

    ind = 0
    for sentence in extractions:
        for modifier in sentence['modifiers0']:
            for i in range(len(modifier['m_rel'])):
                rel = modifier['m_rel'][i]
                modifier['m_rel'][i] = ' '.join(tokens[ind: ind + len(rel)])
                ind += len(rel)
            for i in range(len(modifier['m_obj'])):
                obj = modifier['m_obj'][i]
                modifier['m_obj'][i] = ' '.join(tokens[ind: ind + len(obj)])
                ind += len(obj)


        for i in range(len(sentence['subject'])):
            sub = sentence['subject'][i]
            sentence['subject'][i] = ' '.join(tokens[ind: ind + len(sub)])
            ind += len(sub)

        for modifier in sentence['modifiers1']:
            for i in range(len(modifier['m_rel'])):
                rel = modifier['m_rel'][i]
                modifier['m_rel'][i] = ' '.join(tokens[ind: ind + len(rel)])
                ind += len(rel)
            for i in range(len(modifier['m_obj'])):
                obj = modifier['m_obj'][i]
                modifier['m_obj'][i] = ' '.join(tokens[ind: ind + len(obj)])
                ind += len(obj)

        for i in range(len(sentence['relation'])):
            rel = sentence['relation'][i]
            sentence['relation'][i] = ' '.join(tokens[ind: ind + len(rel)])
            ind += len(rel)

        for i in range(len(sentence['object'])):
            obj = sentence['object'][i]
            sentence['object'][i] = ' '.join(tokens[ind: ind + len(obj)])
            ind += len(obj)

        for modifier in sentence['modifiers2']:
            for i in range(len(modifier['m_rel'])):
                rel = modifier['m_rel'][i]
                modifier['m_rel'][i] = ' '.join(tokens[ind: ind + len(rel)])
                ind += len(rel)
            for i in range(len(modifier['m_obj'])):
                obj = modifier['m_obj'][i]
                modifier['m_obj'][i] = ' '.join(tokens[ind: ind + len(obj)])
                ind += len(obj)

    return extractions

def find_match(text):
    if not text:
        return []
    match = re_sentences.fullmatch(text)
    if not match:
        return []
    sentences = find_match(match.group(0)[match.end(1) + 1:])
    return [break_sentence(match.group(1))] + sentences

def break_sentence(sentence):
    match = re_sentence.fullmatch(sentence)
    extractions = match.groupdict()
    extractions['subject'] = break_compound(extractions['subject'])
    extractions['relation'] = break_compound(extractions['relation'])
    extractions['object'] = break_compound(extractions['object'])
    extractions['modifiers0'] = break_modifiers(extractions['modifiers0'])
    extractions['modifiers1'] = break_modifiers(extractions['modifiers1'])
    extractions['modifiers2'] = break_modifiers(extractions['modifiers2'])

    return extractions

def break_modifiers(modifiers):
    if not modifiers:
        return []
    match = re_modifiers.fullmatch(modifiers)
    if not match:
        return []
    modifier = {
        'm_rel': break_compound(match.group('m_rel')),
        'm_obj':break_compound( match.group('m_obj'))
    }
    modifiers = break_modifiers(match.group('remaining'))
    return [modifier] + modifiers

def break_compound(compound):
    if not compound:
        return []
    match = re_compound.fullmatch(compound)
    if not match:
        return []
    parts = break_compound(match.group('remaining')[1:])
    return [match.group('first')] + parts


In [16]:
print(*get_sentence_structure('dog\'s food is good'))

NNNVN ['dog', "'s", 'food', 'is', 'good']


In [10]:
paragraph = 'Along with designing the non-academic calendar of the institute in consultation with the Dean of Student Affairs, the Student Council manages existing clubs through budgets and annual reports'
extractions = extract(paragraph)
write_json({paragraph: extractions}, '../data/demo/output.json')

In [17]:
print(*get_sentence_structure('Courier boys will enter through gate no 1 and after parking courier boys vehicle in main parking courier boys shall be directed to security guard at gf old building or reception for delivery of courier.'))
print(*extract('Courier boys will enter through gate no 1 and after parking courier boys vehicle in main parking courier boys shall be directed to security guard at gf old building or reception for delivery of courier.'))
print(*extract(' No courier boy will be allowed access inside building'))
print(*extract('because of this, I eat icecream, milkshake, and apple.'))
print(*extract('I did this because I did that'))
print(*extract('I did this because of that'))
print(*extract('The U.S. president Barack Obama gave his speech on Tuesday to thousands of people.'))
print(*extract('After graduating, he became a civil rights attorney and an academic, teaching constitutional law at the University of Chicago Law School from 1992 to 2004'))

NNVVANNNCANNNNANNNNVVVANNANNNCNANAN ['Courier', 'boys', 'will', 'enter', 'through', 'gate', 'no', '1', 'after', 'parking', 'courier', 'boys', 'vehicle', 'in', 'main', 'parking', 'courier', 'boys', 'shall', 'be', 'directed', 'to', 'security', 'guard', 'at', 'gf', 'old', 'building', 'reception', 'for', 'delivery', 'of', 'courier']

{'sentence': 'NNNVVVNAN', 'modifiers0': [], 'subject': ['No courier boy'], 'modifiers1': [], 'relation': ['will be allowed'], 'object': ['access'], 'modifiers2': [{'m_rel': ['inside'], 'm_obj': ['building']}]}
{'sentence': 'AAN,NVN,NCN', 'modifiers0': [{'m_rel': ['because of'], 'm_obj': ['this']}], 'subject': ['I'], 'modifiers1': [], 'relation': ['eat'], 'object': ['icecream', 'milkshake', 'apple'], 'modifiers2': []}
{'sentence': 'NVN', 'modifiers0': [], 'subject': ['I'], 'modifiers1': [], 'relation': ['did'], 'object': ['this'], 'modifiers2': []} {'sentence': 'NVN', 'modifiers0': [], 'subject': ['I'], 'modifiers1': [], 'relation': ['did'], 'object': ['that'],

In [18]:
text = read_text('../data/handbook_preprocessed_text.txt')
# text = read_text('../data/wiki_sentences.txt')


sentences_ = text.split('\n')[:-1]
sentences = []
for sentence in sentences_:
    sentences += split_into_sentences(sentence)

count = 0
extractions = {}
short_sentences = []
for sentence in sentences:
    if len(sentence.split()) <= 4:
        short_sentences.append(sentence)
#     print(sentence)
    ext = extract(sentence)
#     print(ext)
#     print()
    if ext:
        extractions[sentence] = ext
        count += 1
print(count, len(sentences))
print(len(short_sentences))
write_json(extractions, '../data/my_handbook_extractions.json')

167 904
242


In [19]:
print(short_sentences)

['STUDENT', 'HANDBOOK 2017', 'Foreword', '3', 'Index', '5', 'About IIITD', '7', 'Campus and Infrastructure', 'Some Key Features', 'Energy', 'Waste Water Recycling', 'Fire-Fighting Equipment', 'Rainwater and Landscape', '9', 'Internet', 'The Campus', ' The Campus', 'Library and Information Center', '10', 'Neighbourhood', '11', 'Life at IIITD', 'Clubs', ' Clubs', ' Clubs', 'Sports and Recreation', '13', 'Gym', 'Fests and Events', '14', 'Community Work', 'and Self Growth', 'Counselling', '15', 'Mr. Khushpinder P', ' Sharma', '+91-9815181252 khushpinder@iiitd.ac.in', 'Dr. Amita Puri', '+91-7838732232 amitapuri@iiitd.ac.in', 'Dr. Akshay Kumar', '+91-9999801130 akshay@iiitd.ac.in', 'Attend orientations', 'Yes its important', 'Be money Wise', '16', 'Get Organized', 'Go to class', 'Priortising tasks', ' Learn to honor deadlines.', 'Handling homesickness', 'One last word', '.', '17', '19', 'Hostel Allotment', "Students' Guests", '20', '21', 'Other Facilities', 'Medical Facilities', '23', 'Photo

## Example Extractions

In [20]:
get_sentence_structure( 'Based on the specified criteria, the allotment committee recommends the allotment and the list is published on the website with instructions for necessary payments & possession.')

('AANVN,NNNVNNCNNVVANNANANNCN',
 ['Based',
  'on',
  'the',
  'specified',
  'criteria',
  'the',
  'allotment',
  'committee',
  'recommends',
  'the',
  'allotment',
  'the',
  'list',
  'is',
  'published',
  'on',
  'the',
  'website',
  'with',
  'instructions',
  'for',
  'necessary',
  'payments',
  'possession'])

In [21]:
extract('Autonomous cars shift insurance liability and moral responsibility toward manufacturers')

[{'sentence': 'NNVNNCNNAN',
  'modifiers0': [],
  'subject': ['Autonomous cars'],
  'modifiers1': [],
  'relation': ['shift'],
  'object': ['insurance liability', 'moral responsibility'],
  'modifiers2': [{'m_rel': ['toward'], 'm_obj': ['manufacturers']}]}]

In [25]:
get_sentence_structure('Rohit and Rahul eat and drink snacks and drinks and they study together.')

('NCNVCVNCNCNVV',
 ['Rohit',
  'Rahul',
  'eat',
  'drink',
  'snacks',
  'drinks',
  'they',
  'study',
  'together'])

In [26]:
print(extract('Visitor entry to the campus is allowed from 8 am to 10 pm through Gate No 1.'))
get_sentence_structure('Visitor entry to the campus is allowed from 8 am to 10 pm through Gate No 1.')

[]


('NNANNVVANVANNANNN',
 ['Visitor',
  'entry',
  'to',
  'the',
  'campus',
  'is',
  'allowed',
  'from',
  '8',
  'am',
  'to',
  '10',
  'pm',
  'through',
  'Gate',
  'No',
  '1'])

In [31]:
extract('All visitors entering campus are under CCTV surveillance and the number plate is recorded by the camera.')

[{'sentence': 'NNANVANN',
  'modifiers0': [],
  'subject': ['All visitors'],
  'modifiers1': [{'m_rel': ['entering'], 'm_obj': ['campus']}],
  'relation': ['are under'],
  'object': ['CCTV surveillance'],
  'modifiers2': []},
 {'sentence': 'NNNVVANN',
  'modifiers0': [],
  'subject': ['the number plate'],
  'modifiers1': [],
  'relation': ['is recorded by'],
  'object': ['the camera'],
  'modifiers2': []}]

In [29]:
extract('Any visitor coming to the building complex can gain entry through Gate No 1 only after confirmation from the staff members whom the visitor intends to meet or in the case of non-availability of the member, a confirmation has to be obtained from GM(Ops).')

[]

In [30]:
extract('After obtaining due clearance for the access, as mentioned above, the visitors coming in self-driven cars will be requested to park their car in the Parking Area near Academic Block.')

[]

In [28]:
extract('The Institute collects tuition and hostels fee from students on semester basis.')

[{'sentence': 'NNVNCNNANANN',
  'modifiers0': [],
  'subject': ['The Institute'],
  'modifiers1': [],
  'relation': ['collects'],
  'object': ['tuition', 'hostels fee'],
  'modifiers2': [{'m_rel': ['from'], 'm_obj': ['students']},
   {'m_rel': ['on'], 'm_obj': ['semester basis']}]}]

In [27]:
extract('Purchase followed by billing will be done at the counter.')

[{'sentence': 'NAANVVVANN',
  'modifiers0': [],
  'subject': ['Purchase'],
  'modifiers1': [{'m_rel': ['followed by'], 'm_obj': ['billing']}],
  'relation': ['will be done at'],
  'object': ['the counter'],
  'modifiers2': []}]

# Canonicalise extractions

In [4]:
extractions = read_json('../data/my_handbook_extractions.json')

words = []
relations = []
for sentence in extractions:
    for extraction in extractions[sentence]:
        for sub in extraction['subject']:
            words.append(sub)
        for obj in extraction['object']:
            words.append(obj)
        for rel in extraction['relation']:
            relations.append(rel)
        for m in extraction['modifiers0']:
            for obj in m['m_obj']:
                words.append(obj)
            for rel in m['m_rel']:
                relations.append(rel)
        for m in extraction['modifiers1']:
            for obj in m['m_obj']:
                words.append(obj)
            for rel in m['m_rel']:
                relations.append(rel)
        for m in extraction['modifiers2']:
            for obj in m['m_obj']:
                words.append(obj)
            for rel in m['m_rel']:
                relations.append(rel)


print(len(words))
words = set(words)
print(len(words))

words = set(word.lower() for word in words)
print(len(words))


723
608
591


## The following entities need to be detected:
* Locations: `gate`, `hostel` etc
* Links: `https://`, emails
* Numbers:
    * Time: `hrs`, `date`, `am`, `pm`, etc
    * Money `rs`, `rs.`, etc
* Person:
    * student: btech, mtech, phd, hosteller, day-scholar
    * faculty: 
    * staff
    * visitor

In [1]:
def get_type(entity):
    if '\'s' in entity:
        entity = entity[entity.find('\'s'):]
    if '’s' in entity:
        entity = entity[entity.find('’s'):]


    # links
    if 'http' in entity:
        return 'ent:link:web'
    if '@' in entity and ('.com' in entity or '.in'):
        return 'ent:link:mail'
    
    # committee check
    if 'committee' in entity or 'council' in entity or 'senate' in entity:
        return 'ent:committee'

    # location check
    if 'gate' in entity:
        if '1' in entity:
            return 'ent:location:gate:1'
        if '2' in entity:
            return 'ent:location:gate:2'
        if '3' in entity:
            return 'ent:location:gate:3'
        return 'ent:location:gate'
    if 'hostel' in entity:
        return 'ent:location:building:hostel'
    if 'library' in entity:
        return 'ent:location:building:library'
    if 'sport' in entity and 'complex' in entity:
        return 'ent:location:building:sports_block'
    if 'canteen' in entity or 'mess' in entity or 'dining' in entity:
        return 'entity:location:building:canteen'
    if 'block' in entity:
        if 'sem' in entity:
            return 'ent:location:building:seminar_block'
        if 'new' in entity and 'acad' in entity or 'r&d' in entity or 'research' in entity:
            return 'ent:location:building:new_academic_block'
        if 'acad' in entity:
            return 'ent:location:building:acad_block'
        if 'sport' in entity:
            return 'ent:location:building:sports_block'
        return 'ent:location:building'
    if 'building' in entity or 'room' in entity or 'floor' in entity:
        return 'ent:location'
    
    # concept check
    if 'admission' in entity or 'admit' in entity:
        return 'ent:topic:admission'
    if 'park' in entity or 'vehicle' in entity:
        return 'ent:topic:parking'
    if 'course' in entity or 'academic' in entity or 'lectur' in entity:
        return 'ent:topic:academic'
    if 'fee' in entity or 'pay' in entity or 'charge' in entity:
        return 'ent:topic:fee'
    if 'concern' in entity or 'grievance' in entity or 'problem' in entity or 'issue' in entity:
        return 'ent:topic:concerns'
    
    # person checks
    if 'prof' in entity or 'faculty' in entity:
        return 'ent:person:faculty'

    if 'staff' in entity:
        return 'ent:person:staff'

    if 'student' in entity:
        if 'btech' in entity or 'b.tech' in entity:
            return 'ent:person:student:btech'
        if 'mtech' in entity or 'm.tech' in entity:
            return 'ent:person:student:mtech'
        if 'phd' in entity:
            return 'ent:person:student:phd'
        if 'hostel' in entity:
            return 'ent:person:student:hosteller'
        return 'ent:person:student'
    if 'hosteller' in entity:
        return 'ent:person:hosteller'
    if 'visitor' in entity:
        return 'ent:person:visiter'


    # check if it contains a number
    isNumber = False
    for char in entity:
        if char in '1234567890':
            isNumber = True
    if isNumber:
        # date
        months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
        for month in months:
            if month in entity:
                return 'ent:number:date'
        if re.match(entity, '*20..-*') or re.match(entity, '*19..-*') or re.match(entity, '*-20..*') or re.match(entity, '*/20..*'):
            return 'ent:number:date'
        # time
        if 'day' in entity or 'hr' in entity or 'mins' in entity or 'minute' in entity or 'sec' in entity or 'am' in entity or 'pm' in entity:
            return 'ent:number:time'
        if 'rs' in entity:
            return 'ent:number:money'
        return 'ent:number'

    # program check
    if 'b.tech' in entity or 'btech' in entity or 'bachelor' in entity:
        return 'ent:program:btech'
    if 'm.tech' in entity or 'mtech' in entity or 'master' in entity:
        return 'ent:program:mtech'
    if 'phd' in entity:
        return 'ent:program:phd'
    if 'program' in entity or 'department' in entity:
        return 'ent:program'

   # org check
    if 'iiit' in entity or 'campus' in entity or 'institute' in entity or 'college' in entity:
        return 'ent:iiitd'
    
    return 'ent:other'

In [149]:
entity_type = {}
for word in words:
    if get_type(word) != 'ent:other':
        entity_type[word] = get_type(word)
#     else:
#         print(word)
print()
# for entity in entity_type:
#     print(entity, ':', entity_type[entity])

print(len(words), len(words) - len(entity_type))

groups = {}
for entity in entity_type:
    if entity_type[entity] not in groups:
        groups[entity_type[entity]] = []
    groups[entity_type[entity]].append(entity)
reduction = 0
for group in groups:
    reduction += (len(groups[group]) - 1)

for group in groups:
    print(group)
    print(groups[group])
    print()


668 484
ent:program:mtech
['m.tech cb program', 'm.tech ece program', 'pg m.tech', 'm.tech labs', 'm.tech cse program']

ent:committee
['disciplinary committee', 'employment 36 anti sexual harrassment committee 40 grievance redressal', 'student senate', 'the student senate coordinator', 'student council', 'mess committee mess committee', 'institute ’s senate']

ent:program:btech
['b.tech ece program', 'b.tech', 'b.tech cse program', 'b.tech csd program', 'b.tech labs', 'btech scholarship', 'b.tech csam program', 'b.tech itss program']

ent:topic:fee
['the subsidized charges', 'one time payment which', 'payment basis', 'caution fees rs . 10', 'tuition fees rs . 2', 'charge']

entity:location:building:canteen
['dining block', 'mess charges', 'common mess', 'dining block houses gymnasium', 'mess']

ent:person:student
['under- graduate students', 'student leaves institute', 'students', 'students area', 'student life', 'new students', 'fresh students', 'iiit delhi student', 'delhi students

In [None]:
for word in word

In [162]:
extractions = read_json('../data/my_handbook_extractions.json')
rels = []
for sentence in extractions:
    for extraction in extractions[sentence]:
        for rel in extraction['relation']:
            rels.append(rel)
        for modifier in extraction['modifiers0'] + extraction['modifiers1'] + extraction['modifiers2']:
            for rel in modifier['m_rel']:
                rels.append(rel)
print(len(rels))
rels = set(rels)
print(len(rels))
print(rels)

453
204
{'Help', 'for', 'has', 'are based on', 'inside', 'are connected through', 'can get', 'is approved by', 'are there for', 'are strictly prohibited', 'by', 'may be between', 'to', 'containing', 'is as', 'may go', 'functions during', 'may be contacted in', 'is very', 'are elected', 'by dialling', 'was hugely', 'is provided in', 'IN', 'has been contracted for providing', 'is about trying', 'is recycled using', 'will be deactivated from', 'safeguard', 'will be sent to', 'may hire', 'gets', 'was officially established on', 'as much', 'currently has', 'about putting in', 'also form', 'is always', 'done', 'can', 'know', 'are planned together by', 'are given in', 'are monitored by', 'will help ensure', 'during', 'improve', 'shall do so from', 'jointly held by', 'are organizing', 'are provided', 'allow', 'has been opened on', 'was created', 'will be referred to', 'shall be done via', 'began with', 'is', 'taking', 'is equipped with', 'may also visit', 'contact for', 'constitute', 'of using

In [21]:
extract('Chairman’s Merit Scholarship in BTech admission for top students')

[]