In [1]:
import json
from collections import Counter
from pprint import pprint

import spacy
from spacy.matcher import Matcher
from spacy import displacy

%load_ext autoreload
%autoreload 2

In [2]:
nlp = spacy.load("en_core_web_sm")

## Read data

In [3]:
with open('classes.json') as f:
    data = json.load(f)

In [4]:
len(data)

70

In [5]:
pprint(data[0])

{'description': 'The course introduces students to some fundamentals of '
                'research methodology, and gives students first hand '
                'experience by having them carry out a small research project, '
                'under close supervision by a member of academic staff. The '
                'course comprises a series of lectures which cover the '
                'following topics: how to define a research problem; writing a '
                'research paper and report; how to give a seminar; the use of '
                'search tools and databases to find relevant literature; '
                'scientific methods in practice; design of an experiment. '
                'Students complete a research project which requires them to '
                'carry out background reading and literature review, and to '
                'prepare a research report and give a seminar at the end of '
                'the course.',
 'id': 'COMP6445',
 'n_units': 6,
 'name': 'A

In [6]:
txt_data = []
for item in data:
    doc = nlp(" ".join(item['requisites']))
    for sent in doc.sents:
        txt_data += sent,

In [7]:
type(txt_data[0])

spacy.tokens.span.Span

## NER Out-of-box performance

In [8]:
# s = "To enrol in this course you must have completed COMP7240 or COMP6240 or COMP2400; and COMP6730 or COMP7230 or COMP6710. You are not able to enrol in this course if you completed COMP3420 or COMP3425 or COMP8400"
s = "To enrol in this course you must be studying a Master of Engineering or Master of Energy Change or Master of Energy Change (Research). Incompatible with ENGN3516."
nlp = spacy.load("en_core_web_sm")
doc = nlp(s)

In [9]:
sentence_spans = list(doc.sents)
displacy.render(sentence_spans, style="dep")

In [10]:
displacy.render(sentence_spans, style="ent")



In [11]:
for token in doc:
    print(token.text, token.dep_, token.pos, token.head.text, token.head.pos_, [child for child in token.children])

To aux 94 enrol VERB []
enrol advcl 100 studying VERB [To, in]
in prep 85 enrol VERB [course]
this det 90 course NOUN []
course pobj 92 in ADP [this]
you nsubj 95 studying VERB []
must aux 87 studying VERB []
be aux 87 studying VERB []
studying ROOT 100 studying VERB [enrol, you, must, be, Master, Change, .]
a det 90 Master PROPN []
Master dobj 96 studying VERB [a, of, or, Master]
of prep 85 Master PROPN [Engineering]
Engineering pobj 96 of ADP []
or cc 89 Master PROPN []
Master conj 96 Master PROPN [of, or, Master]
of prep 85 Master PROPN [Change]
Energy compound 96 Change PROPN []
Change pobj 96 of ADP [Energy]
or cc 89 Master PROPN []
Master conj 96 Master PROPN [of]
of prep 85 Master PROPN [Energy]
Energy pobj 96 of ADP []
Change dobj 96 studying VERB [(, Research, )]
( punct 97 Change PROPN []
Research appos 96 Change PROPN []
) punct 97 Change PROPN []
. punct 97 studying VERB []
Incompatible ROOT 84 Incompatible ADJ [with, .]
with prep 85 Incompatible ADJ [ENGN3516]
ENGN3516 pob

In [12]:
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

this course course pobj in
you you nsubj studying
a Master Master dobj studying
Engineering Engineering pobj of
Master Master conj Master
Energy Change Change pobj of
Master Master conj Master
Energy Energy pobj of
Change Change dobj studying
Research Research appos Change
ENGN3516 ENGN3516 pobj with


## EntityRuler

    "To enrol in this course you must be studying Master of Computing or have successfully completed COMP6700 or COMP6710. You are not able to enrol in this course if you have successfully completed COMP3600."
    "To enrol in this course you must have completed COMP6320; and be studying COMP7705 or COMP7706. Incompatible with COMP4620."
    "To enrol in this course you must have completed COMP7240 or COMP6240 or COMP2400; and COMP6730 or COMP7230 or COMP6710."
    "You are not able to enrol in this course if you completed COMP3420 or COMP3425 or COMP8400
    "To enrol in this course you must have completed COMP6710, and have completed or be currently enrolled in COMP6262. Incompatible with COMP3620."

In [13]:
from parser import parse_requisites, nlp

In [14]:
s = "To enrol in this course you must be studying Master of Computing or have successfully completed COMP6700 or COMP6710. You are not able to enrol in this course if you have successfully completed COMP3600."
doc = nlp(s)

displacy.render(list(doc.sents), style="ent")

In [15]:
displacy.render(list(doc.sents), style="dep")

In [16]:
displacy.render(list(doc.sents), style="ent")

In [92]:
"hello \\- my name is".replace('\\', '')

'hello - my name is'

In [17]:
doc.ents

(Master of Computing, COMP6700, COMP6710, COMP3600)

In [20]:
for ent in doc.ents:
    break

In [22]:
doc[2:5].text

'in this course'

In [18]:
reqs = parse_requisites(doc)

In [24]:
pprint(reqs, indent=4)

[   {   'description': 'To enrol in this course you must be studying Master of '
                       'Computing or have successfully completed COMP6700 or '
                       'COMP6710.',
        'operator': {   'OR': [   {   'classes': ['Master', 'of', 'Computing'],
                                      'condition': 'studying',
                                      'description': 'To enrol in this course '
                                                     'you must be studying '
                                                     'Master of Computing',
                                      'negation': False,
                                      'operator': 'N/A',
                                      'programs': []},
                                  {   'classes': ['COMP6700', 'COMP6710'],
                                      'condition': 'completed',
                                      'description': 'have successfully '
                                              

In [246]:
len(reqs)

1

In [39]:
for token in doc:
    if token.text == "COMP6700":
        break

In [41]:
for sent in doc.sents:
    break

In [43]:
sent.ents

[Master of Computing, COMP6700, COMP6710]

In [49]:
sent[:-3].ents

[Master of Computing, COMP6700]

In [44]:
for ent in sent.ents:
    break

In [46]:
ent.label_

'PROGRAM'

In [28]:
token.ent_type_

'CLASS'

In [37]:
sent

Incompatible with ENGN4536.

In [264]:
token.pos_

'VERB'

In [235]:
token.text.upper()

'AND'

In [207]:
token.head

completed

In [212]:
token.head.ent_type_

''

    "To enrol in this course you must be studying a Master of Engineering, and have completed ENGN3223 or ENGN6223 or an equivalent course in control systems."
    "To enrol in this course you must have completed COMP6710, and have completed or be currently enrolled in COMP6262."
    "To enrol in this course you must have completed COMP6670 OR (6 units of (either COMP6710 or COMP6730 or COMP7230) and COMP8410 and STAT6039)."
    "To enrol in this course you must be studying a Master of Computing or completed or currently studying COMP6700. Incompatible with COMP2300, ENGN2219 and COMP6719."

In [90]:
samples = ["To enrol in this course you must have successfully completed or be currently studying COMP6442. Incompatible with COMP2120, COMP2130 and COMP6311.",
           "To enrol in this course you must be enrolled in or completed COMP6442. Incompatible with COMP2120, COMP2130 and COMP6311.",
           "To enrol in this course you must have completed COMP6442. Incompatible with COMP2120, COMP2130 and COMP6311.",
           "To enrol in this course you must be studying a Master of Computing or completed or currently studying COMP6700. Incompatible with COMP2300, ENGN2219 and COMP6719."]

for s in samples:
    doc = nlp(s)
    for idx, token in enumerate(doc, 1):
        if token.pos_ != 'ADV':
            print(token, end=" ")
            # print(token, token.pos_, end=", ")
        # if idx % 8 == 0:
            # print("")
    print("\n")

To enrol in this course you must have completed or be studying COMP6442 . Incompatible with COMP2120 , COMP2130 and COMP6311 . 

To enrol in this course you must be enrolled in or completed COMP6442 . Incompatible with COMP2120 , COMP2130 and COMP6311 . 

To enrol in this course you must have completed COMP6442 . Incompatible with COMP2120 , COMP2130 and COMP6311 . 

To enrol in this course you must be studying a Master of Computing or completed or studying COMP6700 . Incompatible with COMP2300 , ENGN2219 and COMP6719 . 



[To,
 enrol,
 in,
 this,
 course,
 you,
 must,
 have,
 completed,
 or,
 be,
 studying,
 COMP6442,
 .,
 Incompatible,
 with,
 COMP2120,
 ,,
 COMP2130,
 and,
 COMP6311,
 .]

In [65]:
displacy.render(list(doc.sents), style="dep")

In [66]:
displacy.render(list(doc.sents), style="ent")

In [67]:
[token.lemma_ for token in doc if not token.is_stop]

['enrol',
 'course',
 'complete',
 'comp6442',
 '.',
 'incompatible',
 'comp2120',
 ',',
 'COMP2130',
 'COMP6311',
 '.']

In [38]:
token.is_punct

True

In [33]:
[t.text for t in token.lefts]

[]

In [34]:
[t.text for t in token.rights]

[]

In [35]:
token.i

10

In [36]:
doc[10]

OR

In [40]:
ent.label_

'PROGRAM'

In [43]:
list(range(0, 13, 3))[:3]

[0, 3, 6]