# Basic Preprocessing

Case folding - converting all tokens to lower (or upper) case. This merges duplicate tokens that differ only in capitalisation, but it can also lose information, e.g. the name 'Cook' becomes indistinguishable from the verb 'cook'.

Stop word removal - words that occur frequently but carry little information
eg: the, a, of, this, that etc
Whether to remove them again depends on the task at hand, e.g. in sentiment analysis, removing stop words can negatively affect the result where emphasis on articles influences sentiment
eg: I need a house - I need the house -> article necessary for full understanding


In [1]:
import spacy

In [2]:
# Load the "en_core_web_sm" pipeline once; `nlp` is called on raw text in the cells below.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Sample sentence used by the preprocessing cells below.
statement = "He told Dr.Lovato that he was done with the tests and would post the results shortly."
# Run the loaded pipeline over the sentence; later cells iterate over the tokens of `doc`.
doc = nlp(statement)

In [5]:
# Case folding: show the lower-cased text of every token in the sentence.
print([token.lower_ for token in doc])

['he', 'told', 'dr.', 'lovato', 'that', 'he', 'was', 'done', 'with', 'the', 'tests', 'and', 'would', 'post', 'the', 'results', 'shortly', '.']


In [7]:
# Case-fold every token except the first token of a sentence, which keeps its
# original casing. The sentence-initial branch uses `t.text` rather than the
# Token `t` itself, so the resulting list contains only strings.
print([t.text if t.is_sent_start else t.lower_ for t in doc])

['He', 'told', 'dr.', 'lovato', 'that', 'he', 'was', 'done', 'with', 'the', 'tests', 'and', 'would', 'post', 'the', 'results', 'shortly', '.']


In [9]:
# Default stop words within en_core_web_sm, read from nlp.Defaults.stop_words.
# The collection prints as a set, so the order shown is arbitrary.
print(nlp.Defaults.stop_words)

{'at', 'ten', 'part', 'you', 'nothing', 'her', 'well', 'it', 'none', 'own', 'down', 'due', 'other', 'him', 'quite', 'ever', 'over', 'whatever', 'until', 'whom', 'three', 'give', 'already', 'without', 'above', 'however', 'nevertheless', 'mostly', 'thence', 'namely', 'against', "'ve", 'n‘t', 'make', 'who', 'whole', 'somewhere', 'us', 'my', 'keep', 'only', 'both', 'off', '‘s', 'across', 'from', 'up', 'latter', 'hence', 'somehow', 'five', 'seeming', 'them', 'fifteen', '‘m', 'same', 'anything', 'forty', 'thereupon', 'hereafter', 'even', 'whose', 'full', 'top', 'along', 'any', 'herein', 'she', 'whereupon', 'show', 'thru', 'next', 'ca', 'could', '‘ll', 'becoming', 'seem', 'anyone', 'these', 'besides', '‘d', 'please', 'still', 'six', 'serious', 'have', 'beside', 'there', 'beforehand', 'themselves', 'those', 'seems', 'rather', 'more', 'else', 'very', 'fifty', 'per', 'two', 'hereby', 'sixty', 'do', 'amongst', 'eleven', 'others', "'s", 'whence', 'cannot', 'every', 'four', 'several', '’s', 'everyt

In [12]:
# Keep only the tokens that are not stop words (punctuation such as '.' is
# not a stop word, so it is kept as well).
print([token for token in doc if not token.is_stop])

[told, Dr., Lovato, tests, post, results, shortly, .]


### Lemmatization
 This is the process of grouping together the inflected forms of a word so they can be analysed as a single item, identified by the word's lemma, or dictionary form.

In [14]:
# Pair each token's original text with its lemma (dictionary form).
[(token.text, token.lemma_) for token in doc]

[('He', 'he'),
 ('told', 'tell'),
 ('Dr.', 'Dr.'),
 ('Lovato', 'Lovato'),
 ('that', 'that'),
 ('he', 'he'),
 ('was', 'be'),
 ('done', 'do'),
 ('with', 'with'),
 ('the', 'the'),
 ('tests', 'test'),
 ('and', 'and'),
 ('would', 'would'),
 ('post', 'post'),
 ('the', 'the'),
 ('results', 'result'),
 ('shortly', 'shortly'),
 ('.', '.')]

### Advanced Preprocessing

Part-of-Speech Tagging, Named Entity Recognition, and Parsing

In [16]:
# The part-of-speech tag of a token is available through its pos_ attribute;
# pair it with the token text.
[(token.text, token.pos_) for token in doc]

[('He', 'PRON'),
 ('told', 'VERB'),
 ('Dr.', 'PROPN'),
 ('Lovato', 'PROPN'),
 ('that', 'SCONJ'),
 ('he', 'PRON'),
 ('was', 'AUX'),
 ('done', 'VERB'),
 ('with', 'ADP'),
 ('the', 'DET'),
 ('tests', 'NOUN'),
 ('and', 'CCONJ'),
 ('would', 'AUX'),
 ('post', 'VERB'),
 ('the', 'DET'),
 ('results', 'NOUN'),
 ('shortly', 'ADV'),
 ('.', 'PUNCT')]

In [22]:
# spacy.explain() turns a tag into a readable description,
# e.g. spacy.explain('PRON') gives 'pronoun'.
# Print that description next to each token's POS tag.
for t in doc:
    print(f"'{t.text}' : '{spacy.explain(t.pos_)}'")

'He' : 'pronoun'
'told' : 'verb'
'Dr.' : 'proper noun'
'Lovato' : 'proper noun'
'that' : 'subordinating conjunction'
'he' : 'pronoun'
'was' : 'auxiliary'
'done' : 'verb'
'with' : 'adposition'
'the' : 'determiner'
'tests' : 'noun'
'and' : 'coordinating conjunction'
'would' : 'auxiliary'
'post' : 'verb'
'the' : 'determiner'
'results' : 'noun'
'shortly' : 'adverb'
'.' : 'punctuation'


In [24]:
# The POS tags above are coarse-grained. The tag_ attribute exposes the
# fine-grained tags, which carry more detailed information.
[(token.text, token.tag_) for token in doc]

[('He', 'PRP'),
 ('told', 'VBD'),
 ('Dr.', 'NNP'),
 ('Lovato', 'NNP'),
 ('that', 'IN'),
 ('he', 'PRP'),
 ('was', 'VBD'),
 ('done', 'VBN'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('tests', 'NNS'),
 ('and', 'CC'),
 ('would', 'MD'),
 ('post', 'VB'),
 ('the', 'DT'),
 ('results', 'NNS'),
 ('shortly', 'RB'),
 ('.', '.')]

In [25]:
# Print the readable description of each token's fine-grained tag.
for token in doc:
    print(f"'{token.text}' : '{spacy.explain(token.tag_)}'")

'He' : 'pronoun, personal'
'told' : 'verb, past tense'
'Dr.' : 'noun, proper singular'
'Lovato' : 'noun, proper singular'
'that' : 'conjunction, subordinating or preposition'
'he' : 'pronoun, personal'
'was' : 'verb, past tense'
'done' : 'verb, past participle'
'with' : 'conjunction, subordinating or preposition'
'the' : 'determiner'
'tests' : 'noun, plural'
'and' : 'conjunction, coordinating'
'would' : 'verb, modal auxiliary'
'post' : 'verb, base form'
'the' : 'determiner'
'results' : 'noun, plural'
'shortly' : 'adverb'
'.' : 'punctuation mark, sentence closer'


### Named Entity Recognition
There are multiple ways to access Named Entities, one is via ent_type_

In [28]:
# Second sample sentence, used for the named-entity cells below.
statement2 = "Volkswagen is developing an electric sedan which could potentially come to America next fall."
document = nlp(statement2)
# ent_type_ is the entity label of a token; it is the empty string for tokens
# that are not part of any entity (see the output below).
[(t.text, t.ent_type_) for t in document]

[('Volkswagen', 'ORG'),
 ('is', ''),
 ('developing', ''),
 ('an', ''),
 ('electric', ''),
 ('sedan', ''),
 ('which', ''),
 ('could', ''),
 ('potentially', ''),
 ('come', ''),
 ('to', ''),
 ('America', 'GPE'),
 ('next', 'DATE'),
 ('fall', 'DATE'),
 ('.', '')]

In [39]:
# Describe the entity type of every token that belongs to a named entity.
for token in document:
    if not token.ent_type_:
        continue  # empty label: the token is not part of an entity
    print(f"'{token.text}' : '{spacy.explain(token.ent_type_)}'")

'Volkswagen' : 'Companies, agencies, institutions, etc.'
'America' : 'Countries, cities, states'
'next' : 'Absolute or relative dates or periods'
'fall' : 'Absolute or relative dates or periods'


In [42]:
# Named entities can also be read directly from the document via
# document.ents. Each item is a whole entity (e.g. 'next fall'), not a
# single token.
for entity in document.ents:
    if not entity.label_:
        continue
    print(f"'{entity.text}' : '{spacy.explain(entity.label_)}'")

'Volkswagen' : 'Companies, agencies, institutions, etc.'
'America' : 'Countries, cities, states'
'next fall' : 'Absolute or relative dates or periods'


In [44]:
# Each entity also exposes its start and end character offsets in the text.
print([(span.text, span.label_, span.start_char, span.end_char) for span in document.ents])

[('Volkswagen', 'ORG', 0, 10), ('America', 'GPE', 75, 82), ('next fall', 'DATE', 83, 92)]


## Visualisers 
We can use visualizers for both parsing and named entities

In [45]:
from spacy import displacy

In [46]:
# Render the named entities of `document` with displaCy's 'ent' style, inline in the notebook.
displacy.render(document, style='ent', jupyter=True)

In [47]:
# Short sample sentence used to illustrate dependency parsing.
unidoc = nlp("She enrolled in the course at the university")

# Render the dependency parse of `unidoc` with displaCy's 'dep' style, inline in the notebook.
displacy.render(unidoc, style='dep', jupyter=True)

In [49]:
# A token's dependency label is available through its .dep_ attribute.
# NOTE(review): spacy.explain() returns None when it has no description for a
# label; that is formatted as the text 'None' (seen for 'enrolled' in the
# recorded run below).
for token in unidoc:
    if not token.dep_:
        continue
    print(f"'{token.text}' : '{spacy.explain(token.dep_)}'")

'She' : 'nominal subject'
'enrolled' : 'None'
'in' : 'prepositional modifier'
'the' : 'determiner'
'course' : 'object of preposition'
'at' : 'prepositional modifier'
'the' : 'determiner'
'university' : 'object of preposition'
