In [1]:
# https://realpython.com/natural-language-processing-spacy-python/#rule-based-matching-using-spacy
import spacy #NLP library in python
# Load the language model instance in spaCy:
nlp = spacy.load('en_core_web_sm')

In [2]:
#create a processed Doc object, which is a container for accessing linguistic annotations, for a given input string:
introduction_text = ('This tutorial is about Natural Language Processing in Spacy.')
introduction_doc = nlp(introduction_text)
# Extract tokens for the given doc
print ([token.text for token in introduction_doc])

['This', 'tutorial', 'is', 'about', 'Natural', 'Language', 'Processing', 'in', 'Spacy', '.']


In [4]:
#create a processed Doc object from a text file:
file_name = "Spacy_Data/example.txt"
file_text = open(file_name).read()
file_doc = nlp(file_text)
# Extract tokens for the given doc
print ([token.text for token in file_doc])

['Mr.', 'Biden', '’s', 'announcement', 'offered', 'a', 'telling', 'split', '-', 'screen', 'counterpoint', 'to', 'an', 'event', 'being', 'held', 'at', 'the', 'same', 'time', 'at', 'the', 'White', 'House', ':', 'a', 'vaccine', 'summit', 'where', 'President', 'Trump', 'boasted', 'about', 'what', 'he', 'called', 'a', '“', 'monumental', 'national', 'achievement', '”', 'by', 'drug', 'companies', 'to', 'develop', 'a', 'vaccine', 'for', 'the', 'virus', 'in', 'about', 'nine', 'months', '.', 'He', 'did', 'not', 'address', 'the', 'growing', 'death', 'toll', 'or', 'the', 'devastation', 'across', 'the', 'country', ',', 'but', 'he', 'used', 'the', 'occasion', 'to', 'suggest', ',', 'yet', 'again', 'and', 'without', 'evidence', ',', 'that', 'people', 'had', 'tried', 'to', '“', 'steal', '”', 'the', 'election', '.']


In [5]:
# Sentence Detection: process of locating the start and end of sentences in a given text; 
# used in tasks such as part of speech tagging or entity extraction
about_text = ('Gus Proto is a Python developer currently' + 
 ' working for a London-based Fintech' + 
 ' company. He is interested in learning'+ 
 ' Natural Language Processing.')
about_doc = nlp(about_text)
# sents property is used to extract sentences: total number of sentences & sentence itself
sentences = list(about_doc.sents)
print(len(sentences))
for sentence in sentences:
    print (sentence)

2
Gus Proto is a Python developer currently working for a London-based Fintech company.
He is interested in learning Natural Language Processing.


In [6]:
# customize the sentence detection to detect sentences on custom delimiters e.g., (...) as the delimeter
def set_custom_boundaries(doc):
     # Adds support to use `...` as the delimiter for sentence detection
     for token in doc[:-1]:
         if token.text == '...':
             doc[token.i+1].is_sent_start = True
     return doc

ellipsis_text = ('Gus, can you, ... never mind, I forgot' +
 ' what I was saying. So, do you think' + 
 ' we should ...')
# Load a new model instance
custom_nlp = spacy.load('en_core_web_sm')
custom_nlp.add_pipe(set_custom_boundaries, before='parser')
custom_ellipsis_doc = custom_nlp(ellipsis_text)
# sents property is used to extract sentences: total number of sentences & sentence itself
custom_ellipsis_sentences = list(custom_ellipsis_doc.sents)
print("sentence detection with custom delimiters")
for sentence in custom_ellipsis_sentences:
    print(sentence)

print("sentence detection without custom delimiters to see the difference")
ellipsis_doc = nlp(ellipsis_text)
ellipsis_sentences = list(ellipsis_doc.sents)
for sentence in ellipsis_sentences:
    print(sentence)

#custom_ellipsis_sentences contain three sentences, whereas ellipsis_sentences contains two sentences. 
# These sentences are still obtained via the sents attribute

sentence detection with custom delimiters
Gus, can you, ...
never mind, I forgot what I was saying.
So, do you think we should ...
sentence detection without custom delimiters to see the difference
Gus, can you, ... never mind, I forgot what I was saying.
So, do you think we should ...


In [7]:
# Tokenization in spacy: next step after sentence detection -- identifies the basic units in text: tokens
# usage: used for further analysis i.e., part of speech tagging
# print tokens by iterating on the Doc object:(token, starting index): useful for in-place word replacement
for token in about_doc:
    print (token, token.idx)

Gus 0
Proto 4
is 10
a 13
Python 15
developer 22
currently 32
working 42
for 50
a 54
London 56
- 62
based 63
Fintech 69
company 77
. 84
He 86
is 89
interested 92
in 103
learning 106
Natural 115
Language 123
Processing 132
. 142


In [8]:
# other attributes
# text_with_ws prints token text with trailing space (if present).
# is_alpha detects if the token consists of alphabetic characters or not.
# is_punct detects if the token is a punctuation symbol or not.
# is_space detects if the token is a space or not.
# shape_ prints out the shape of the word.
# is_stop detects if the token is a stop word or not.

for token in about_doc:
    print (token, token.idx, token.text_with_ws,
        token.is_alpha, token.is_punct, token.is_space,
        token.shape_, token.is_stop)

Gus 0 Gus  True False False Xxx False
Proto 4 Proto  True False False Xxxxx False
is 10 is  True False False xx True
a 13 a  True False False x True
Python 15 Python  True False False Xxxxx False
developer 22 developer  True False False xxxx False
currently 32 currently  True False False xxxx False
working 42 working  True False False xxxx False
for 50 for  True False False xxx True
a 54 a  True False False x True
London 56 London True False False Xxxxx False
- 62 - False True False - False
based 63 based  True False False xxxx False
Fintech 69 Fintech  True False False Xxxxx False
company 77 company True False False xxxx False
. 84 .  False True False . False
He 86 He  True False False Xx True
is 89 is  True False False xx True
interested 92 interested  True False False xxxx False
in 103 in  True False False xx True
learning 106 learning  True False False xxxx False
Natural 115 Natural  True False False Xxxxx False
Language 123 Language  True False False Xxxxx False
Processing 132 Pro

In [9]:
#customize tokenization process to detect tokens on custom characters e.g., hyphenated words London-based
import re
from spacy.tokenizer import Tokenizer
prefix_re = spacy.util.compile_prefix_regex(custom_nlp.Defaults.prefixes)
suffix_re = spacy.util.compile_suffix_regex(custom_nlp.Defaults.suffixes)
infix_re = re.compile(r'''[-~]''')

# In order for you to customize, you can pass various parameters to the Tokenizer class:
# nlp.vocab is a storage container for special cases and is used to handle cases like contractions and emoticons.
# prefix_search is the function that is used to handle preceding punctuation, such as opening parentheses.
# infix_finditer is the function that is used to handle non-whitespace separators, such as hyphens.
# suffix_search is the function that is used to handle succeeding punctuation, such as closing parentheses.
# token_match is an optional boolean function that is used to match strings that should never be split. 
# It overrides the previous rules and is useful for entities like URLs or numbers.
def customize_tokenizer(nlp):
    # Adds support to use `-` as the delimiter for tokenization
    return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
         suffix_search=suffix_re.search,
         infix_finditer=infix_re.finditer,
         token_match=None)

custom_nlp.tokenizer = customize_tokenizer(custom_nlp)
custom_tokenizer_about_doc = custom_nlp(about_text)
print([token.text for token in custom_tokenizer_about_doc])

['Gus', 'Proto', 'is', 'a', 'Python', 'developer', 'currently', 'working', 'for', 'a', 'London', '-', 'based', 'Fintech', 'company', '.', 'He', 'is', 'interested', 'in', 'learning', 'Natural', 'Language', 'Processing', '.']


In [14]:
# spaCy has a list of stop words for the English language:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stopwords)

for stop_word in list(spacy_stopwords)[:10]:
     print(stop_word)
        
#remove stop words from the input text:
for token in about_doc:
    if not token.is_stop:
        print (token)

#create a list of tokens not containing stop words:
about_no_stopword_doc = [token for token in about_doc if not token.is_stop]
print (about_no_stopword_doc)

#lemmatization: process of reducing words to its original form
conference_help_text = ('Gus is helping organize a developer'+
'conference on Applications of Natural Language'+
' Processing. He keeps organizing local Python meetups'+
' and several internal talks at his workplace.')
conference_help_doc = nlp(conference_help_text)
for token in conference_help_doc:
    print (token, token.lemma_)

every
else
did
‘m
any
another
however
often
anyone
whenever
Gus
Proto
Python
developer
currently
working
London
-
based
Fintech
company
.
interested
learning
Natural
Language
Processing
.
[Gus, Proto, Python, developer, currently, working, London, -, based, Fintech, company, ., interested, learning, Natural, Language, Processing, .]
Gus Gus
is be
helping help
organize organize
a a
developerconference developerconference
on on
Applications Applications
of of
Natural Natural
Language Language
Processing Processing
. .
He -PRON-
keeps keep
organizing organize
local local
Python Python
meetups meetup
and and
several several
internal internal
talks talk
at at
his -PRON-
workplace workplace
. .


In [18]:
#word Frequency using Spacy
from collections import Counter
complete_text = ('Gus Proto is a Python developer currently'
    'working for a London-based Fintech company. He is'
    ' interested in learning Natural Language Processing.'
    ' There is a developer conference happening on 21 July'
    ' 2019 in London. It is titled "Applications of Natural'
    ' Language Processing". There is a helpline number '
    ' available at +1-1234567891. Gus is helping organize it.'
    ' He keeps organizing local Python meetups and several'
    ' internal talks at his workplace. Gus is also presenting'
    ' a talk. The talk will introduce the reader about "Use'
    ' cases of Natural Language Processing in Fintech".'
    ' Apart from his work, he is very passionate about music.'
    ' Gus is learning to play the Piano. He has enrolled '
    ' himself in the weekend batch of Great Piano Academy.'
    ' Great Piano Academy is situated in Mayfair or the City'
    ' of London and has world-class piano instructors.')

complete_doc = nlp(complete_text)
# Remove stop words and punctuation symbols
words = [token.text for token in complete_doc
         if not token.is_stop and not token.is_punct]

word_freq = Counter(words)
# 5 commonly occurring words with their frequencies
print("Top 5 common Words with their frequenc")
common_words = word_freq.most_common(5)
print (common_words)
print("\n")

# Unique words
print("Unique Words")
unique_words = [word for (word, freq) in word_freq.items() if freq == 1]
print (unique_words)
print("\n")

#example why removing stop word is important
print("example why removing stop word is important")
words_all = [token.text for token in complete_doc if not token.is_punct]
word_freq_all = Counter(words_all)
# 5 commonly occurring words with their frequencies
common_words_all = word_freq_all.most_common(5)
print (common_words_all)

Top 5 common Words with their frequenc
[('Gus', 4), ('London', 3), ('Natural', 3), ('Language', 3), ('Processing', 3)]


Unique Words
['Proto', 'currentlyworking', 'based', 'company', 'interested', 'conference', 'happening', '21', 'July', '2019', 'titled', 'Applications', 'helpline', 'number', 'available', '+1', '1234567891', 'helping', 'organize', 'keeps', 'organizing', 'local', 'meetups', 'internal', 'talks', 'workplace', 'presenting', 'introduce', 'reader', 'Use', 'cases', 'Apart', 'work', 'passionate', 'music', 'play', 'enrolled', 'weekend', 'batch', 'situated', 'Mayfair', 'City', 'world', 'class', 'piano', 'instructors']


example why removing stop word is important
[('is', 10), ('a', 5), ('in', 5), ('Gus', 4), ('of', 4)]


In [19]:
#parts of speech tagging
for token in about_doc:
    print (token, token.tag_, token.pos_, spacy.explain(token.tag_))
    
#tag_ lists the fine-grained part of speech.
#pos_ lists the coarse-grained part of speech.
#spacy.explain gives descriptive details about a particular POS tag. 

nouns = []
adjectives = []
for token in about_doc:
    if token.pos_ == 'NOUN':
        nouns.append(token)
    if token.pos_ == 'ADJ':
        adjectives.append(token)
print(nouns)
print(adjectives)

Gus NNP PROPN noun, proper singular
Proto NNP PROPN noun, proper singular
is VBZ AUX verb, 3rd person singular present
a DT DET determiner
Python NNP PROPN noun, proper singular
developer NN NOUN noun, singular or mass
currently RB ADV adverb
working VBG VERB verb, gerund or present participle
for IN ADP conjunction, subordinating or preposition
a DT DET determiner
London NNP PROPN noun, proper singular
- HYPH PUNCT punctuation mark, hyphen
based VBN VERB verb, past participle
Fintech NNP PROPN noun, proper singular
company NN NOUN noun, singular or mass
. . PUNCT punctuation mark, sentence closer
He PRP PRON pronoun, personal
is VBZ AUX verb, 3rd person singular present
interested JJ ADJ adjective
in IN ADP conjunction, subordinating or preposition
learning VBG VERB verb, gerund or present participle
Natural NNP PROPN noun, proper singular
Language NNP PROPN noun, proper singular
Processing NNP PROPN noun, proper singular
. . PUNCT punctuation mark, sentence closer
[developer, company

In [None]:
# Visualization
from spacy import displacy
about_interest_text = ('He is interested in learning'+
    ' Natural Language Processing.')
about_interest_doc = nlp(about_interest_text)
displacy.serve(about_interest_doc, style='dep')

  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



In [None]:
#demonstration of a preprocessing function
#preprocessing function that takes text as input and applies the following operations:

# Lowercases the text
# Lemmatizes each token
# Removes punctuation symbols
# Removes stop words
def is_token_allowed(token):
    '''
        Only allow valid tokens which are not stop words
        and punctuation symbols.
    '''
    if (not token or not token.string.strip() or
        token.is_stop or token.is_punct):
        return False
    return True

def preprocess_token(token):
    # Reduce token to its lowercase lemma form
    return token.lemma_.strip().lower()

#if token is allowed, then sent it to preprocess method and add it to the list
complete_filtered_tokens = [preprocess_token(token)
    for token in complete_doc if is_token_allowed(token)]

print(complete_filtered_tokens)

In [35]:
about_text = ('a unit rate is a ratio where the denominator is basically it compares different quantities so for an example 2.99/1 poind')
about_doc = nlp(about_text)

    
# #parts of speech tagging
# for token in about_doc:
#     print (token, token.tag_, token.pos_, spacy.explain(token.tag_))
    
#tag_ lists the fine-grained part of speech.
#pos_ lists the coarse-grained part of speech.
#spacy.explain gives descriptive details about a particular POS tag. 

nouns = []
adjectives = []
pron = []
conjunction = []
for token in about_doc:
    if token.pos_ == 'NOUN':
        nouns.append(token)
    if token.pos_ == 'ADJ':
        adjectives.append(token)
    if token.pos_ == 'PRON':
        pron.append(token)
    if token.pos_ == 'SCONJ':
        conjunction.append(token)
print('nouns :: ', nouns)
print('adjectives :: ',adjectives)
print('pronouns :: ',pron)
print('conjunction :: ',conjunction)

nouns ::  [unit, rate, ratio, denominator, quantities, example, poind]
adjectives ::  [different]
pronouns ::  [it]
conjunction ::  [so]
