### Install libraries

In [3]:
import sys
!{sys.executable} -m pip install spacy



In [4]:
!{sys.executable} -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
#Import libraries
import spacy
import pathlib

In [7]:
#load spacy's default language
nlp = spacy.load("en_core_web_sm")

### Getting started

In [8]:
# Construct a Doc object, which a sequence of token objects, representing a lexical token
# Each token has information about a particular piece of text
# Create a Doc object, which allows us to access information about the processed text
# The doc is separated into tokens, which breaks down the sentence into its individual words
introduction_doc = nlp("This tutorial is about Natural Language Processing in spaCy. ")
print(type(introduction_doc))
# Create a series of Token objects
print([token.text for token in introduction_doc])

<class 'spacy.tokens.doc.Doc'>
['This', 'tutorial', 'is', 'about', 'Natural', 'Language', 'Processing', 'in', 'spaCy', '.']


In [21]:
import os

notebook_path = os.path.abspath("natural_language_processing/introduction.txt")
print(notebook_path)

introduction_doc = nlp(pathlib.Path(notebook_path).read_text(encoding="utf-8"))
print ([token.text for token in introduction_doc])

/Users/kenny/Developer/jupyter/natural_language_processing/introduction.txt
['This', 'tutorial', 'is', 'about', 'Natural', 'Language', 'Processing', 'in', 'spaCy', '.']


### Sentence detection

In [30]:
# Locate where sentences start and end in a given text (simply find where the period is)
about_text = (
     "Gus Proto is a Python developer currently working for a London-based Fintech company. He is interested in learning Natural Language Processing."
)

# Create the nlp object
about_doc = nlp(about_text)
# .sents attribute 
sentences = list(about_doc.sents)
len(sentences)

for sentence in sentences:
    print(f"{sentence[::]}...")

Gus Proto is a Python developer currently working for a London-based Fintech company....
He is interested in learning Natural Language Processing....


### Creating custom sentence pipelines

In [38]:
ellipsis_text = ("Gus, can you, ... never mind, I forgot what I was saying. So, do you think we should ...")

from spacy.language import Language
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    for token in doc [:-1]:
        if token.text == "...":
            doc[token.i+1].is_sent_start = True
    return doc
    
#Create another nlp class
custom_nlp = spacy.load("en_core_web_sm")
# Adding a custom pipeline
custom_nlp.add_pipe("set_custom_boundaries", before="parser")
# Create a doc object and pass in the text
custom_ellipsis_doc = custom_nlp(ellipsis_text)
# Iterating over sentences, the Doc objects' sentences occur at "..." and "."
custom_ellipsis_sentences = list(custom_ellipsis_doc.sents)
for sentence in custom_ellipsis_sentences:
    print(sentence)

Gus, can you, ...
never mind, I forgot what I was saying.
So, do you think we should ...


### Tokens

In [40]:
# Tokenization breaks down a text into basic units or tokens
# Tokens have the word they store, and the index in the string, as an attribute
about_text = (
     "Gus Proto is a Python developer currently working for a London-based Fintech company. He is interested in learning Natural Language Processing."
)

nlp = spacy.load("en_core_web_sm")
about_doc = nlp(about_text)

for token in about_doc:
    print (token, token.idx)


Gus 0
Proto 4
is 10
a 13
Python 15
developer 22
currently 32
working 42
for 50
a 54
London 56
- 62
based 63
Fintech 69
company 77
. 84
He 86
is 89
interested 92
in 103
learning 106
Natural 115
Language 123
Processing 132
. 142


In [41]:
# Spacy also gives other types of information about a piece of text

print(
    f"{"Text with Whitespace":22}"
    f"{"Is Alphanumeric?":15}"
    f"{"Is Punctuation?":18}"
    f"{"Is Stop Word?"}"
)
for token in about_doc:
    print(
        # Trailing space or not
        f"{str(token.text_with_ws):22}"
        # Alphabetic characters or not
        f"{str(token.is_alpha):15}"
        # Is a punctuation symbol or not
        f"{str(token.is_punct):18}"
        # Is a stop word or not
        f"{str(token.is_stop)}"
    )


Text with Whitespace  Is Alphanumeric?Is Punctuation?   Is Stop Word?
Gus                   True           False             False
Proto                 True           False             False
is                    True           False             True
a                     True           False             True
Python                True           False             False
developer             True           False             False
currently             True           False             False
working               True           False             False
for                   True           False             True
a                     True           False             True
London                True           False             False
-                     False          True              False
based                 True           False             False
Fintech               True           False             False
company               True           False             False
.                  

### Custom tokenization process

In [53]:
custom_about_text = (
     "Gus Proto is a Python developer currently working for a London@based Fintech company. He is interested in learning Natural Language Processing."
)
# London@based is read as one token
print([token.text for token in nlp(custom_about_text)[8:15]])

import re
from spacy.tokenizer import Tokenizer

custom_nlp = spacy.load("en_core_web_sm")

# Generate the custom regex objects for the Tokenizer
prefix_re = spacy.util.compile_prefix_regex(
    custom_nlp.Defaults.prefixes
)
suffix_re = spacy.util.compile_suffix_regex(
    custom_nlp.Defaults.suffixes
)

# Add a custom infix with new regex patterns
custom_infixes = [r"@"]

# Join the custom infixes
infix_re = spacy.util.compile_infix_regex(
    list(custom_nlp.Defaults.infixes) + custom_infixes
)

# print(custom_nlp.Defaults.infixes)

# Create a custom nlp tokenizer
custom_nlp.tokenizer = Tokenizer(
    # Handles contractions and emoticons
    nlp.vocab,
    # Handles preceding punctuation (eg. opening parenthesis)
    prefix_search=prefix_re.search,
    # Handles ending punctuation  (eg. closing parenthesis)
    suffix_search=suffix_re.search,
    # Handles non-whitespace separators such as hyphens
    infix_finditer=infix_re.finditer,
    # Handles strings that should not split, including links and numbers
    token_match=None,
)

custom_tokenizer_about_doc = custom_nlp(custom_about_text)

print([token.text for token in custom_tokenizer_about_doc[8:15]])


['for', 'a', 'London@based', 'Fintech', 'company', '.', 'He']
['for', 'a', 'London', '@', 'based', 'Fintech', 'company']


In [60]:
# Stop words
# Defined as common words in a language such as "the, but, they, etc"
# Stop words are generally removed from NLP because they aren't significant
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stopwords)
# print(spacy_stopwords)

custom_about_text = (
     "Gus Proto is a Python developer currently working for a London@based Fintech company. He is interested in learning Natural Language Processing."
)

about_doc = nlp(custom_about_text)
print([token for token in about_doc if not token.is_stop])

[Gus, Proto, Python, developer, currently, working, London@based, Fintech, company, ., interested, learning, Natural, Language, Processing, .]


### Lemmatization

In [62]:
# Reducing inflected forms of a word while still enduring the reduced form belongs to the language
# The reduced word is a root word that is called a lemma
# eg. organize, organzed, organizing are all forms of the word organize, and organize is the lemma
# Lemmatization helps you reduce the inflected forms of a word so they can be analyzed as a single item
# Lemmatization helps you avoid duplicate words
conference_help_text = (
    "Gus is helping organize a developer conference on Applications of Natural Language Processing. He keeps organizing local Python meetups and several internal talks at his workplace."
)

conference_help_doc = nlp(conference_help_text)

for token in conference_help_doc:
    if str(token) != str(token.lemma_):
        print(f"{str(token):>20} : {str(token.lemma_)}")

                  is : be
                  He : he
               keeps : keep
          organizing : organize
             meetups : meetup
               talks : talk


In [68]:
# Word frequency analysis example
from collections import Counter
complete_text = ("Gus Proto is a Python developer currently working for a London-based Fintech company. He is interested in learning Natural Language Processing. There is a developer conference happening on 21 July 2019 in London. It is titled 'Applications of Natural Language Processing'. There is a helpline number Ωavailable at +44-1234567891. Gus is helping organize it. He keeps organizing local Python meetups and several internal talks at his workplace. Gus is also presenting a talk. The talk will introduce the reader about the use cases of Natural Language Processing in Fintech. Apart from his work, he is very passionate about music.")

complete_doc = nlp(complete_text)

# Remove all stop words
words = [
    token.text
    for token in complete_doc
    if not token.is_stop and not token.is_punct
]

print(Counter(words).most_common(10))

# If we did not remove stop words:

Counter(
    [token.text for token in complete_doc if not token.is_punct]
).most_common(8)


[('Gus', 3), ('Natural', 3), ('Language', 3), ('Processing', 3), ('Python', 2), ('developer', 2), ('London', 2), ('Fintech', 2), ('talk', 2), ('Proto', 1)]


[('is', 8),
 ('a', 5),
 ('Gus', 3),
 ('in', 3),
 ('Natural', 3),
 ('Language', 3),
 ('Processing', 3),
 ('Python', 2)]

### Speech tagging


In [78]:
# Speech tagging explains how a word is used in a particular sentence
# Noun, pronoun, adjective, verb, adverb, preposition, conjunction, injerjunction
# Part of speech tagging assigns a POS tag to each token depending on its usage in a sentence
# Assigns a noun or verb to each word
# Can be used to gauge sentiment by analyzing which adjectives are commonly used alongside nouns

about_text = (
     "Gus Proto is a Python developer currently working for a London-based Fintech company. He is interested in learning Natural Language Processing."
)

about_doc = nlp(about_text)
for token in about_doc:
    print(
        f"""
        TOKEN: {str(token)}
        =====
        TAG: {str(token.tag_):10} POS: {token.pos_}
        EXPLANATION: {spacy.explain(token.tag_)}"""
        )
        #.tag displays the fine-grained tag, and .pos displays the coarse-grained tag
        # These classify what the word's purpose is in a sentence in speech or writing



nouns = []
adjectives = []
for token in about_doc:
    if token.pos_ == "NOUN":
        nouns.append(token)
    if token.pos_ == "ADJ":
        adjectives.append(token)


print(nouns)


print(adjectives)


        TOKEN: Gus
        =====
        TAG: NNP        POS: PROPN
        EXPLANATION: noun, proper singular

        TOKEN: Proto
        =====
        TAG: NNP        POS: PROPN
        EXPLANATION: noun, proper singular

        TOKEN: is
        =====
        TAG: VBZ        POS: AUX
        EXPLANATION: verb, 3rd person singular present

        TOKEN: a
        =====
        TAG: DT         POS: DET
        EXPLANATION: determiner

        TOKEN: Python
        =====
        TAG: NNP        POS: PROPN
        EXPLANATION: noun, proper singular

        TOKEN: developer
        =====
        TAG: NN         POS: NOUN
        EXPLANATION: noun, singular or mass

        TOKEN: currently
        =====
        TAG: RB         POS: ADV
        EXPLANATION: adverb

        TOKEN: working
        =====
        TAG: VBG        POS: VERB
        EXPLANATION: verb, gerund or present participle

        TOKEN: for
        =====
        TAG: IN         POS: ADP
        EXPLANATION: conjun

### Visualization using displaCy

In [83]:
# Used to visualize dependency parse or named entities in a browser
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

about_interest_text = ( "He is interested in learning Natural Language Processing.")

about_interest_doc = nlp(about_interest_text)

displacy.render(about_interest_doc, style="dep", jupyter=True)

### Cleaning data example

In [88]:
# Lowercase the text
# Lemmatize each token
# Remove punctuation symbols
# Remove stop words

complete_text = ("Gus Proto is a Python developer currently working for a London-based Fintech company. He is interested in learning Natural Language Processing.There is a developer conference happening on 21 July2019 in London. It is titled 'Applications of NaturalLanguage Processing'. There is a helpline number available at +44-1234567891. Gus is helping organize it. He keeps organizing local Python meetups and several internal talks at his workplace. Gus is also presentinga talk. The talk will introduce the reader about 'Use'cases of Natural Language Processing in Fintech'.Apart from his work, he is very passionate about music.Gus is learning to play the Piano. He has enrolled himself in the weekend batch of Great Piano Academy. Great Piano Academy is situated in Mayfair or the City of London and has world-class piano instructors.")

complete_doc = nlp(complete_text)

# Define functions
def is_token_allowed(token):
    # Filter tokens that are NOT punctuation or stop words
    return bool(token and str(token).strip() and not token.is_stop and not token.is_punct)

def preprocess_token(token):
    # Lowercase, and remove whitespace
    return token.lemma_.strip().lower()

complete_filtered_tokens = [preprocess_token(token) for token in complete_doc if is_token_allowed(token)]

complete_filtered_tokens



['gus',
 'proto',
 'python',
 'developer',
 'currently',
 'work',
 'london',
 'base',
 'fintech',
 'company',
 'interested',
 'learn',
 'natural',
 'language',
 'processing',
 'developer',
 'conference',
 'happen',
 '21',
 'july2019',
 'london',
 'title',
 'application',
 'naturallanguage',
 'processing',
 'helpline',
 'number',
 'available',
 '+44',
 '1234567891',
 'gus',
 'helping',
 'organize',
 'keep',
 'organize',
 'local',
 'python',
 'meetup',
 'internal',
 'talk',
 'workplace',
 'gus',
 'presentinga',
 'talk',
 'talk',
 'introduce',
 'reader',
 "use'cases",
 'natural',
 'language',
 'processing',
 "fintech'",
 'apart',
 'work',
 'passionate',
 'music',
 'gus',
 'learn',
 'play',
 'piano',
 'enrol',
 'weekend',
 'batch',
 'great',
 'piano',
 'academy',
 'great',
 'piano',
 'academy',
 'situate',
 'mayfair',
 'city',
 'london',
 'world',
 'class',
 'piano',
 'instructor']

### Rule based matching

In [137]:
# One of the steps to extract information from unstructured text
# Identify and extract tokens based on patterns such as lowercase and grammatical features
# Match  a label according to lexical patterns and grammical features

# First and last names are always proper nouns
# Extracting names

complete_text = ("Bo is a Python developer at Kenny Guo currently working for a London-based Fintech company. He is interested in learning Natural Language Processing.There is a developer conference happening on 21 July2019 in London. It is titled 'Applications of NaturalLanguage Processing'. There is a helpline number available at +44-1234567891. Gus is helping organize it. He keeps organizing local Python meetups and several internal talks at his workplace. Gus is also presentinga talk. The talk will introduce the reader about 'Use'cases of Natural Language Processing in Fintech'.Apart from his work, he is very passionate about music.Gus is learning to play the Piano. He has enrolled himself in the weekend batch of Great Piano Academy. Great Piano Academy is situated in Mayfair or the City of London and has world-class piano instructors.")

complete_doc = nlp(complete_text)

from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

def dump(obj):
  for attr in dir(obj):
    print("obj.%s = %r" % (attr, getattr(obj, attr)))

# dump(nlp.vocab)

# pattern is a list of objects, of the combination of tokens to be matched
# pattern consists of two objects in which POS tags for both tokens are PROPN (proper nouns)
# The pattern is added to Matcher with the .add method
def extract_full_name(nlp_doc):
    pattern = [{"POS": "PROPN"}, {"POS": "PROPN"}]
    matcher.add("FULL_NAME", [pattern])
    matches = matcher(nlp_doc)
    for _, start, end in matches:
        span = nlp_doc[start:end]
        yield span.text


next(extract_full_name(complete_doc))

'Kenny Guo'

In [136]:
conference_org_text = ("There is a developer conference happening on 21 July 2019 in London. It is titled the applications of natural language processing. There is a helpline number available at (123) 456-7891 for you")


def extract_phone_number(nlp_doc):
    pattern = [
        # ORTH matches the exact text of the token
        # SHAPE transforms the token to show orthographic features
        # OP defines operators - using ? means that a pattern is optional
        {"ORTH": "("}, # opening bracket
        {"SHAPE": "ddd"}, # 3 digits
        {"ORTH": ")"}, # closing bracket
        {"SHAPE": "ddd"}, #3 digits
        {"ORTH": "-", "OP": "?"}, # can have a hyphen
        {"SHAPE": "dddd"}, # 4 digits
    ]
    matcher.add("PHONE_NUMBER", [pattern])
    matches = matcher(nlp_doc)
    print(matches)
    for match_id, start, end in matches:
        span = nlp_doc[start:end]
        print(span)


conference_org_doc = nlp(conference_org_text)
extract_phone_number(conference_org_doc)



[(2475408655227838177, 20, 22), (10788718092470551940, 30, 36)]
Language Processing
(123) 456-7891


### Depedency parsing

In [139]:
# Extracting the dependency graph of a sentence to represent its grammatical structure
# Defines relationships between headwords and their dependents
# The head of a sentence has no dependency and is called the root of a sentence

piano_text = "Gus is learning piano"

piano_doc = nlp(piano_text)

for token in piano_doc:
    print(
        f"""
TOKEN: {token.text}
=====
{token.tag_ = }
{token.head.text = }
{token.dep_ = }"""
    )
# Prints the tag, the word, and its dependency with respect to a sentence

displacy.render(piano_doc, style="dep", jupyter=True)


TOKEN: Gus
=====
token.tag_ = 'NNP'
token.head.text = 'learning'
token.dep_ = 'nsubj'

TOKEN: is
=====
token.tag_ = 'VBZ'
token.head.text = 'learning'
token.dep_ = 'aux'

TOKEN: learning
=====
token.tag_ = 'VBG'
token.head.text = 'learning'
token.dep_ = 'ROOT'

TOKEN: piano
=====
token.tag_ = 'NN'
token.head.text = 'learning'
token.dep_ = 'dobj'


### Trees and subtree navigation


In [None]:
# Dependency graphs have all the properties of a tree and can be traversed like one

one_line_about_text = (
    "Gus Proto is a Python developer"
    " currently working for a London-based Fintech company"
)

one_line_about_doc = nlp(one_line_about_text)

# Extract children of `developer`
print([token.text for token in one_line_about_doc[5].children])

# Extract previous neighboring node of `developer`
print (one_line_about_doc[5].nbor(-1))


# Extract next neighboring node of `developer`
print (one_line_about_doc[5].nbor(1))


# Extract all tokens on the left of `developer`
print([token.text for token in one_line_about_doc[5].lefts])


# Extract tokens on the right of `developer`
print([token.text for token in one_line_about_doc[5].rights])


# Print subtree of `developer`
print("This is the subtree of 'developer':")
print (list(one_line_about_doc[5].subtree))

### Shallow parsing


In [160]:
# Extracting phrases from unstructured text
# Involves chunking groups of adjacent tokens into phrases on the basis of POS tags
# Well-known chunks include noun phrases, verb phrases, and preposiitonal phrases

# Noun phrase detection - has a noun at its head and can include adjectives, ordinals and determiners
conference_text = (
    "There is a developer conference happening on 21 July 2019 in London."
)
conference_doc = nlp(conference_text)

# Extract Noun Phrases
for chunk in conference_doc.noun_chunks:
    print (chunk)


# Verb phrase detection - syntactic unit composed of atleast one verb
# Must use python -m pip install textacy


a developer conference
21 July
London


In [1]:
import textacy

about_talk_text = (
    "The talk will introduce reader about use"
    " cases of Natural Language Processing in"
    " Fintech, making use of"
    " interesting examples along the way."
)

patterns = [{"POS": "AUX"}, {"POS": "VERB"}]
about_talk_doc = textacy.make_spacy_doc(
    about_talk_text, lang="en_core_web_sm"
)
verb_phrases = textacy.extract.token_matches(
    about_talk_doc, patterns=patterns
)

# Print all verb phrases
for chunk in verb_phrases:
    print(chunk.text)



# Extract noun phrase to explain what nouns are involved
for chunk in about_talk_doc.noun_chunks:
    print (chunk)


ModuleNotFoundError: No module named 'textacy'

In [164]:
# Named entity recognition
# Process of locating named entities to classify them in categories
# Populate tags for a set of documents, in order to improve the keyword search.

piano_class_text = (
    "Great Piano Academy is situated"
    " in Mayfair or the City of London and has"
    " world-class piano instructors."
)

piano_class_doc = nlp(piano_class_text)

for ent in piano_class_doc.ents:
    # .text gives the actual text
    # .start_char denotes the starting index
    # .end_char denotes the ending index
    # .label gives the label of the entity
    print(
        f"""
{ent.text = }
{ent.start_char = }
{ent.end_char = }
{ent.label_ = }
spacy.explain('{ent.label_}') = {spacy.explain(ent.label_)}"""
)

displacy.render(piano_class_doc, style="dep", jupyter=True)

    


ent.text = 'Great Piano Academy'
ent.start_char = 0
ent.end_char = 19
ent.label_ = 'ORG'
spacy.explain('ORG') = Companies, agencies, institutions, etc.

ent.text = 'Mayfair'
ent.start_char = 35
ent.end_char = 42
ent.label_ = 'GPE'
spacy.explain('GPE') = Countries, cities, states

ent.text = 'the City of London'
ent.start_char = 46
ent.end_char = 64
ent.label_ = 'GPE'
spacy.explain('GPE') = Countries, cities, states


In [165]:
# Example in removing names from a survey_text

survey_text = (
    "Out of 5 people surveyed, James Robert,"
    " Julie Fuller and Benjamin Brooks like"
    " apples. Kelly Cox and Matthew Evans"
    " like oranges."
)


def replace_person_names(token):
    # IOB code of the name - inside outside beginning tagging
    if token.ent_iob != 0 and token.ent_type_ == "PERSON":
        return "[REDACTED] "
    return token.text_with_ws


def redact_names(nlp_doc):
    with nlp_doc.retokenize() as retokenizer:
        for ent in nlp_doc.ents:
            retokenizer.merge(ent)
    tokens = map(replace_person_names, nlp_doc)
    return "".join(tokens)


survey_doc = nlp(survey_text)
print(redact_names(survey_doc))




Out of 5 people surveyed, [REDACTED] , [REDACTED] and [REDACTED] like apples. [REDACTED] and [REDACTED] like oranges.
