In [87]:
import pandas as pd
import numpy as np
import spacy
from IPython.core.display import HTML

In [159]:
class TokenAttributes:
    """Collect per-token linguistic attributes of a processed document.

    For every token yielded by ``doc`` the token itself, its ``lemma_``,
    ``pos_`` and ``dep_`` values, and the ``spacy.explain`` text for the
    latter two are gathered into parallel lists and into a DataFrame with
    one row per token.

    Attributes:
        doc: The object passed to the constructor.
        token: The tokens yielded by ``doc`` (the token objects themselves).
        lemma: ``lemma_`` of each token.
        pos: ``pos_`` of each token.
        synDep: ``dep_`` of each token.
        pos_explain: ``spacy.explain(pos_)`` for each token.
        synDep_explain: ``spacy.explain(dep_)`` for each token.
        pos_exp: Alias of ``pos_explain`` (same list object).
        synDep_exp: Alias of ``synDep_explain`` (same list object).
        df: DataFrame with the columns Token, Lemma, Part_of_Speech_Tag,
            Part_of_Speech_Tag_Explained, Syntactic_Dependency and
            Syntactic_Dependency_Explained.
    """

    def __init__(self, doc):
        """Store ``doc`` and populate every attribute list and ``df`` from it."""
        self.doc = doc
        # Assign real defaults: a bare annotation such as ``self.token: None``
        # does not create the attribute at all.
        self.token = []
        self.lemma = []
        self.pos = []
        self.synDep = []
        self.pos_explain = []
        self.synDep_explain = []
        self.pos_exp = self.pos_explain
        self.synDep_exp = self.synDep_explain
        self.df = pd.DataFrame()
        self.__get_attributes(doc)

    def __get_attributes(self, doc):
        """Fill the attribute lists and build ``self.df`` from ``doc``."""
        keys = ["Token","Lemma","Part_of_Speech_Tag","Part_of_Speech_Tag_Explained", "Syntactic_Dependency", "Syntactic_Dependency_Explained"]
        # Materialise once so that one-shot iterables are not exhausted by
        # the first pass and every column ends up with the same length.
        tokens = list(doc)
        self.token = tokens
        self.lemma = [tok.lemma_ for tok in tokens]
        self.pos = [tok.pos_ for tok in tokens]
        self.synDep = [tok.dep_ for tok in tokens]
        self.pos_explain = [spacy.explain(tag) for tag in self.pos]
        self.synDep_explain = [spacy.explain(label) for label in self.synDep]
        # Keep the short names in sync with the lists that are actually filled.
        self.pos_exp = self.pos_explain
        self.synDep_exp = self.synDep_explain
        token_attr = [self.token,self.lemma,self.pos,self.pos_explain,self.synDep,self.synDep_explain]
        self.df = pd.DataFrame(dict(zip(keys, token_attr)))


# Trained Pipelines

- Statistical Models that enable spaCy to predict linguistic attributes in context
    - Part-of-Speech tags
    - Syntactic dependencies
    - Named entities

- Can be trained with more examples to fine-tune predictions

- Also contains:
    - Binary weights
    - Vocabulary
    - Meta information
    - Configuration File

In [89]:
import spacy

# Trained Statistical Pipeline Model
nlp = spacy.load("en_core_web_sm")

In [90]:
type(nlp)

spacy.lang.en.English

# Tokenization

- Parsing text into tokens
- First step in any NLP application
- When you pass text to the statistical model to create a doc object, tokenization happens automatically

### Text Content

- The group of letters that make up the token

In [91]:
# A statistical model class
# A spacy language class
# Contains language vocabulary and more!
# The predictions are not always right!

nlp = spacy.load("en_core_web_sm")

In [92]:
# A doc object and the start of a NLP pipeline
# instantiate a doc object that will process and tokenize the text
# Will also produce predicted linguistic annotations

doc = nlp(u"I am flying to Frisco")

In [93]:
doc

I am flying to Frisco

In [94]:
type(doc)

spacy.tokens.doc.Doc

In [95]:
# doc objects are also iterables and can be sliced like arrays and lists
# Each base element of a doc is a token

doc[2]

flying

In [96]:
type(doc[2])

spacy.tokens.token.Token

In [97]:
# Tokens have attributes

print([w.text for w in doc])

['I', 'am', 'flying', 'to', 'Frisco']


In [98]:
# Creates a blank English NLP object
# A blank pipeline has only the tokenizer and language data, with no trained components
nlp = spacy.blank("en")

In [99]:
doc2 = nlp(u"I am flying to Frisco")

In [100]:
[token.lemma_ for token in doc]

['I', 'be', 'fly', 'to', 'Frisco']

# lexical attributes

- Linguistic annotations available as token attributes
- Check if a token is like a number using the like_num attribute

In [101]:
import spacy

In [102]:
nlp = spacy.blank("en")

doc = nlp(
        "In 1990, more than 60% of people in East Asia were in extreme poverty. "
        "Now less than 4% are."
)

In [103]:
t1 = doc[3]

In [104]:
type(t1)

spacy.tokens.token.Token

In [105]:
t1

more

In [106]:
# Token doc index
t1.i

3

In [107]:
# Boolean check if a token is like a number
t1.like_num

False

In [108]:
import spacy

nlp = spacy.blank("en")

# Process the text
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Report every number-like token that is immediately followed by a "%" token
for token in doc:
    # Only tokens that resemble a number can start a percentage
    if not token.like_num:
        continue
    # A number-like token at the very end of the doc has no successor;
    # doc[token.i + 1] would raise IndexError there, so skip it
    if token.i + 1 >= len(doc):
        continue
    # Get the next token in the document
    next_token = doc[token.i + 1]
    # Check if the next token's text equals "%"
    if next_token.text == "%":
        print("Percentage found:", token.i, token.text)

Percentage found: 5 60
Percentage found: 20 4


# Predict Part-of-Speech Tags
- Tells you the part-of-speech of a given word / token
- If that token is a noun, verb, adjective etc.

In [109]:
import spacy

# Load the trained pipeline; tokens it produces expose a .pos_ attribute
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over a sample sentence to get a doc object
doc = nlp("This is a statistical model to help spacy predict the context of texts!")

# The token objects of the doc, in document order
token = list(doc)

# Predicted part-of-speech tag for each token
part_of_speech = [tok.pos_ for tok in doc]

# Column name -> column values for the summary table
token_pos = {
    "Token": token,
    "PartOfSpeech": part_of_speech,
    "PartOfSpeechExplained": [spacy.explain(tok.pos_) for tok in doc],
}

df = pd.DataFrame(token_pos)

df

Unnamed: 0,Token,PartOfSpeech,PartOfSpeechExplained
0,This,PRON,pronoun
1,is,AUX,auxiliary
2,a,DET,determiner
3,statistical,ADJ,adjective
4,model,NOUN,noun
5,to,PART,particle
6,help,VERB,verb
7,spacy,NOUN,noun
8,predict,VERB,verb
9,the,DET,determiner


In [110]:
ta = TokenAttributes(doc)

# Lemmatization

- The process of reducing word forms to their lemma
- lemma
    -  The base form of a token
- How the token would look in the dictionary
- Important task in meaning recognition
- Using lemmas can shorten the list of predefined keywords that you need
- So that you don't need to include all word forms of a token

### Grammatical Structure of a Sentence?

In [112]:
nlp = spacy.load("en_core_web_sm")

In [113]:
# doc objects allow you to access the grammatical structure of a sentence

doc3 = nlp(u"This product integrates both libraries for downloading and applying patches")

In [114]:
[(token.text, token.lemma_) for token in doc3]

[('This', 'this'),
 ('product', 'product'),
 ('integrates', 'integrate'),
 ('both', 'both'),
 ('libraries', 'library'),
 ('for', 'for'),
 ('downloading', 'download'),
 ('and', 'and'),
 ('applying', 'apply'),
 ('patches', 'patch')]

In [115]:
# Pair each token's surface text with its lemma, then split into two columns
pairs = [(tok.text, tok.lemma_) for tok in doc3]
token = [text for text, _ in pairs]
lemma = [base for _, base in pairs]

pd.DataFrame({"token": token, "lemma": lemma})

Unnamed: 0,token,lemma
0,This,this
1,product,product
2,integrates,integrate
3,both,both
4,libraries,library
5,for,for
6,downloading,download
7,and,and
8,applying,apply
9,patches,patch


# Syntactic Dependency Label
 - Syntactic Dependency Parser
    -   Describes the syntactic relationship between two words
 -  Sentence structure formed by directed binary grammatical relations between the tokens
 - Discover syntactic relations between individual tokens
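 - Connect syntactically related words in an arc / tree-like structure
 - Kind of like a tree: each token has exactly one head, but a head can have any number of dependents (so it is not strictly a binary tree)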

In [116]:
HTML("""
<img src=
"source/syntacticDependencyExample1.png"
width="900"
height="300">
""")

In [117]:
HTML("""
<img src=
"source/syntacticDependencyExample2.png"
width="1200"
height="500">
""")

In [118]:
import spacy

nlp = spacy.load("en_core_web_sm")

doc = nlp("I prefer the morning flight through Denver")

In [119]:
# The root node / head / syntactic governor / parent
# of the first element

doc[0].head.text

'prefer'

In [120]:
# Column headers of the summary table, in display order
keys = [
    "Token",
    "Lemma",
    "Part_of_Speech_Tag",
    "Part_of_Speech_Tag_Explained",
    "Syntactic_Dependency",
    "Syntactic_Dependency_Explained",
]

# One list per column, all aligned with the token order of doc
token = list(doc)                                           # the token objects themselves
lemma = [tok.lemma_ for tok in doc]                         # lemma of each token
pos = [tok.pos_ for tok in doc]                             # predicted part-of-speech tag
synDep = [tok.dep_ for tok in doc]                          # predicted syntactic dependency label
pos_explain = [spacy.explain(tag) for tag in pos]           # readable text for each tag
synDep_explain = [spacy.explain(label) for label in synDep] # readable text for each label

token_attr = [token, lemma, pos, pos_explain, synDep, synDep_explain]

# Header -> column values
ta_dict = dict(zip(keys, token_attr))

df = pd.DataFrame(ta_dict)

In [123]:
df

Unnamed: 0,Token,Lemma,Part_of_Speech_Tag,Part_of_Speech_Tag_Explained,Syntactic_Dependency,Syntactic_Dependency_Explained
0,I,I,PRON,pronoun,nsubj,nominal subject
1,prefer,prefer,VERB,verb,ROOT,root
2,the,the,DET,determiner,det,determiner
3,morning,morning,NOUN,noun,compound,compound
4,flight,flight,NOUN,noun,dobj,direct object
5,through,through,ADP,adposition,prep,prepositional modifier
6,Denver,Denver,PROPN,proper noun,pobj,object of preposition


# Named Entity Recognition

- A real object that you can refer to by proper name
- Like location, organization name, person or other entity
- Reveals the place or organization the user is talking about

In [124]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [125]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

In [126]:
doc.ents

(Apple, U.K., $1 billion)

In [127]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


## spacy.explain Method

- Gives you explanations of linguistic attributes
- Brief explanation of what the label or tag means

In [128]:
named_ent_tags = ["GPE","NNP","dobj","PROPN"]

In [129]:
for tag in named_ent_tags:
    print(f"{tag}:",spacy.explain(tag))

GPE: Countries, cities, states
NNP: noun, proper singular
dobj: direct object
PROPN: proper noun


# Rule-based Matching

- Lets you write rules to find words and phrases in text

#### Why use Rule-based Matching over RegEx?

##### Rule-based Matching allows you to:

- Match on doc objects and not just strings

- Match on tokens and token attributes

- Can search for text and other lexical attributes of a token, making RBM more flexible than RegEx

- Use a model's predictions

- Example:
    - Find the word duck only if it's a noun
    - duck(verb) vs duck(noun)


##### Match Patterns

- A list of dictionaries
- One key, value pair dict per token
- You can match:

    - Exact token text
        - [{"TEXT": "iPhone"}, {"TEXT": "X"}]
        
    - Match lexical attributes
        - [{"LOWER": "iphone"}, {"LOWER": "x"}]
        
    - match any token attribute
        - [{"LEMMA": "buy"}, {"POS": "NOUN"}]

In [130]:
import spacy
from spacy.matcher import Matcher # import the Matcher

In [131]:
# Load pretrained pipeline
nlp = spacy.load("en_core_web_sm")

In [132]:
nlp.vocab

<spacy.vocab.Vocab at 0x7f8a5be433a0>

In [206]:
# initialize the matcher object with a shared vocabulary using nlp.vocab

matcher = Matcher(nlp.vocab)

In [207]:
# process some text / discourse
doc = nlp("Upcoming iPhone X release date leaked")


In [208]:
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]

matcher.add("IPHONE_PATTERN", [pattern])

In [209]:
matches = matcher(doc)

matches

[(9528407286733565721, 1, 3)]

In [211]:
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

iPhone X


In [210]:
TokenAttributes(doc).df

Unnamed: 0,Token,Lemma,Part_of_Speech_Tag,Part_of_Speech_Tag_Explained,Syntactic_Dependency,Syntactic_Dependency_Explained
0,Upcoming,upcoming,ADJ,adjective,amod,adjectival modifier
1,iPhone,iPhone,PROPN,proper noun,compound,compound
2,X,x,NOUN,noun,compound,compound
3,release,release,NOUN,noun,compound,compound
4,date,date,NOUN,noun,ROOT,root
5,leaked,leak,VERB,verb,acl,clausal modifier of noun (adjectival clause)


# Custom Lemmatization - Special Cases

-   When there are special cases like nicknames of cities being passed to the NLP application, you can create custom lemmas for words

In [136]:
import spacy
from spacy.symbols import ORTH, LEMMA

In [137]:
# NOTE(review): this rebinds the name ORTH, which the previous cell imported
# from spacy.symbols, to a plain string. The imported symbol is no longer
# reachable under this name afterwards - confirm that is intended.
ORTH = "Frisco"

In [138]:
# NOTE(review): this rebinds the name LEMMA, which an earlier cell imported
# from spacy.symbols, to a plain string. The imported symbol is no longer
# reachable under this name afterwards - confirm that is intended.
LEMMA = "San Francisco"

In [139]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("I am flying to Frisco")

In [140]:
[token for token in doc]

[I, am, flying, to, Frisco]

In [141]:
[token.text for token in doc]

['I', 'am', 'flying', 'to', 'Frisco']