In [80]:
import pandas as pd
import numpy as np
import spacy

# Trained Pipelines

- Statistical Models that enable spaCy to predict linguistic attributes in context
    - Part-of-Speech tags
    - Syntactic dependencies
    - Named entities

- Can be trained with more examples to fine-tune predictions

- Also contains:
    - Binary weights
    - Vocabulary
    - Meta information
    - Configuration File

In [146]:
import spacy

# Load the trained pipeline package "en_core_web_sm" and bind it to `nlp`
nlp = spacy.load("en_core_web_sm")

# Tokenization

- Parsing text into tokens
- First step in any NLP application
- When you pass text to the statistical model to create a Doc object, tokenization happens automatically

### Text Content

- The group of letters that make up the token

In [81]:
# `nlp` is the pipeline object returned by spacy.load for the named package
# presumably a spaCy Language instance holding the vocabulary and components — confirm
# NOTE(review): the same package is already loaded in an earlier cell — confirm both loads are needed

nlp = spacy.load("en_core_web_sm")

In [134]:
# Calling the pipeline on raw text processes and tokenizes it,
# producing the Doc object that the following cells inspect
doc = nlp("I am flying to Frisco")

In [136]:
# Display the Doc created in the previous cell
doc

I am flying to Frisco

In [135]:
# Inspect the runtime type of the processed document
type(doc)

spacy.tokens.doc.Doc

In [141]:
# Doc objects are iterable and can be indexed like lists
# Each element of a Doc is a Token

doc[2]

flying

In [142]:
# Inspect the runtime type of a single element of the Doc
type(doc[2])

spacy.tokens.token.Token

In [84]:
# Every token exposes its raw string through the .text attribute

print([tok.text for tok in doc])

['I', 'am', 'flying', 'to', 'Frisco']


In [143]:
# Create a blank English pipeline object via spacy.blank and rebind `nlp` to it
# NOTE(review): presumably this pipeline has no trained components, unlike the one from spacy.load — confirm against the spaCy docs
nlp = spacy.blank("en")

In [99]:
# Process the same sentence with whatever pipeline `nlp` is currently bound to
doc2 = nlp("I am flying to Frisco")

In [100]:
# Lemma of each token in `doc`
# NOTE(review): this iterates `doc`, not the `doc2` created in the previous cell — confirm which one was intended
[token.lemma_ for token in doc]

['I', 'be', 'fly', 'to', 'Frisco']

# Lexical Attributes

- Linguistic annotations available as token attributes
- Check if a token is like a number using the like_num attribute

In [107]:
import spacy

In [108]:
# Blank English pipeline object
nlp = spacy.blank("en")

# The two adjacent string literals are joined into a single text before processing
doc = nlp(
        "In 1990, more than 60% of people in East Asia were in extreme poverty. "
        "Now less than 4% are."
)

In [115]:
# Keep the token at index 3 of the Doc for inspection in the next cells
t1 = doc[3]

In [128]:
# Inspect the runtime type of the selected element
type(t1)

spacy.tokens.token.Token

In [116]:
# Display the selected token
t1

more

In [123]:
# Token doc index
t1.i

3

In [127]:
# Boolean check if as token is like a number
t1.like_num

False

In [154]:
import spacy

# Blank English pipeline (no trained package is loaded in this cell)
nlp = spacy.blank("en")

# Process the text
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Report every number-like token that is immediately followed by "%"
for token in doc:
    # Skip tokens that do not resemble a number
    if not token.like_num:
        continue
    next_index = token.i + 1
    # A number-like token at the very end of the doc has no successor;
    # indexing one past the end would raise IndexError, so skip it
    if next_index >= len(doc):
        continue
    # Get the next token in the document
    next_token = doc[next_index]
    # Check if the next token's text equals "%"
    if next_token.text == "%":
        print("Percentage found:", token.i, token.text)

Percentage found: 5 60
Percentage found: 20 4


# Predict Part-of-Speech Tags With a Trained Statistical Model

In [153]:
import spacy

# Trained statistical pipeline whose tokens expose a .pos_ attribute
nlp = spacy.load("en_core_web_sm")

# Instantiate a doc object
doc = nlp("This is a statistical model to help spacy predict the context of texts!")

# Token texts as plain strings rather than Token objects, so the DataFrame
# column holds str values (the lemma table in this notebook is built the same way).
# The loop variable is named `tok` so it does not share a name with the list.
token = [tok.text for tok in doc]

# Predicted values for part-of-speech recognition, one per token
part_of_speech = [tok.pos_ for tok in doc]

# Column name -> list of values, in token order
token_pos = dict(Token=token, PartOfSpeech=part_of_speech)

df = pd.DataFrame(data=token_pos)

df

Unnamed: 0,Token,PartOfSpeech
0,This,PRON
1,is,AUX
2,a,DET
3,statistical,ADJ
4,model,NOUN
5,to,PART
6,help,VERB
7,spacy,NOUN
8,predict,VERB
9,the,DET


# Lemmatization

- The process of reducing word forms to their lemma
- lemma
    -  The base form of a token
- How the token would look in the dictionary
- Important task in meaning recognition
- Using lemmas can shorten the list of predefined keywords that you need
- So that you don't need to include all word forms of a token

### Grammatical Structure of a Sentence?

In [87]:
# Bind `nlp` to the trained "en_core_web_sm" pipeline for the lemma examples below
nlp = spacy.load("en_core_web_sm")

In [88]:
# Doc objects give access to the grammatical structure of a sentence
doc3 = nlp("This product integrates both libraries for downloading and applying patches")

In [89]:
# Pair each token's surface text with its lemma
[(tok.text, tok.lemma_) for tok in doc3]

[('This', 'this'),
 ('product', 'product'),
 ('integrates', 'integrate'),
 ('both', 'both'),
 ('libraries', 'library'),
 ('for', 'for'),
 ('downloading', 'download'),
 ('and', 'and'),
 ('applying', 'apply'),
 ('patches', 'patch')]

In [90]:
# Two parallel columns: surface text and lemma of every token in doc3
token = [tok.text for tok in doc3]
lemma = [tok.lemma_ for tok in doc3]

pd.DataFrame({"token": token, "lemma": lemma})

Unnamed: 0,token,lemma
0,This,this
1,product,product
2,integrates,integrate
3,both,both
4,libraries,library
5,for,for
6,downloading,download
7,and,and
8,applying,apply
9,patches,patch


# Custom Lemmatization - Special Cases

-   When there are special cases, like nicknames of cities being passed to the NLP application, you can create custom lemmas for words

In [91]:
import spacy
from spacy.symbols import ORTH, LEMMA

In [92]:
# NOTE(review): this rebinds the name ORTH, imported from spacy.symbols in the previous cell, to a plain string — confirm this is intended
ORTH = "Frisco"

In [93]:
# NOTE(review): this rebinds the name LEMMA, imported from spacy.symbols earlier, to a plain string — confirm this is intended
LEMMA = "San Francisco"

In [94]:
# Load the trained English pipeline and process the example sentence with it
nlp = spacy.load("en_core_web_sm")
doc = nlp("I am flying to Frisco")

In [95]:
# Materialize the Doc's tokens as a plain list
list(doc)

[I, am, flying, to, Frisco]

In [96]:
# Raw text of every token in the Doc
[tok.text for tok in doc]

['I', 'am', 'flying', 'to', 'Frisco']