# Introduction


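**What?** An introduction to basic NLP text processing with spaCy and NLTK: tokenisation, n-grams, lemmatisation, part-of-speech tagging, and noun-phrase chunking.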



# Imports

In [3]:
import spacy
from nltk.tokenize import TweetTokenizer
import warnings
warnings.filterwarnings("ignore")

# Tokenisation

In [4]:
# Load the small English pipeline once; the later cells reuse `nlp`.
nlp = spacy.load("en_core_web_sm")
text = "Mary, don’t slap the green witch"
# Lower-case the sentence, run it through the pipeline, and show each token's text.
lowercased_doc = nlp(text.lower())
print(list(map(str, lowercased_doc)))

['mary', ',', 'do', 'n’t', 'slap', 'the', 'green', 'witch']


In [10]:
# Tokenise a lower-cased tweet with NLTK's TweetTokenizer; in the output below
# the hashtag, the @-mention and the emoticon each come out as a single token.
tweet = "Snow White and the Seven Degrees #MakeAMovieCold@midnight:-)"
tokenizer = TweetTokenizer()
tweet_tokens = tokenizer.tokenize(tweet.lower())
print(tweet_tokens)

['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makeamoviecold', '@midnight', ':-)']


# Unigrams, Bigrams, Trigrams, ..., N-grams

In [14]:
def n_grams(text, n):
    """Build all contiguous n-grams of a sequence.

    Parameters
    ----------
    text : sequence
        Anything that supports ``len()`` and slicing, e.g. a list of tokens
        or a plain string (in which case the n-grams are substrings).
    n : int
        Window size; must be at least 1.

    Returns
    -------
    list
        The ``len(text) - n + 1`` windows ``text[i:i+n]`` in left-to-right
        order. Empty when ``text`` is shorter than ``n``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1. Without this check the slice arithmetic
        would silently return meaningless results (empty or arbitrary slices).
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got {!r}".format(n))
    # The last valid window starts at len(text) - n; range() is empty when
    # the sequence is shorter than the window.
    last_start = len(text) - n
    return [text[start:start + n] for start in range(last_start + 1)]

In [15]:
# Tokens of an already-cleaned sentence; print every window of three consecutive tokens.
cleaned = ["mary", ",", "n't", "slap", "green", "witch", "."]
trigrams = n_grams(cleaned, 3)
print(trigrams)

[['mary', ',', "n't"], [',', "n't", 'slap'], ["n't", 'slap', 'green'], ['slap', 'green', 'witch'], ['green', 'witch', '.']]


# Lemmas and Stems

In [17]:
# Print each token next to the lemma spaCy assigns to it.
doc = nlp("he was running late for school")
for tok in doc:
    print(f"{tok} --> {tok.lemma_}")

he --> he
was --> be
running --> run
late --> late
for --> for
school --> school


# Categorizing Words: POS Tagging


- We can extend the concept of labeling from documents to individual words or tokens.
- A common example of categorizing words is part-of-speech (POS) tagging.



In [None]:
# Print each token together with its coarse part-of-speech tag.
doc = nlp("Mary slapped the green witch.")
for tok in doc:
    print(f"{tok} - {tok.pos_}")

# Categorizing Spans: Chunking and Named Entity Recognition


- Often, we need to label a span of text; that is, a contiguous multitoken boundary. 
- For example, consider the sentence, “Mary slapped the green witch.” 
- We might want to identify the noun phrases (NP) and verb phrases (VP) in it. This is called **chunking** or **shallow parsing**. The cell below extracts the noun phrases with spaCy's `noun_chunks`.
- Shallow parsing aims to derive higher-order units composed of the grammatical atoms, like nouns, verbs, adjectives, and so on. NP stands for Noun Phrase.



In [20]:
# Iterate over the noun chunks spaCy finds and print each span with its label.
doc = nlp("Mary slapped the green witch.")
for noun_chunk in doc.noun_chunks:
    print(f"{noun_chunk} - {noun_chunk.label_}")

Mary - NP
the green witch - NP


# References


- Rao, Delip, and Brian McMahan. *Natural Language Processing with PyTorch: Build Intelligent Language Applications Using Deep Learning*. O'Reilly Media, Inc., 2019.
- https://github.com/joosthub/PyTorchNLPBook
    
