In [None]:
from IPython.display import HTML, display

def my_css(*_event_args, **_event_kwargs):
    """Inject a CSS rule that keeps rendered DataFrame table cells on one line.

    Registered below as an IPython ``pre_run_cell`` callback, so the style is
    re-emitted before every cell execution.

    IPython passes an execution-info object to ``pre_run_cell`` callbacks.
    Accepting (and ignoring) arbitrary arguments keeps the callback working
    whether or not the installed IPython adapts zero-argument callbacks.
    """
    display(HTML("""<style>table.dataframe td{white-space: nowrap;}</style>"""))

get_ipython().events.register('pre_run_cell', my_css)

In [None]:
!pip install sentencepiece
!pip install transformers

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import spacy
from spacy import displacy
from sklearn.decomposition import PCA

## **[Language Processing Pipeline with spaCy](https://spacy.io/usage/processing-pipelines)**

In [None]:
# Load the spaCy pipeline named "en_core_web_sm"; `nlp` is the callable used by the
# cells below to turn raw text into a processed document.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Sample passage fed to the spaCy pipeline in the following cells.
# NOTE(review): `text` is reassigned by later tokenizer cells, so re-run this cell
# before re-running the spaCy cells out of order.
# NOTE(review): "incredulity,." contains a stray comma; left untouched because the
# recorded outputs (token offsets) were produced from this exact string.
text = "It was the best of times, it was the worst of times. It was the age of wisdom, it was the age of foolishness. It was the epoch of belief, it was the epoch of incredulity,. It was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair. We had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way—in short, the period was so far like the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only. There were a king with a large jaw and a queen with a plain face, on the throne of England; there were a king with a large jaw and a queen with a fair face, on the throne of France. In both countries it was clearer than crystal to the lords of the State preserves of loaves and fishes, that things in general were settled for ever."
text

'It was the best of times, it was the worst of times. It was the age of wisdom, it was the age of foolishness. It was the epoch of belief, it was the epoch of incredulity,. It was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair. We had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way—in short, the period was so far like the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only. There were a king with a large jaw and a queen with a plain face, on the throne of England; there were a king with a large jaw and a queen with a fair face, on the throne of France. In both countries it was clearer than crystal to the lords of the State preserves of loaves and fishes, that things in general were settled for ever.'

### **Sentence Tokenization**

In [None]:
# Run the full pipeline once; `doc` is reused by the cells that follow.
doc = nlp(text)

# Print each detected sentence together with its start/end token offsets in `doc`.
for sentence in doc.sents:
    print(">", sentence, sentence.start, sentence.end)

> It was the best of times, it was the worst of times. 0 14
> It was the age of wisdom, it was the age of foolishness. 14 28
> It was the epoch of belief, it was the epoch of incredulity,. 28 43
> It was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair. 43 71
> We had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way—in short, the period was so far like the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only. 71 139
> There were a king with a large jaw and a queen with a plain face, on the throne of England; there were a king with a large jaw and a queen with a fair face, on the throne of France. 139 183
> In both countries it was clearer than crystal to the lords of the State preserves of loaves and fishes, that things in general were settled for ever. 183

### **Extracting entities**

In [None]:
# List every named entity found in `doc` with its label.
for entity in doc.ents:
    print(entity, entity.label_)

England GPE
France GPE
State ORG


In [None]:
# A short sentence containing people, an organisation, a date and places.
apple_doc = nlp(
    "Steve Jobs and Steve Wozniak incorporated Apple Computer on January 3, 1977, in Cupertino, California."
)

# Plain-text listing of the entities, one per line.
for entity in apple_doc.ents:
    print(entity.text, entity.label_)

# Inline highlighted rendering of the same entities.
displacy.render(apple_doc, style="ent", jupyter=True)

Steve Jobs PERSON
Steve Wozniak PERSON
Apple Computer ORG
January 3, 1977 DATE
Cupertino GPE
California GPE


### **Lemmatization, POS-Tags, Syntax Trees**

In [None]:
# Show token, lemma, explained POS tag and stop-word flag for the first sentence only.
# NOTE(review): the saved output below also lists morphological features, which this
# code does not print -- re-run the cell to refresh it.
for sent in list(doc.sents)[:1]:
    for token in sent:
        print(token, token.lemma_, spacy.explain(token.pos_), token.is_stop)

It it pronoun True Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs
was be auxiliary True Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin
the the determiner True Definite=Def|PronType=Art
best good adjective False Degree=Sup
of of adposition True 
times time noun False Number=Plur
, , punctuation False PunctType=Comm
it it pronoun True Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs
was be auxiliary True Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin
the the determiner True Definite=Def|PronType=Art
worst bad adjective False Degree=Sup
of of adposition True 
times time noun False Number=Plur
. . punctuation False PunctType=Peri


In [None]:
# Show the morphological features of each token in the first sentence only.
# The loop variable keeps the name `sent` because a later cell reads it.
for sent in list(doc.sents)[:1]:
    for token in sent:
        print(token, token.morph)

It Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs
was Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin
the Definite=Def|PronType=Art
best Degree=Sup
of 
times Number=Plur
, PunctType=Comm
it Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs
was Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin
the Definite=Def|PronType=Art
worst Degree=Sup
of 
times Number=Plur
. PunctType=Peri


In [None]:
# Render the dependency parse of the first sentence of `doc`. The sentence is
# selected explicitly instead of relying on the `sent` loop variable left behind by
# an earlier cell, so this cell no longer depends on which loop ran last.
displacy.render(next(iter(doc.sents)), style='dep', jupyter=True)

## **Tokenizers**

In [1]:
import nltk
from nltk import word_tokenize, TweetTokenizer, MWETokenizer
from transformers import AutoTokenizer

ModuleNotFoundError: ignored

In [None]:
# `word_tokenize` needs the Punkt sentence-tokenizer data. NLTK 3.9 and later look
# for the `punkt_tab` resource, while older releases use `punkt`; fetch both so the
# tokenizer cells below work with either.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Test sentence with a decimal number, a hyphenated word, a two-word place name and
# two adjacent emoji. NOTE(review): this rebinds `text`, replacing the passage used
# by the spaCy cells above.
text = "I ate 8.5 ice-creams in New Delhi 🥶😇"

In [None]:
# NLTK's default word tokenizer; in the recorded output the two emoji stay together
# as a single token.
word_tokenize(text)

['I', 'ate', '8.5', 'ice-creams', 'in', 'New', 'Delhi', '🥶😇']

In [None]:
# Tweet-oriented tokenizer; in the recorded output each emoji becomes its own token.
# NOTE(review): the name `tokenizer` is rebound to a different tokenizer in the next cell.
tokenizer = TweetTokenizer()
tokenizer.tokenize(text)

['I', 'ate', '8.5', 'ice-creams', 'in', 'New', 'Delhi', '🥶', '😇']

In [None]:
# Multi-word-expression tokenizer: merges the already-tokenized pair 'New', 'Delhi'
# into a single token. The expression is supplied at construction time.
tokenizer = MWETokenizer([('New', 'Delhi')])
word_tokens = word_tokenize(text)
tokenizer.tokenize(word_tokens)

['I', 'ate', '8.5', 'ice-creams', 'in', 'New_Delhi', '🥶😇']

**Subword Tokenization**

1.   **Byte-Pair Encoding (BPE):** BPE relies on a pre-tokenizer that splits the training data into words. Pre-tokenization can be as simple as space tokenization. After pre-tokenization, a set of unique words has been created and the frequency with which each word occurs in the training data has been determined. Next, BPE creates a base vocabulary consisting of all symbols that occur in the set of unique words and learns merge rules to form a new symbol from two symbols of the base vocabulary. It does so until the vocabulary has attained the desired vocabulary size. Used by the GPT, GPT-2, and RoBERTa models.
2.   **WordPiece:** WordPiece first initializes the vocabulary to include every character present in the training data and progressively learns a given number of merge rules. In contrast to BPE, WordPiece does not choose the most frequent symbol pair, but the one that maximizes the likelihood of the training data once added to the vocabulary. Used by BERT, DistilBERT, and Electra.
3.   **SentencePiece:** The tokenizers above assume that the input text uses spaces to separate words. However, not all languages use spaces to separate words. To solve this, SentencePiece treats the input as a raw input stream, thus including the space in the set of characters to use. It then uses the BPE or unigram algorithm to construct the appropriate vocabulary. Some models that use SentencePiece are ALBERT, XLNet, Marian, and T5.



In [None]:
from transformers import GPT2Tokenizer, BertTokenizer, XLNetTokenizer

In [None]:
# Instantiate one pretrained tokenizer per checkpoint name; the recorded progress
# bars below show files being downloaded when this cell ran.
# GPT-2 checkpoint "gpt2"
gpt2_tokenizer   = GPT2Tokenizer.from_pretrained("gpt2")
# BERT checkpoint "bert-base-cased"
bert_tokenizer   = BertTokenizer.from_pretrained("bert-base-cased")
# XLNet checkpoint "xlnet-base-cased"
# NOTE(review): presumably this is why `sentencepiece` is installed at the top -- confirm.
xlnet_tokenizer  = XLNetTokenizer.from_pretrained("xlnet-base-cased")

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Tokenize the same sentence with each subword tokenizer and print the results
# under a label. Note that this rebinds `text` once more.
text = "It was the best of times, it was the worst of times."

for label, subword_tokenizer in (
    ("GPT2 Tokenizer: ", gpt2_tokenizer),
    ("BERT Tokenizer: ", bert_tokenizer),
    ("XLNT Tokenizer: ", xlnet_tokenizer),
):
    print(label, subword_tokenizer.tokenize(text))

GPT2 Tokenizer:  ['It', 'Ġwas', 'Ġthe', 'Ġbest', 'Ġof', 'Ġtimes', ',', 'Ġit', 'Ġwas', 'Ġthe', 'Ġworst', 'Ġof', 'Ġtimes', '.']
BERT Tokenizer:  ['It', 'was', 'the', 'best', 'of', 'times', ',', 'it', 'was', 'the', 'worst', 'of', 'times', '.']
XLNT Tokenizer:  ['▁It', '▁was', '▁the', '▁best', '▁of', '▁times', ',', '▁it', '▁was', '▁the', '▁worst', '▁of', '▁times', '.']


In [None]:
# Encode a sentence containing a made-up word, then map the ids back to tokens to
# see how each tokenizer splits it and which special tokens `encode` adds.
to_embed = "We would like to embed this extremely short text with an unknown word zozofah!"

for subword_tokenizer in (gpt2_tokenizer, bert_tokenizer, xlnet_tokenizer):
    token_ids = subword_tokenizer.encode(to_embed)
    print(subword_tokenizer.convert_ids_to_tokens(token_ids))

['We', 'Ġwould', 'Ġlike', 'Ġto', 'Ġembed', 'Ġthis', 'Ġextremely', 'Ġshort', 'Ġtext', 'Ġwith', 'Ġan', 'Ġunknown', 'Ġword', 'Ġz', 'oz', 'of', 'ah', '!']
['[CLS]', 'We', 'would', 'like', 'to', 'em', '##bed', 'this', 'extremely', 'short', 'text', 'with', 'an', 'unknown', 'word', 'z', '##oz', '##of', '##ah', '!', '[SEP]']
['▁We', '▁would', '▁like', '▁to', '▁embed', '▁this', '▁extremely', '▁short', '▁text', '▁with', '▁an', '▁unknown', '▁word', '▁', 'zo', 'zo', 'fah', '!', '<sep>', '<cls>']


## References:


1.   spaCy: https://spacy.io/
2.   Tokenization: https://neptune.ai/blog/tokenization-in-nlp
3.   HF tokenizers: https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb

