In [4]:
from IPython.display import HTML, display

def my_css(*_args, **_kwargs):
    """Emit an HTML snippet ahead of every cell execution.

    Registered below as a ``pre_run_cell`` callback. Recent IPython releases
    invoke such callbacks with an execution-info positional argument, while
    older ones call them with no arguments; accepting and ignoring arbitrary
    arguments keeps the hook working in both cases.

    NOTE(review): the HTML payload is an empty string, so nothing visible is
    injected -- presumably a ``<style>`` block was meant to go here; confirm.
    """
    display(HTML(""""""))

# Run the hook before each cell so its output precedes the cell's own output.
get_ipython().events.register('pre_run_cell', my_css)

In [None]:
!pip install folium==0.2.1
!pip install datasets

# Make sure that we have a recent version of pyarrow in the session before we continue - otherwise reboot Colab to activate it
import pyarrow
if int(pyarrow.__version__.split('.')[1]) < 16 and int(pyarrow.__version__.split('.')[0]) == 0:
    import os
    os.kill(os.getpid(), 9)

!pip install sentencepiece
!pip install transformers

Collecting folium==0.2.1
  Downloading folium-0.2.1.tar.gz (69 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.0/70.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: folium
  Building wheel for folium (setup.py) ... [?25l[?25hdone
  Created wheel for folium: filename=folium-0.2.1-py3-none-any.whl size=79794 sha256=3a0f658fa9957fe6f181a767080c0e61cd250b89f32c1634ed556f3fe2a5085d
  Stored in directory: /root/.cache/pip/wheels/00/0c/07/d7792a5444d5bb074361ac27da53cee9d5cce59a07fe9da5dd
Successfully built folium
Installing collected packages: folium
  Attempting uninstall: folium
    Found existing installation: folium 0.14.0
    Uninstalling folium-0.14.0:
      Successfully uninstalled folium-0.14.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency co

[31mERROR: Operation cancelled by user[0m[31m


In [1]:
import numpy as np
import matplotlib.pyplot as plt

import spacy
from spacy import displacy
from sklearn.decomposition import PCA

# [Language Processing Pipeline with Spacy](https://spacy.io/usage/processing-pipelines)

In [None]:
# Load the "en_core_web_sm" spaCy pipeline; `nlp` is used by the cells below to process text.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Sample passage used for the spaCy demos below (stray ",." after "incredulity" corrected to ".").
text = "It was the best of times, it was the worst of times. It was the age of wisdom, it was the age of foolishness. It was the epoch of belief, it was the epoch of incredulity. It was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair. We had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way—in short, the period was so far like the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only. There were a king with a large jaw and a queen with a plain face, on the throne of England; there were a king with a large jaw and a queen with a fair face, on the throne of France. In both countries it was clearer than crystal to the lords of the State preserves of loaves and fishes, that things in general were settled for ever."
print(text)

## Sentence Tokenization

In [None]:
# Run the pipeline once, then list each detected sentence with its token start/end offsets.
doc = nlp(text)
for sentence in doc.sents:
    print(">", sentence, sentence.start, sentence.end)

## Extracting entities

In [None]:
# Show every named entity found in the passage next to its label.
for entity in doc.ents:
    print(entity, entity.label_)

In [None]:
# A second, entity-rich example sentence.
apple_doc = nlp("Steve Jobs and Steve Wozniak incorporated Apple Computer on January 3, 1977, in Cupertino, California.")

# Plain-text listing of the entities first ...
for entity in apple_doc.ents:
    print(entity.text, entity.label_)

# ... then the same entities highlighted inline by displaCy.
displacy.render(apple_doc, style="ent", jupyter=True)

## Lemmatization, POS-Tags, Syntax Trees

In [None]:
# Print token, lemma, explained POS tag and stop-word flag for each token.
for sent in doc.sents:
    for tok in sent:
        print(tok, tok.lemma_, spacy.explain(tok.pos_), tok.is_stop)
    # Stop after the first sentence; `sent` stays bound to it after the loop.
    break

In [None]:
# Print each token alongside its morphological features.
for sent in doc.sents:
    for tok in sent:
        print(tok, tok.morph)
    # Only the first sentence is shown; `sent` remains bound to it afterwards.
    break

In [None]:
# Draw the dependency parse of the first sentence. Fetch it from `doc` directly
# instead of relying on a loop variable left behind by an earlier cell.
first_sent = next(iter(doc.sents))
displacy.render(first_sent, style='dep', jupyter=True)

# Tokenization

In [None]:
import nltk
from nltk import word_tokenize, TweetTokenizer, MWETokenizer
from transformers import AutoTokenizer

In [None]:
# `word_tokenize` needs the Punkt sentence-splitting data. Newer NLTK releases
# look it up under "punkt_tab" rather than "punkt", so request both names;
# a name unknown to the installed NLTK is reported and skipped.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# Short sample with a decimal, a hyphenated word, a two-word place name and emoji.
# Rebinds `text`, replacing the longer passage assigned earlier.
text = "I ate 8.5 ice-creams in New Delhi 🥶😇"

In [None]:
# NLTK's default word tokenizer; the cell displays the resulting token list.
word_tokenize(text)

In [None]:
# Tokenize the same sample with NLTK's TweetTokenizer for comparison.
tokenizer = TweetTokenizer()
tokenizer.tokenize(text)

In [None]:
# Multi-word-expression tokenizer: register ('New', 'Delhi') as one expression,
# then apply it to the output of `word_tokenize`.
tokenizer = MWETokenizer()
tokenizer.add_mwe(('New', 'Delhi'))
tokenizer.tokenize(word_tokenize(text))

## Subword Tokenization

**1. Byte-Pair Encoding (BPE):** BPE relies on a pre-tokenizer that splits the training data into words. Pre-tokenization can be as simple as space tokenization. After pre-tokenization, a set of unique words has been created and the frequency with which each word occurred in the training data has been determined. Next, BPE creates a base vocabulary consisting of all symbols that occur in the set of unique words and learns merge rules to form a new symbol from two symbols of the base vocabulary. It does so until the vocabulary has attained the desired vocabulary size. Used by GPT, GPT-2, and RoBERTa models.  

**2. WordPiece:** WordPiece first initializes the vocabulary to include every character present in the training data and progressively learns a given number of merge rules. In contrast to BPE, WordPiece does not choose the most frequent symbol pair, but the one that maximizes the likelihood of the training data once added to the vocabulary. Used by BERT, DistilBERT, and Electra.  

**3. SentencePiece:** The tokenizers above assume that the input text uses spaces to separate words. However, not all languages use spaces to separate words. To solve this, SentencePiece treats the input as a raw input stream, thus including the space in the set of characters to use. It then uses the BPE or unigram algorithm to construct the appropriate vocabulary. Some models that use SentencePiece are ALBERT, XLNet, Marian, and T5.

In [None]:
from transformers import GPT2Tokenizer, BertTokenizer, XLNetTokenizer

In [None]:
# Load one pretrained tokenizer per model family so their outputs can be compared below.
gpt2_tokenizer   = GPT2Tokenizer.from_pretrained("gpt2")
bert_tokenizer   = BertTokenizer.from_pretrained("bert-base-cased")
xlnet_tokenizer  = XLNetTokenizer.from_pretrained("xlnet-base-cased")

In [None]:
text = "It was the best of times, it was the worst of times."

# Tokenize the same sentence with each tokenizer, one labelled line per tokenizer.
labelled_tokenizers = (
    ("GPT2 Tokenizer: ", gpt2_tokenizer),
    ("BERT Tokenizer: ", bert_tokenizer),
    ("XLNT Tokenizer: ", xlnet_tokenizer),
)
for label, subword_tokenizer in labelled_tokenizers:
    print(label, subword_tokenizer.tokenize(text))

In [None]:
to_embed = "We would like to embed this extremely short text with an unknown word zozofah!"

# Encode to ids, map the ids back to tokens, and print one line per tokenizer
# (GPT-2, BERT, XLNet, in that order).
for subword_tokenizer in (gpt2_tokenizer, bert_tokenizer, xlnet_tokenizer):
    token_ids = subword_tokenizer.encode(to_embed)
    print(subword_tokenizer.convert_ids_to_tokens(token_ids))

## Finetuning and Evaluation on MRPC dataset

In [None]:
import torch

from datasets import load_dataset, list_datasets, list_metrics
from pprint import pprint
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification

In [None]:
# Fetch what `list_datasets()` returns and show its length and first ten entries.
# NOTE(review): the variable `datasets` shares its name with the library package;
# only names are imported from the package here, so nothing is shadowed, but it is easy to misread.
datasets = list_datasets()
print(len(datasets), datasets[0:10])

In [None]:
# Fetch the detailed listing and take the entry at the position where 'glue' sits in `datasets`.
# NOTE(review): assumes both `list_datasets` calls return entries in the same order,
# and that 'glue' is present in the list (otherwise `.index` raises ValueError) -- confirm.
glue_dataset = list_datasets(with_details=True)[datasets.index('glue')]

In [None]:
# Pretty-print the detailed GLUE entry selected above.
pprint(glue_dataset)

### GLUE: A MULTI-TASK BENCHMARK AND ANALYSIS PLATFORM FOR NATURAL LANGUAGE UNDERSTANDING

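GLUE is built around nine sentence-understanding tasks, including MRPC, STS-B, QQP, and several NLI tasks; the task page lists 11 entries because MNLI is split into matched and mismatched sets and a diagnostic set is included. More details are available on https://gluebenchmark.com/tasks.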

MRPC (Microsoft Research Paraphrase Corpus): https://www.microsoft.com/en-us/download/details.aspx?id=52398

5800 pairs of sentences have been extracted from news sources on the web, along with human annotations indicating whether each pair captures a paraphrase/semantic equivalence relationship.

In [None]:
# Load the train and test splits of the MRPC configuration of GLUE.
train_dataset = load_dataset('glue', 'mrpc', split='train')
test_dataset = load_dataset('glue', 'mrpc', split='test')

In [None]:
# Tokenizer matching the 'bert-base-cased' checkpoint that is fine-tuned below.
# Rebinds `tokenizer`, which earlier cells used for the NLTK tokenizers.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [None]:
def encode(examples):
    """Tokenize the 'sentence1'/'sentence2' columns of a batch as sentence pairs.

    Uses the module-level `tokenizer` with truncation enabled and padding to
    the maximum length, and returns whatever the tokenizer returns.
    """
    first_sentences = examples['sentence1']
    second_sentences = examples['sentence2']
    return tokenizer(
        first_sentences,
        second_sentences,
        truncation=True,
        padding='max_length',
    )

# Apply the encoder batch-wise to both splits, then inspect one encoded training example.
train_dataset = train_dataset.map(encode, batched=True)
test_dataset = test_dataset.map(encode, batched=True)
print(train_dataset[0])

In [None]:
# Show the field names of the first training example and its 'label' value.
print(train_dataset[0].keys(), train_dataset[0]['label'])

Let's use a BERT model for [classification](https://huggingface.co/docs/transformers/en/model_doc/bert#transformers.BertForSequenceClassification).

In [None]:
# Load the pretrained 'bert-base-cased' checkpoint through the sequence-classification auto class.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased')

In [None]:
# Copy the 'label' column into a 'labels' column, which is the name passed on to the model below.
train_dataset = train_dataset.map(lambda examples: {'labels': examples['label']}, batched=True)
# Expose only the listed columns, as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
# NOTE(review): no `shuffle=True` is passed, so batches follow dataset order every epoch -- confirm this is intended.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=16)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# AdamW over all model parameters with a learning rate of 1e-5.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)

In [None]:
# Switch the model to training mode and move it to `device`.
# As the last expression of the cell, the returned module is also displayed.
model.train().to(device)

In [None]:
# Directory that receives `model.pt`. An empty string would make
# f'{ckpt_path}/model.pt' resolve to '/model.pt' at the filesystem root,
# so default to the current working directory instead.
ckpt_path = "."

In [None]:
# Fine-tune for two epochs over `train_dataloader`.
# Set training mode here as well, so re-running this cell after the evaluation
# cell (which calls `model.eval()`) does not silently train in eval mode.
model.train()
for epoch in range(2):
    for i, batch in enumerate(tqdm(train_dataloader)):
        # Move every tensor of the batch to the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # The batch includes 'labels', so the model output carries the loss.
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if i % 10 == 0:
            # `.item()` prints a plain number instead of the tensor repr.
            print(f"loss: {loss.item():.4f}")

    # Checkpoint every fifth epoch; with two epochs this fires only after epoch 0.
    if epoch % 5 == 0:
        torch.save(model, f'{ckpt_path}/model.pt')

In [None]:
# Save the whole model object after training, to the same file the in-loop checkpoint writes.
torch.save(model, f'{ckpt_path}/model.pt')

In [None]:
# Same preparation as for the training split: add a 'labels' column,
# expose the listed columns as torch tensors, and batch in groups of 16.
test_dataset = test_dataset.map(lambda examples: {'labels': examples['label']}, batched=True)
test_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=16)

In [None]:
# Put the model in evaluation mode before predicting on the test split.
model.eval()

# Flat Python lists of reference labels and predicted class ids, in batch order.
all_ground_truth = []
all_predictions = []

with torch.no_grad():  # no gradients are needed for inference
    for batch in tqdm(test_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        # Read the logits by name: the positional index of the logits depends on
        # whether labels were passed, the attribute does not.
        logits = model(**batch).logits
        predicted_class_ids = logits.argmax(dim=-1)
        # `Tensor.tolist()` already returns plain Python values.
        all_ground_truth.extend(batch['labels'].tolist())
        all_predictions.extend(predicted_class_ids.tolist())

## Evaluation

In [None]:
# Install the `evaluate` package, which is imported in the next cell.
!pip install evaluate

In [None]:
import evaluate

In [None]:
# Load the "accuracy" metric and score the collected predictions against the reference labels.
accuracy_metric = evaluate.load("accuracy")
results = accuracy_metric.compute(references=all_ground_truth, predictions=all_predictions)
print(results)

## References

1. spaCy: https://spacy.io/  
2. Tokenization: https://neptune.ai/blog/tokenization-in-nlp  
3. HF tokenizers: https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb  
4. GLUE paper: https://openreview.net/pdf?id=rJ4km2R5t7  
5. HF Datasets quickstart: https://huggingface.co/docs/datasets/quickstart  