In [None]:
! pip install transformers datasets evaluate

In [2]:
import torch
import torch.nn as nn

import transformers

from transformers import pipeline
from datasets import load_dataset

import random

## DistilBert for Language Modeling

In [None]:
MODEL_TYPE = 'distilbert-base-uncased'

tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_TYPE)
model = transformers.AutoModel.from_pretrained(MODEL_TYPE)
print(f"# DistilBert Parameters: {round(model.num_parameters() / 1_000_000)}M")

text = "NLP2.0 is my favorite lecture"
encoded_input = tokenizer(text, return_tensors='pt')

output = model(**encoded_input)

BERT operates on wordpiece level.

In [None]:
tokenizer.tokenize(text)

['nl', '##p', '##2', '.', '0', 'is', 'my', 'favorite', 'lecture']

By default, the base model only contains the last hidden state as output.

In [None]:
last_hidden_state = output.last_hidden_state
last_hidden_state.shape # shape: [1, 11, 768]

torch.Size([1, 11, 768])

## Masked Language Modeling



In [11]:
MODEL_TYPE = 'distilbert-base-uncased'
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_TYPE)
model = transformers.AutoModelForMaskedLM.from_pretrained(MODEL_TYPE)

text = "The new movie was [MASK]."
inputs = tokenizer(text, return_tensors="pt")
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

logits = model(**inputs).logits
mask_token_logits = logits[0, mask_token_index, :]

top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist()

for token in top_3_tokens:
    print(text.replace(tokenizer.mask_token, tokenizer.decode([token])))


The new movie was cancelled.
The new movie was filmed.
The new movie was released.


## Language Generation
While the simple BERT variants can only produce one token at a time, there exist approaches that try to create multiple token at once. However, due to its bi-directional nature, encoder architectures perform worse than autoregressive models.

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_TYPE = "gpt2-large"

tokenizer = AutoTokenizer.from_pretrained(MODEL_TYPE)
model = AutoModelForCausalLM.from_pretrained(MODEL_TYPE)

prompt = "Today was an amazing day because"
inputs = tokenizer(prompt, return_tensors="pt")

outputs = model.generate(**inputs, do_sample=True, max_new_tokens=100)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['Today was an amazing day because I was the first to be in the room with the doctors, and what I heard was the most wonderful thing I had ever heard. It was a great day to feel normal."\n\nThis story originally appeared on GSN and was republished with permission from TIME.\n\nMORE READING\n\nIs this who Hillary Clinton really is? It is hard to tell and difficult to prove.\n\nAfter \'deplorables\': Clinton campaign turns to \'alt-right\' to defeat Trump']

## Pipeline as Alternative

In [None]:
text = "I want to eat [MASK]."
mask_filler = transformers.pipeline("fill-mask", "distilbert-base-uncased")
mask_filler(text, top_k=3)

[{'score': 0.03075120598077774,
  'token': 6350,
  'token_str': 'breakfast',
  'sequence': 'i want to eat breakfast.'},
 {'score': 0.02877492643892765,
  'token': 2242,
  'token_str': 'something',
  'sequence': 'i want to eat something.'},
 {'score': 0.02485204115509987,
  'token': 2009,
  'token_str': 'it',
  'sequence': 'i want to eat it.'}]

In [None]:
text = "Hugging Face is a community-based open-source platform for machine learning."
generator = transformers.pipeline("text-generation", "t5-small")
generator(text)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'Maria

[{'generated_text': 'Hugging Face is a community-based open-source platform for machine learning. learning.'}]

## Fine-Tune BERT on Next Sentence Prediction

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
example = tokenized_datasets['train'][0]
print(example['sentence1'])
print(example['sentence2'])
print(example['label']) # 1 means that sentence2 is the true next sentence of sentence1


Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .
Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .
1


In [None]:
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

In [None]:
for batch in train_dataloader:
    break
b = {k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


## Custom Model Head for Sequence Classification

In [None]:
import torch.nn as nn
from transformers import AutoModel
from transformers.modeling_outputs import SequenceClassifierOutput
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss

class MyBERTModel(nn.Module):
    def __init__(self, is_frozen=True):
        super(MyBERTModel, self).__init__()

        self.num_labels = 2
        checkpoint = 'bert-base-uncased'
        self.base_model = AutoModel.from_pretrained(checkpoint)

        if is_frozen:
          self.freeze()

        self.dropout = nn.Dropout(0.5)
        self.linear = nn.Linear(768, 2) # output features from bert is 768 and 2 is ur number of labels

    def freeze(self):
      for param in self.base_model.parameters():
        param.requires_grad = False

    def forward(self, input_ids, attention_mask, token_type_ids, labels):
        outputs = self.base_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        outputs = self.dropout(outputs[1])
        logits = self.linear(outputs)

        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        #return outputs, loss
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits
        )

model = MyBERTModel()
model(**{'input_ids':batch['input_ids'], 'labels':batch['labels'],  'token_type_ids':batch['token_type_ids'], 'attention_mask':batch['attention_mask']})

# Note: this is equivalent to using the AutoModelForSequenceClassification class
'''
from transformers import AutoModelForSequenceClassification

checkpoint = 'distilbert-base-uncased'
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
'''


Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


"\nfrom transformers import AutoModelForSequenceClassification\n\ncheckpoint = 'distilbert-base-uncased'\nmodel = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n"

In [None]:
model(**batch)

SequenceClassifierOutput(loss=tensor(0.8143, grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6680,  0.8221],
        [ 0.1438, -0.1971],
        [ 0.9690,  0.7177],
        [-0.7547,  0.2342],
        [ 0.4335,  0.0127],
        [ 0.7449, -0.1288],
        [ 0.1940,  0.6175],
        [ 0.7134,  0.1328]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device



1377


device(type='cuda')

In [None]:
import evaluate

def eval(model, loader):

  metric = evaluate.load("glue", "mrpc")
  model.eval()
  for batch in loader:
      batch = {k: v.to(device) for k, v in batch.items()}
      with torch.no_grad():
          outputs = model(**batch)


      logits = outputs.logits
      predictions = torch.argmax(logits, dim=-1)
      metric.add_batch(predictions=predictions, references=batch["labels"])

  return metric.compute()

eval(model, eval_dataloader)

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}

### Training Loop

In [None]:
from tqdm import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()

        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

100%|█████████▉| 1376/1377 [00:51<00:00, 26.03it/s]

In [None]:
eval(model, eval_dataloader)

{'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}

## Zero-Shot Classification via Prompting




In [None]:
classifier = pipeline("zero-shot-classification")

### Topic Modeling

In [4]:
sequence = "Who are you voting for in 2020?"
candidate_labels = ["politics", "public health", "economics"]

classifier(sequence, candidate_labels)

{'sequence': 'Who are you voting for in 2020?',
 'labels': ['politics', 'economics', 'public health'],
 'scores': [0.9725188612937927, 0.014584165997803211, 0.012896993197500706]}

In [5]:
sequence = "Who is more likely to live in a city?"
candidate_labels = ["sailor", "farmer", "mayor"]

classifier(sequence, candidate_labels)

{'sequence': 'Who is more likely to live in a city?',
 'labels': ['mayor', 'sailor', 'farmer'],
 'scores': [0.5189307332038879, 0.30701252818107605, 0.1740567982196808]}

### Sentiment Classification

In [6]:
sequences = [
    "I hated this movie. The acting sucked.",
    "This movie didn't quite live up to my high expectations, but overall I still really enjoyed it."
]
candidate_labels = ["positive", "negative"]

classifier(sequences, candidate_labels)

[{'sequence': 'I hated this movie. The acting sucked.',
  'labels': ['negative', 'positive'],
  'scores': [0.9916267991065979, 0.00837322324514389]},
 {'sequence': "This movie didn't quite live up to my high expectations, but overall I still really enjoyed it.",
  'labels': ['negative', 'positive'],
  'scores': [0.8148518800735474, 0.18514816462993622]}]

## In-Context Learning

In [None]:
rotten_tomatoes = load_dataset("rotten_tomatoes")

In [8]:
subset = rotten_tomatoes['train']
examples = [
    {'text': "i can analyze this movie in three words : thumbs friggin' down .", 'label': 0},
    {'text': "sadly , 'garth' hasn't progressed as nicely as 'wayne . '", 'label': 0},
    {'text': 'make like the title and dodge this one .', 'label': 0},
    {'text': 'constantly touching , surprisingly funny , semi-surrealist exploration of the creative act .', 'label': 1},
    {'text': 'the journey is worth your time , especially if you have ellen pompeo sitting next to you for the ride .', 'label': 1},
    {'text': 'merci pour le movie .', 'label': 1}
]

# alternative
# examples = random.choices(subset, k=6)
print(examples)

test = {'text': 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .',
 'label': 1}

TEMPLATE = lambda x: x + " Overall, it was [MASK]. "

# for simplicity, sticking to a simple verbalizer so that the model has an easier time using the correct labl
# VERBALIZER = {1: ["great", "good", "wonderful", "perfect"], 0: ["bad", "terrible", "horrible"]}
VERBALIZER = {1: ["good"], 0: ["bad"]}

def verbalize(label):
  return random.choice(VERBALIZER[example['label']])

PATTERN = lambda x: "Review: " + x

prompt = ""

for example in examples:
  out = PATTERN(example['text'])
  out = TEMPLATE(out)
  out = out.replace("[MASK]", verbalize(example['label']))
  prompt += out

prompt += TEMPLATE(PATTERN(test['text']))

ref = 'Review: i can analyze this movie in three words : thumbs friggin\' down . Overall, it was bad. Review: sadly , \'garth\' hasn\'t progressed as nicely as \'wayne . \' Overall, it was bad. Review: make like the title and dodge this one . Overall, it was bad. Review: constantly touching , surprisingly funny , semi-surrealist exploration of the creative act . Overall, it was good. Review: the journey is worth your time , especially if you have ellen pompeo sitting next to you for the ride . Overall, it was good. Review: merci pour le movie . Overall, it was good. Review: if you sometimes like to go to the movies to have fun , wasabi is a good place to start . Overall, it was [MASK]. '

assert ref == prompt, 'ref and prompt do not match '

prompt

[{'text': "i can analyze this movie in three words : thumbs friggin' down .", 'label': 0}, {'text': "sadly , 'garth' hasn't progressed as nicely as 'wayne . '", 'label': 0}, {'text': 'make like the title and dodge this one .', 'label': 0}, {'text': 'constantly touching , surprisingly funny , semi-surrealist exploration of the creative act .', 'label': 1}, {'text': 'the journey is worth your time , especially if you have ellen pompeo sitting next to you for the ride .', 'label': 1}, {'text': 'merci pour le movie .', 'label': 1}]


"Review: i can analyze this movie in three words : thumbs friggin' down . Overall, it was bad. Review: sadly , 'garth' hasn't progressed as nicely as 'wayne . ' Overall, it was bad. Review: make like the title and dodge this one . Overall, it was bad. Review: constantly touching , surprisingly funny , semi-surrealist exploration of the creative act . Overall, it was good. Review: the journey is worth your time , especially if you have ellen pompeo sitting next to you for the ride . Overall, it was good. Review: merci pour le movie . Overall, it was good. Review: if you sometimes like to go to the movies to have fun , wasabi is a good place to start . Overall, it was [MASK]. "

In [9]:
mask_filler = transformers.pipeline("fill-mask", "distilbert-base-uncased")
mask_filler(prompt, top_k=1)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'score': 0.5967800617218018,
  'token': 2204,
  'token_str': 'good',
  'sequence': "review : i can analyze this movie in three words : thumbs friggin'down. overall, it was bad. review : sadly,'garth'hasn't progressed as nicely as'wayne.'overall, it was bad. review : make like the title and dodge this one. overall, it was bad. review : constantly touching, surprisingly funny, semi - surrealist exploration of the creative act. overall, it was good. review : the journey is worth your time, especially if you have ellen pompeo sitting next to you for the ride. overall, it was good. review : merci pour le movie. overall, it was good. review : if you sometimes like to go to the movies to have fun, wasabi is a good place to start. overall, it was good."}]