In [None]:
!pip install transformers >> /dev/null

### Model classes

- provide methods for loading a model from the Hub or from a local directory

- resize the input token embeddings when new tokens are added

- prune the attention heads of the model

- Mixins with Additional Methods: 
    
    > ModuleUtilMixin (pytorch models)
    
    > GenerationMixin (Generation models)

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# Presumably required for the `push_to_hub` upload performed further down.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Load the pretrained "bert-base-cased" checkpoint through the Auto class.
# `AutoTokenizer` and `np` are not used in this cell; later cells rely on them.
from transformers import AutoModel, AutoTokenizer
import numpy as np

model = AutoModel.from_pretrained("bert-base-cased")

# Attach custom tags to the model.
model.add_model_tags(["custom", "custom_bert"])

In [None]:
# Upload the model to the Hub under the repository name "custom_bert".
model.push_to_hub("custom_bert")

In [None]:
# Ask the model whether it supports text generation through `.generate()`.
model.can_generate()

In [None]:
# Reload the checkpoint with an extra keyword argument; `output_attentions=True`
# is expected to be stored on `model.config` (checked in the next cell).
model = AutoModel.from_pretrained("bert-base-cased", output_attentions=True)

In [None]:
# Verify that the `output_attentions=True` keyword given to `from_pretrained`
# was stored on the model configuration.
assert model.config.output_attentions, "output_attentions was not set on the config"

**low_cpu_mem_usage algorithm:**

This is an experimental feature that loads the model using roughly 1x the model size in CPU memory.

Here is how it works:

- save which state_dict keys we have

- drop state_dict before the model is created, since the latter takes 1x model size CPU memory

- after the model has been instantiated switch to the meta device all params/buffers that are going to be replaced from the loaded state_dict

- load state_dict 2nd time

- replace the params/buffers from the state_dict

- there is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded



<> torch_dtype (str or torch.dtype, optional) — Override the default torch.dtype and load the model under a specific dtype. 

> torch.float16 or torch.bfloat16 or torch.float: load in a specified dtype, ignoring the model’s config.torch_dtype if one exists. If not specified, the model is loaded in torch.float (fp32).


> "auto" - the torch_dtype entry in the model's config.json file is used if it is present.

<> device_map  — A map that specifies where each submodule should go. If we only pass the device (e.g., "cpu", "cuda:1", "mps", or a GPU ordinal rank like 1) on which the model will be allocated, the device map will map the entire model to this device. 

> Passing device_map = 0 means put the whole model on GPU 0.

In [None]:
# Memory footprint of the model (reported in bytes per the library docs);
# return_buffers=True asks for buffers to be counted as well.
model.get_memory_footprint(return_buffers=True)

In [None]:
# Nothing is displayed here: presumably the bare encoder has no output-embedding
# (LM head) layer, so the call returns None — TODO confirm.
model.get_output_embeddings()  # no output

In [None]:
# Display the model's input (token) embedding module.
model.get_input_embeddings()

In [None]:
# List every parameter by name together with its shape. Printing the tensors
# themselves would dump every weight value of the model into the cell output.
for name, parm in model.named_parameters():
    print(name, tuple(parm.shape))

In [None]:
!pip install optimum >> /dev/null

In [None]:
# Convert the model to its BetterTransformer variant; this relies on the
# `optimum` package being installed.
pymodel = model.to_bettertransformer()

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Load T5-base in half precision (torch.float16) and let device_map="auto"
# decide where the weights are placed.
# Author's note: this raised "Torch not compiled with CUDA enabled" when no GPU
# was available in the runtime.

model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-base",
                                                   torch_dtype=torch.float16,
                                                   device_map="auto")

In [None]:
# Tokenizer for the same "google-t5/t5-base" checkpoint.
tokeniser = T5Tokenizer.from_pretrained("google-t5/t5-base")

In [None]:
# Show the device placement chosen for the model's submodules.
model.hf_device_map

In [None]:
# Tokenise a sample sentence. `return_tensors="pt"` makes the tokenizer return
# PyTorch tensors instead of plain Python lists; `estimate_tokens` and
# `floating_point_ops` need tensors and otherwise fail with
# "AttributeError: 'list' object has no attribute 'numel'".
# NOTE: `input` shadows the Python builtin; the name is kept because the
# following cells refer to it.
test_sent = "This sentence is used for testing"
input = tokeniser(test_sent, return_tensors="pt")
ids = input['input_ids']
ids

In [None]:
# Display the complete tokenizer output.
input

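Note: the two calls below raise `AttributeError: 'list' object has no attribute 'numel'` when the tokenizer output contains plain Python lists. Tokenize with `return_tensors="pt"` so that the inputs are tensors.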

In [None]:
# Estimate the number of tokens contained in the model inputs.
model.estimate_tokens(input)

In [None]:
# Estimate the floating-point operations needed to process this input
# (forward and backward pass, per the library documentation).
model.floating_point_ops(input)

In [None]:
# Parameter count with no filtering: non-trainable parameters and embeddings included.
model.num_parameters(only_trainable=False, exclude_embeddings=False)

### Generation Configuration

Class that holds a configuration for a generation task. A generate call supports the following generation methods for text-decoder, text-to-text, speech-to-text, and vision-to-text models:

- greedy decoding by calling greedy_search() if num_beams=1 and do_sample=False

- contrastive search by calling contrastive_search() if penalty_alpha>0. and top_k>1

- multinomial sampling by calling sample() if num_beams=1 and do_sample=True

- beam-search decoding by calling beam_search() if num_beams>1 and do_sample=False

- beam-search multinomial sampling by calling beam_sample() if num_beams>1 and do_sample=True

- diverse beam-search decoding by calling group_beam_search(), if num_beams>1 and num_beam_groups>1

- constrained beam-search decoding by calling constrained_beam_search(), if constraints!=None or force_words_ids!=None

- assisted decoding by calling assisted_decoding(), if assistant_model is passed to .generate()

You do not need to call any of the above methods directly. Pass custom parameter values to `.generate()` instead. To learn more about decoding strategies, refer to the text generation strategies guide.

In [None]:
from transformers import GenerationConfig

# Load GPT-2's generation config while overriding `top_k` and `do_sample`.
# With return_unused_kwargs=True the result is a tuple rather than a bare config:
# a later cell reads the GenerationConfig itself with `generation_config[0]`.
generation_config = GenerationConfig.from_pretrained("gpt2", top_k=1, do_sample=True,
                                                    return_unused_kwargs=True)

In [None]:
# Display what `GenerationConfig.from_pretrained` returned.
generation_config

In [None]:
from transformers import GPT2Tokenizer, AutoModelForCausalLM, AutoTokenizer

# GPT-2 tokenizer and causal language model used for the generation examples.
tokeniser = GPT2Tokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
# Reuse the EOS token id as the padding token id.
tokeniser.pad_token_id = tokeniser.eos_token_id

In [None]:
# Build the prompt and tokenise it into PyTorch tensors (a batch of one prompt).

test = 'Today is'

inputs = tokeniser([test], return_tensors='pt')
inputs

In [None]:
# Example 1: print the scores of output, generated with greedy search
# return_dict_in_generate=True and output_scores=True make `generate` return an
# object exposing `.sequences` and `.scores`, which the next cells consume.

outputs = model.generate(**inputs,
                         max_new_tokens=5,
                         return_dict_in_generate=True,
                        output_scores=True)

In [None]:
# Inspect the structured result of `generate`.
outputs

Computes the **transition scores** of sequences given the generation scores (and beam indices, if beam search was used). This is a convenient method to quickly obtain the scores of the selected tokens at generation time.

In [None]:
# Score of each generated token. normalize_logits=True is requested so the
# values can be exponentiated into probabilities further down.
trans_scores = model.compute_transition_scores(outputs.sequences,
                                              outputs.scores, normalize_logits=True)

In [None]:
# Display the transition scores.
trans_scores

In [None]:
# Offset at which generated tokens start inside `outputs.sequences`:
# decoder-only models (GPT family) echo the prompt, so skip the prompt length;
# encoder-decoder models (Bart / T5) only prepend a single start token.

if model.config.is_encoder_decoder:
    input_length = 1
else:
    input_length = inputs.input_ids.shape[1]

In [None]:
# Display the computed offset.
input_length

In [None]:
# Keep only the newly generated tokens by dropping the first `input_length` positions.
gen_tokens = outputs.sequences[:, input_length:]
gen_tokens

In [None]:
# One row per generated token of the first sequence:
# token id | decoded text | score | exp(score) shown as a percentage.
for tok, score in zip(gen_tokens[0], trans_scores[0]):
    print(f"| {tok:5d} | {tokeniser.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")

In [None]:
# example 2 with beam search
# Four beams are explored and all four resulting sequences are returned, with scores.

outputs = model.generate(**inputs, max_new_tokens=5, num_beams=4,
                        num_return_sequences=4, return_dict_in_generate=True,
                        output_scores=True)

In [None]:
# Transition scores for the beam-search output; the beam indices are passed so
# each token's score is taken from the beam it came from. Scores are left
# un-normalised here (normalize_logits=False).
trans_scores = model.compute_transition_scores(outputs.sequences, outputs.scores,
                                              outputs.beam_indices, normalize_logits=False)

In [None]:
# Length of each returned sequence, counted as the number of negative score entries.
output_length = np.sum(trans_scores.numpy() < 0, axis=1)

length_penalty = model.generation_config.length_penalty

# Rebuild the per-sequence score: summed token scores divided by length ** length_penalty.
reconstructed_scores = trans_scores.sum(axis=1) / (output_length**length_penalty)

# Prints True when the reconstruction agrees with the scores reported by `generate`.
print(np.allclose(outputs.sequences_scores, reconstructed_scores))


In [None]:
# Greedy decoding: can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    StoppingCriteriaList,
    MaxLengthCriteria,
)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Text generation needs the language-modelling head, so load the checkpoint with
# AutoModelForCausalLM. A plain `AutoModel` returns only the backbone, which has
# no LM head and therefore cannot run the decoding methods used below.
model = AutoModelForCausalLM.from_pretrained("gpt2")

In [None]:
# Print the generation config and the model config one after the other.
print(model.generation_config)
# model.generation_config.pad_token_id = model.generation_config.eos_token_id

print(model.config)

In [None]:
# Load GPT-2's generation config with `top_k` / `do_sample` overridden.
# return_unused_kwargs=True makes this a tuple; the next cell takes element [0].
generation_config = GenerationConfig.from_pretrained("gpt2", top_k=1, do_sample=True,
                                                    return_unused_kwargs=True)

In [None]:
# Element [0] is the GenerationConfig itself; calling to_dict() on it shows a
# whole lot of parameters, not just the ones overridden above.
generation_config[0].to_dict()

In [None]:
# The following was tried in order to get greedy search to work:
# model.generation_config = {"pad_token_id": model.config.eos_token_id} # not working
# model.generation_config.pad_token_id = model.config.eos_token_id

In [None]:
input_prompt = "It might be possible to"
# NOTE: despite its name, `input_ids` holds the whole tokenizer output here;
# the id tensor itself is read later as input_ids['input_ids'].
input_ids = tokenizer(input_prompt, return_tensors='pt')

In [None]:
# Logits processors applied during decoding: a single minimum-length
# processor (min length 10) using the EOS token id from the model config.
logits_processor = LogitsProcessorList(
    [
        MinLengthLogitsProcessor(10, eos_token_id=model.config.eos_token_id)
    ]
)

In [None]:
# Stopping criteria: a single maximum-length criterion of 20.
stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])

In [None]:
# Greedy decoding through the low-level API. This requires a model with a
# language-modelling head (e.g. one loaded with AutoModelForCausalLM); a bare
# `AutoModel` backbone cannot generate text.
# `max_new_tokens` and `num_return_sequences` are options of `generate()`, not of
# `greedy_search`: here the output length is bounded by `stopping_criteria`, and
# greedy decoding yields one sequence per input row, so they are not passed.
# NOTE(review): recent transformers releases deprecate calling `greedy_search`
# directly in favour of `generate()` — confirm against the installed version.
outputs = model.greedy_search(
    input_ids['input_ids'],
    logits_processor=logits_processor,
    stopping_criteria=stopping_criteria,
    pad_token_id=model.config.eos_token_id,
    eos_token_id=model.config.eos_token_id,
    output_scores=True,
    return_dict_in_generate=True,
    output_attentions=False,
    output_hidden_states=False,
)

In [None]:
# Multinomial sampling: can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

In [None]:
# Classes needed for the multinomial-sampling example, plus a fresh GPT-2
# tokenizer and causal language model.
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    TopKLogitsWarper,
    TemperatureLogitsWarper,
    StoppingCriteriaList,
    MaxLengthCriteria,
)
import torch

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

In [None]:
# set pad_token_id to eos_token_id because GPT2 does not have a PAD token
model.config.pad_token_id = model.config.eos_token_id
model.generation_config.pad_token_id = model.config.eos_token_id

input_prompt = "Today is a beautiful day, and"
# Here `input_ids` is the id tensor itself (note the trailing `.input_ids`).
input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids

In [None]:
# Display the model's generation config after the pad token id was set.
model.generation_config

In [None]:
# Logits processors: enforce a minimum length of 15 using the EOS token id
# taken from the generation config.
logits_processor = LogitsProcessorList()
logits_processor.append(
    MinLengthLogitsProcessor(15, eos_token_id=model.generation_config.eos_token_id)
)

# Logits warpers: top-k (k=50) first, then temperature (0.7).
logits_warper = LogitsProcessorList()
logits_warper.extend([TopKLogitsWarper(50), TemperatureLogitsWarper(0.7)])

# Stopping criteria: a single maximum-length criterion of 20.
stopping_criteria = StoppingCriteriaList()
stopping_criteria.append(MaxLengthCriteria(max_length=20))


In [None]:
# Fix the RNG seed before sampling so the drawn tokens are repeatable.
torch.manual_seed(0)
# uses multinomial sampling to generate the sequences

outputs = model.sample(
    input_ids,
    logits_processor=logits_processor,
    logits_warper=logits_warper,  # this is added in sample method
    stopping_criteria=stopping_criteria,
)

# Decode the generated ids back to text, dropping special tokens.
tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
# Beam search: can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

In [None]:
# Beam search with the low-level API on an encoder-decoder model (T5).
# Relies on `torch`, `AutoTokenizer`, `LogitsProcessorList` and
# `MinLengthLogitsProcessor` having been imported by earlier cells.
from transformers import (
    BeamSearchScorer, # new class needed for beam search
    AutoModelForSeq2SeqLM
)

tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

encoder_input_str = "translate English to German: How young are you?"
encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids


# lets run beam search using 3 beams
num_beams = 3

# define decoder start token ids: one row per beam, each holding only the start token
input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
input_ids = input_ids * model.config.decoder_start_token_id

# add encoder_outputs to model keyword arguments
# (the encoder input is repeated num_beams times so there is one copy per beam)
model_kwargs = {
    "encoder_outputs": model.get_encoder()(
        encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
    )
}

# instantiate beam scorer
beam_scorer = BeamSearchScorer(
    batch_size=1,
    num_beams=num_beams,
    device=model.device,
)

# instantiate logits processors
logits_processor = LogitsProcessorList(
    [
        MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
    ]
)

outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)


In [None]:
# Decode the generated ids back to text, dropping special tokens.
tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
# Beam-search multinomial sampling: can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

In [None]:
# Retrieve the encoder of the seq2seq model and display it.
encoder = model.get_encoder()
encoder

In [None]:
# Retrieve the decoder of the seq2seq model.
decoder = model.get_decoder()

In [None]:
# Beam-search multinomial sampling with the low-level API on T5.
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    TopKLogitsWarper,
    TemperatureLogitsWarper,
    BeamSearchScorer,
)
import torch

tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

encoder_input_str = "translate English to German: How young are you?"
encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids

# lets run beam search using 3 beams
num_beams = 3

# define decoder start token ids: one row per beam, each holding only the start token
input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
input_ids = input_ids * model.config.decoder_start_token_id

# add encoder_outputs to model keyword arguments
# (the encoder input is repeated num_beams times so there is one copy per beam)
model_kwargs = {
    "encoder_outputs": model.get_encoder()(
        encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
    )
}

# instantiate beam scorer
beam_scorer = BeamSearchScorer(
    batch_size=1,
    max_length=model.config.max_length,
    num_beams=num_beams,
    device=model.device,
)

# instantiate logits processors
logits_processor = LogitsProcessorList(
    [MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id)]
)
# instantiate logits warpers (top-k first, then temperature)
logits_warper = LogitsProcessorList(
    [
        TopKLogitsWarper(50),
        TemperatureLogitsWarper(0.7),
    ]
)

# beam_sample draws tokens at random: fix the seed so re-running the cell
# reproduces the same output (as the multinomial-sampling example does).
torch.manual_seed(0)

outputs = model.beam_sample(
    input_ids,
    beam_scorer,
    logits_processor=logits_processor,
    logits_warper=logits_warper,
    **model_kwargs
)

# Decode the generated ids back to text, dropping special tokens.
tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
# Contrastive search: can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
# Constrained beam-search decoding: can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
# Diverse beam-search decoding: can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

### Model Outputs

In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch


# BERT tokenizer plus a sequence-classification model built on the same checkpoint.
# As the warning printed below reports, the classifier weights are newly
# initialised rather than loaded from the checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# unsqueeze(0) adds a leading dimension: tensor([1]) becomes tensor([[1]]).
torch.tensor([1]).unsqueeze(0)

tensor([[1]])

In [3]:
# Run one labelled example through the classifier.
inputs = tokenizer("Hello, this is a superb day.", return_tensors='pt')
labels = torch.tensor([1]).unsqueeze(0)
outputs = model(**inputs, labels=labels)  # labels are passed so the output also carries a loss

In [4]:
# Display the model output (loss and logits are populated).
outputs

SequenceClassifierOutput(loss=tensor(0.3689, grad_fn=<NllLossBackward0>), logits=tensor([[-0.2489,  0.5581]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [5]:
# Same forward pass without labels, for comparison.
outputs_wolabel = model(**inputs)
outputs_wolabel # will not have loss, as the labels are not passed

SequenceClassifierOutput(loss=None, logits=tensor([[-0.2489,  0.5581]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [6]:
# Additionally request the hidden states and attention weights in the output.
outputs = model(**inputs,
                labels=labels,
                output_hidden_states=True,
                output_attentions=True)
outputs

SequenceClassifierOutput(loss=tensor(0.3689, grad_fn=<NllLossBackward0>), logits=tensor([[-0.2489,  0.5581]], grad_fn=<AddmmBackward0>), hidden_states=(tensor([[[ 1.6855e-01, -2.8577e-01, -3.2613e-01,  ..., -2.7571e-02,
           3.8253e-02,  1.6400e-01],
         [ 3.7386e-01, -1.5575e-02, -2.4561e-01,  ..., -3.1657e-02,
           5.5144e-01, -5.2406e-01],
         [ 4.6704e-04,  1.6225e-01, -6.4443e-02,  ...,  4.9443e-01,
           6.9413e-01,  3.6286e-01],
         ...,
         [-2.6303e-01,  1.4989e-01,  1.8093e-01,  ...,  2.4644e-01,
           8.5299e-03, -6.3424e-01],
         [-1.5500e-01,  6.9230e-02, -1.6601e-01,  ...,  4.3867e-01,
           6.4413e-01,  5.9384e-01],
         [-1.4736e-01, -4.1137e-02, -7.3157e-02,  ..., -1.1568e-01,
           4.2107e-02, -5.4994e-02]]], grad_fn=<NativeLayerNormBackward0>), tensor([[[ 0.0605,  0.0289, -0.1973,  ...,  0.2396, -0.1291, -0.0037],
         [ 0.3784,  0.0968,  0.2672,  ...,  0.0520,  0.5898, -0.3454],
         [-0.3904,  0.3

In [7]:
# The output object can be sliced like a tuple: the first two items are the loss and the logits.
outputs[:2]

(tensor(0.3689, grad_fn=<NllLossBackward0>),
 tensor([[-0.2489,  0.5581]], grad_fn=<AddmmBackward0>))

In [10]:
# Recompute the full output and convert it to a plain tuple with to_tuple().
outputs = model(**inputs,
                labels=labels,
                output_hidden_states=True,
                output_attentions=True,)
tuple_out = outputs.to_tuple()