**Motivation:** 

During the practice session with tokenizers, and various Models of AutoModel Instances, I got stuck when trying to convert the raw model output using the tokenizer.decode(). Which made me realize that, depending on the model head, decoding strategy will differ.  

In [77]:
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoModelForMaskedLM,
    AutoModelForQuestionAnswering, 
    AutoModelForMultipleChoice
)
import torch
# The above classes provide different heads to the underlying model, 

In [75]:
task_arch = {
    "text_classification":{
        "architectures":['ALBERT','BART',
 'BERT',
 'BigBird',
 'BigBird-Pegasus',
 'BioGpt',
 'BLOOM',
 'CamemBERT',
 'CANINE',
 'CodeLlama',
 'ConvBERT',
 'CTRL',
 'Data2VecText',
 'DeBERTa',
 'DeBERTa-v2',
 'DistilBERT',
 'ELECTRA',
 'ERNIE',
 'ErnieM',
 'ESM',
 'Falcon',
 'FlauBERT',
 'FNet',
 'Funnel Transformer',
 'GPT-Sw3',
 'OpenAI GPT-2',
 'GPTBigCode',
 'GPT Neo',
 'GPT NeoX',
 'GPT-J',
 'I-BERT',
 'LayoutLM',
 'LayoutLMv2',
 'LayoutLMv3',
 'LED',
 'LiLT',
 'LLaMA',
 'Longformer',
 'LUKE',
 'MarkupLM',
 'mBART',
 'MEGA',
 'Megatron-BERT',
 'Mistral',
 'Mixtral',
 'MobileBERT',
 'MPNet',
 'MPT',
 'MRA',
 'MT5',
 'MVP',
 'Nezha',
 'Nyströmformer',
 'OpenLlama',
 'OpenAI GPT',
 'OPT',
 'Perceiver',
 'Persimmon',
 'Phi',
 'PLBart',
 'QDQBert',
 'Qwen2',
 'Reformer',
 'RemBERT',
 'RoBERTa',
 'RoBERTa-PreLayerNorm',
 'RoCBert',
 'RoFormer',
 'SqueezeBERT',
 'T5',
 'TAPAS',
 'Transformer-XL',
 'UMT5',
 'XLM',
 'XLM-RoBERTa',
 'XLM-RoBERTa-XL',
 'XLNet',
 'X-MOD',
 'YOSO'],
        "AutoModelClass": "AutoModelForSequenceClassification",
        "dataset": "imdb",
        "model_used": "distilbert-base-uncased",
    },
    "token_classification":{
        "architectures":['ALBERT',
 'BERT',
 'BigBird',
 'BioGpt',
 'BLOOM',
 'BROS',
 'CamemBERT',
 'CANINE',
 'ConvBERT',
 'Data2VecText',
 'DeBERTa',
 'DeBERTa-v2',
 'DistilBERT',
 'ELECTRA',
 'ERNIE',
 'ErnieM',
 'ESM',
 'Falcon',
 'FlauBERT',
 'FNet',
 'Funnel Transformer',
 'GPT-Sw3',
 'OpenAI GPT-2',
 'GPTBigCode',
 'GPT Neo',
 'GPT NeoX',
 'I-BERT',
 'LayoutLM',
 'LayoutLMv2',
 'LayoutLMv3',
 'LiLT',
 'Longformer',
 'LUKE',
 'MarkupLM',
 'MEGA',
 'Megatron-BERT',
 'MobileBERT',
 'MPNet',
 'MPT',
 'MRA',
 'MT5',
 'Nezha',
 'Nyströmformer',
 'Phi',
 'QDQBert',
 'RemBERT',
 'RoBERTa',
 'RoBERTa-PreLayerNorm',
 'RoCBert',
 'RoFormer',
 'SqueezeBERT',
 'T5',
 'UMT5',
 'XLM',
 'XLM-RoBERTa',
 'XLM-RoBERTa-XL',
 'XLNet',
 'X-MOD',
 'YOSO'],
        "AutoModelClass": "AutoModelForTokenClassification",
        "dataset": "wnut_17",
        "model_used": "distilbert-base-uncased",
    },
    "question_answering":{
        "architectures":['ALBERT',
 'BART',
 'BERT',
 'BigBird',
 'BigBird-Pegasus',
 'BLOOM',
 'CamemBERT',
 'CANINE',
 'ConvBERT',
 'Data2VecText',
 'DeBERTa',
 'DeBERTa-v2',
 'DistilBERT',
 'ELECTRA',
 'ERNIE',
 'ErnieM',
 'Falcon',
 'FlauBERT',
 'FNet',
 'Funnel Transformer',
 'OpenAI GPT-2',
 'GPT Neo',
 'GPT NeoX',
 'GPT-J',
 'I-BERT',
 'LayoutLMv2',
 'LayoutLMv3',
 'LED',
 'LiLT',
 'LLaMA',
 'Longformer',
 'LUKE',
 'LXMERT',
 'MarkupLM',
 'mBART',
 'MEGA',
 'Megatron-BERT',
 'MobileBERT',
 'MPNet',
 'MPT',
 'MRA',
 'MT5',
 'MVP',
 'Nezha',
 'Nyströmformer',
 'OPT',
 'QDQBert',
 'Reformer',
 'RemBERT',
 'RoBERTa',
 'RoBERTa-PreLayerNorm',
 'RoCBert',
 'RoFormer',
 'Splinter',
 'SqueezeBERT',
 'T5',
 'UMT5',
 'XLM',
 'XLM-RoBERTa',
 'XLM-RoBERTa-XL',
 'XLNet',
 'X-MOD',
 'YOSO'],
        "AutoModelClass": "AutoModelForQuestionAnswering",
        "dataset": "squad",
        "model_used": "distilbert-base-uncased",
    },
    "causal_lm":{
        "architectures": ['BART',
 'BERT',
 'Bert Generation',
 'BigBird',
 'BigBird-Pegasus',
 'BioGpt',
 'Blenderbot',
 'BlenderbotSmall',
 'BLOOM',
 'CamemBERT',
 'CodeLlama',
 'CodeGen',
 'CPM-Ant',
 'CTRL',
 'Data2VecText',
 'ELECTRA',
 'ERNIE',
 'Falcon',
 'Fuyu',
 'GIT',
 'GPT-Sw3',
 'OpenAI GPT-2',
 'GPTBigCode',
 'GPT Neo',
 'GPT NeoX',
 'GPT NeoX Japanese',
 'GPT-J',
 'LLaMA',
 'Marian',
 'mBART',
 'MEGA',
 'Megatron-BERT',
 'Mistral',
 'Mixtral',
 'MPT',
 'MusicGen',
 'MVP',
 'OpenLlama',
 'OpenAI GPT',
 'OPT',
 'Pegasus',
 'Persimmon',
 'Phi',
 'PLBart',
 'ProphetNet',
 'QDQBert',
 'Qwen2',
 'Reformer',
 'RemBERT',
 'RoBERTa',
 'RoBERTa-PreLayerNorm',
 'RoCBert',
 'RoFormer',
 'RWKV',
 'Speech2Text2',
 'Transformer-XL',
 'TrOCR',
 'Whisper',
 'XGLM',
 'XLM',
 'XLM-ProphetNet',
 'XLM-RoBERTa',
 'XLM-RoBERTa-XL',
 'XLNet',
 'X-MOD'],
        "AutoModelClass":"AutoModelForCausalLM",
        "dataset": "eli5_category",
        "model_used": "distilgpt2",
    },
    "masked_lm":{
        "architectures": ['ALBERT',
 'BART',
 'BERT',
 'BigBird',
 'CamemBERT',
 'ConvBERT',
 'Data2VecText',
 'DeBERTa',
 'DeBERTa-v2',
 'DistilBERT',
 'ELECTRA',
 'ERNIE',
 'ESM',
 'FlauBERT',
 'FNet',
 'Funnel Transformer',
 'I-BERT',
 'LayoutLM',
 'Longformer',
 'LUKE',
 'mBART',
 'MEGA',
 'Megatron-BERT',
 'MobileBERT',
 'MPNet',
 'MRA',
 'MVP',
 'Nezha',
 'Nyströmformer',
 'Perceiver',
 'QDQBert',
 'Reformer',
 'RemBERT',
 'RoBERTa',
 'RoBERTa-PreLayerNorm',
 'RoCBert',
 'RoFormer',
 'SqueezeBERT',
 'TAPAS',
 'Wav2Vec2',
 'XLM',
 'XLM-RoBERTa',
 'XLM-RoBERTa-XL',
 'X-MOD',
 'YOSO'],
        "AutoModelClass": "AutoModelForMaskedLM",
        "dataset": "eli-5",
        "model_used": "distilroberta-base",
    },
    "translation":{
        "architectures": ['BART',
 'BigBird-Pegasus',
 'Blenderbot',
 'BlenderbotSmall',
 'Encoder decoder',
 'FairSeq Machine-Translation',
 'GPTSAN-japanese',
 'LED',
 'LongT5',
 'M2M100',
 'Marian',
 'mBART',
 'MT5',
 'MVP',
 'NLLB',
 'NLLB-MOE',
 'Pegasus',
 'PEGASUS-X',
 'PLBart',
 'ProphetNet',
 'SeamlessM4T',
 'SeamlessM4Tv2',
 'SwitchTransformers',
 'T5',
 'UMT5',
 'XLM-ProphetNet'],
        "AutoModelClass": "AutoModelForSeq2SeqLM",
        "dataset": "opus_books",
        "model_used": "t5-small",
    },
    "summarization":{
        "architectures": ['BART',
 'BigBird-Pegasus',
 'Blenderbot',
 'BlenderbotSmall',
 'Encoder decoder',
 'FairSeq Machine-Translation',
 'GPTSAN-japanese',
 'LED',
 'LongT5',
 'M2M100',
 'Marian',
 'mBART',
 'MT5',
 'MVP',
 'NLLB',
 'NLLB-MOE',
 'Pegasus',
 'PEGASUS-X',
 'PLBart',
 'ProphetNet',
 'SeamlessM4T',
 'SeamlessM4Tv2',
 'SwitchTransformers',
 'T5',
 'UMT5',
 'XLM-ProphetNet'],
        "AutoModelClass": "AutoModelForSeq2SeqLM",
        "dataset": "billsum",
        "model_used": "t5-small",
    },
    "multiple_choice":{
        "architectures": ['ALBERT',
 'BERT',
 'BigBird',
 'CamemBERT',
 'CANINE',
 'ConvBERT',
 'Data2VecText',
 'DeBERTa-v2',
 'DistilBERT',
 'ELECTRA',
 'ERNIE',
 'ErnieM',
 'FlauBERT',
 'FNet',
 'Funnel Transformer',
 'I-BERT',
 'Longformer',
 'LUKE',
 'MEGA',
 'Megatron-BERT',
 'MobileBERT',
 'MPNet',
 'MRA',
 'Nezha',
 'Nyströmformer',
 'QDQBert',
 'RemBERT',
 'RoBERTa',
 'RoBERTa-PreLayerNorm',
 'RoCBert',
 'RoFormer',
 'SqueezeBERT',
 'XLM',
 'XLM-RoBERTa',
 'XLM-RoBERTa-XL',
 'XLNet',
 'X-MOD',
 'YOSO'],
        "AutoModelClass": "AutoModelForMultipleChoice",
        "dataset": "swag",
        "model_used": "bert-base-uncased",
    },
}

In [76]:
import json
# Collecting the model architectures according to the tasks
with open('task_arch.json', 'w') as mod:
    json.dump(task_arch, mod)

### Seq2Seq Classification

In [3]:
distilbert_cls_path = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(distilbert_cls_path)

In [4]:
distilbert_model = AutoModelForSequenceClassification.from_pretrained(distilbert_cls_path)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
classify_text = "This is a very nice way of getting things done"
text_tokened = tokenizer(classify_text, return_tensors='pt')

In [6]:
id2Label = {0:'negative', 1:'positive'}
label2id = {'negative':0, 'positive':0}

In [10]:
classification_logits = distilbert_model(**text_tokened).logits
predicted_class_id = classification_logits.argmax().item()
id2Label[predicted_class_id]

'negative'

### Token Classification

In [11]:
# token classification labels are part of the datasets
from datasets import load_dataset
wnut = load_dataset('wnut_17')

Downloading builder script:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading and preparing dataset wnut_17/wnut_17 (download: 782.18 KiB, generated: 1.66 MiB, post-processed: Unknown size, total: 2.43 MiB) to /root/.cache/huggingface/datasets/wnut_17/wnut_17/1.0.0/077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/39.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/66.9k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3394 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1009 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1287 [00:00<?, ? examples/s]

Dataset wnut_17 downloaded and prepared to /root/.cache/huggingface/datasets/wnut_17/wnut_17/1.0.0/077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
label_list = wnut['train'].features['ner_tags'].feature.names
label_list

['O',
 'B-corporation',
 'I-corporation',
 'B-creative-work',
 'I-creative-work',
 'B-group',
 'I-group',
 'B-location',
 'I-location',
 'B-person',
 'I-person',
 'B-product',
 'I-product']

In [13]:
distilbert_tokencls_path = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(distilbert_tokencls_path)

In [15]:
wnut['train'][0]['tokens']

['@paulwalk',
 'It',
 "'s",
 'the',
 'view',
 'from',
 'where',
 'I',
 "'m",
 'living',
 'for',
 'two',
 'weeks',
 '.',
 'Empire',
 'State',
 'Building',
 '=',
 'ESB',
 '.',
 'Pretty',
 'bad',
 'storm',
 'here',
 'last',
 'evening',
 '.']

In [16]:
example = wnut['train'][0]
tokened_input = tokenizer(example['tokens'], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokened_input['input_ids'])
tokens  # additional tokens added in beginning, and at end

['[CLS]',
 '@',
 'paul',
 '##walk',
 'it',
 "'",
 's',
 'the',
 'view',
 'from',
 'where',
 'i',
 "'",
 'm',
 'living',
 'for',
 'two',
 'weeks',
 '.',
 'empire',
 'state',
 'building',
 '=',
 'es',
 '##b',
 '.',
 'pretty',
 'bad',
 'storm',
 'here',
 'last',
 'evening',
 '.',
 '[SEP]']

In [21]:
id2label = {
    0: "O",
    1: "B-corporation",
    2: "I-corporation",
    3: "B-creative-work",
    4: "I-creative-work",
    5: "B-group",
    6: "I-group",
    7: "B-location",
    8: "I-location",
    9: "B-person",
    10: "I-person",
    11: "B-product",
    12: "I-product",
}
label2id = {
    "O": 0,
    "B-corporation": 1,
    "I-corporation": 2,
    "B-creative-work": 3,
    "I-creative-work": 4,
    "B-group": 5,
    "I-group": 6,
    "B-location": 7,
    "I-location": 8,
    "B-person": 9,
    "I-person": 10,
    "B-product": 11,
    "I-product": 12,
}

In [20]:
tokened_input.word_ids(batch_index=0)

[None,
 0,
 0,
 0,
 1,
 2,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 None]

In [22]:
text = "The Golden State Warriors are an American professional basketball team based in San Francisco."

In [23]:
inputs = tokenizer(text, return_tensors='pt')

In [28]:

tokencls_model = AutoModelForTokenClassification.from_pretrained(distilbert_tokencls_path)

with torch.no_grad():
    logits = tokencls_model(**inputs).logits

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
predictions = torch.argmax(logits, dim=2)
predicted_token_class = [id2label[t.item()] for t in predictions[0]]
predicted_token_class

['O',
 'B-corporation',
 'B-corporation',
 'B-corporation',
 'O',
 'B-corporation',
 'B-corporation',
 'B-corporation',
 'B-corporation',
 'B-corporation',
 'B-corporation',
 'B-corporation',
 'B-corporation',
 'B-corporation',
 'B-corporation',
 'B-corporation',
 'B-corporation']

### Question Answering

In [32]:
distilbert_qa_path = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(distilbert_qa_path)
qa_model = AutoModelForQuestionAnswering.from_pretrained(distilbert_qa_path)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
question = "How many programming languages does BLOOM support?"
context = """BLOOM has 176 billion parameters and can 
generate text in 46 languages natural languages and 13 programming languages."""

In [34]:
inputs = tokenizer(question, context, return_tensors='pt')  
# question , context are a single tensor after tokenisation 

In [35]:
with torch.no_grad():
    outputs = qa_model(**inputs) 

In [36]:
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

In [38]:
inputs.input_ids[0]

tensor([[  101,  2129,  2116,  4730,  4155,  2515, 13426,  2490,  1029,   102,
         13426,  2038, 18561,  4551, 11709,  1998,  2064,  9699,  3793,  1999,
          4805,  4155,  3019,  4155,  1998,  2410,  4730,  4155,  1012,   102]])

In [39]:
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

''

### Causal Language Modeling

In [43]:
# Text Generation >> Uses the generate followed by tokenizer.decode

# model_tg = 'gpt2'
model_tg = 'distilgpt2'

model_tokenizer = AutoTokenizer.from_pretrained(model_tg)
causal_gpt = AutoModelForCausalLM.from_pretrained(model_tg)

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [45]:
# model_tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 
# can't do this, as it will raise when decoding the model output
print(model_tokenizer.pad_token_id)
print(model_tokenizer.eos_token_id)

None
50256


In [46]:
model_tokenizer.pad_token_id = model_tokenizer.eos_token_id

In [47]:
sentence = "This is a seed sentence for enormous experiments that will be successful"
# tokened = model_tokenizer(sentence, padding='max_length', max_length=20, return_tensors='pt')
tokened = model_tokenizer(sentence, return_tensors='pt')

In [49]:
tokened

{'input_ids': tensor([[ 1212,   318,   257,  9403,  6827,   329,  9812, 10256,   326,   481,
           307,  4388]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [48]:
causal_gpt.can_generate()

True

In [50]:
model_logits = causal_gpt.generate(tokened['input_ids'],
                                  max_new_tokens=100, do_sample=True,
                                  top_p=0.95)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [51]:
# text generation output decoded 
text_output = model_tokenizer.decode(model_logits[0], skip_special_tokens=True)
text_output

'This is a seed sentence for enormous experiments that will be successful in a short time, which will allow us to test the impact that the experiment would have on the human genome, which is much faster than in a biological experiment.\n\n\n\nWe think that for the first time it will be a new way to learn about the impact of genes that are involved in a human genome.\nWe think that the next evolution will be the ability for other human cells to reproduce into other cells on the same size as our own.\nWe have seen how people use'

### Masked Language Modeling

In [53]:
model_masked_path = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_masked_path)
model_masked = AutoModelForMaskedLM.from_pretrained(model_masked_path)

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [56]:
text = "The Milky Way is a <mask> galaxy."
inputs = tokenizer(text, return_tensors='pt')
mask_token_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]

In [57]:
logits = model_masked(**inputs).logits
mask_token_logits = logits[0, mask_token_index, :]

In [58]:
mask_token_logits

tensor([[-2.0420, -3.1702,  3.7433,  ..., -1.2430, -2.4243,  1.4616]],
       grad_fn=<IndexBackward0>)

In [60]:
top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist()
for tokens in top_3_tokens:
    print(text.replace(tokenizer.mask_token, tokenizer.decode([tokens])))

2024-02-08 05:29:47.016871: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-08 05:29:47.017017: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-08 05:29:47.162271: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


The Milky Way is a  spiral galaxy.
The Milky Way is a  dwarf galaxy.
The Milky Way is a  massive galaxy.


### Translation

In [63]:
model_translation_path = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(model_translation_path)
model_translation = AutoModelForSeq2SeqLM.from_pretrained(model_translation_path)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [64]:
text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."

In [66]:
inputs = tokenizer(text, return_tensors="pt").input_ids
outputs = model_translation.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)

In [67]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

"Les lègumes partagent les ressources avec les bactéries fixant l'azote."

### Summarisation

In [70]:
model_summarisation_path = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(model_summarisation_path)
model_summarisation = AutoModelForSeq2SeqLM.from_pretrained(model_summarisation_path)

In [71]:
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

In [72]:
inputs = tokenizer(text, return_tensors="pt").input_ids

In [74]:
outputs = model_summarisation.generate(inputs, max_new_tokens=100, do_sample=False)
tokenizer.decode(outputs[0], skip_special_tokens=True)

"the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in american history. it will ask the ultra-wealthy and corporations to pay their fair share."

### Multiple Choice 

In [78]:
model_multi_path = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_multi_path)
model_multi = AutoModelForMultipleChoice.from_pretrained(model_multi_path)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [79]:
prompt = "France has a bread law, Le Décret Pain, with strict rules on what is allowed in a traditional baguette."
candidate1 = "The law does not apply to croissants and brioche."
candidate2 = "The law applies to baguettes."

In [80]:
inputs = tokenizer([[prompt, candidate1],
                    [prompt, candidate2]], 
                   return_tensors="pt", padding=True)

labels = torch.tensor(0).unsqueeze(0)

In [81]:
outputs = model_multi(**{k: v.unsqueeze(0) for k, v in inputs.items()},
                      labels=labels)
logits = outputs.logits
predicted_class = logits.argmax().item()
predicted_class

1