# Test BLOOM
September 2022

Test using BLOOM model through huggingface.

In [7]:
from transformers import (
    BloomModel,
    BloomConfig, 
    BloomTokenizerFast,
    BloomForCausalLM,
    BloomForSequenceClassification,
    BloomForTokenClassification
)
import torch 

In [2]:
# Initializing a Bloom configuration
configuration = BloomConfig()

# Initializing a model from the configuration
model = BloomModel(configuration)

# Accessing the model configuration
configuration = model.config

In [3]:
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-350m")
model = BloomModel.from_pretrained("bigscience/bloom-350m")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

Downloading tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/13.8M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/710 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

In [4]:
last_hidden_states

tensor([[[ 1.9988e+00,  2.1394e+00, -3.3615e+00,  ..., -1.0710e+03,
          -2.3329e+00,  6.6761e+00],
         [-2.7488e+00, -1.3973e+00, -3.2516e+00,  ..., -9.0800e+02,
          -1.8817e-01,  2.5218e+00],
         [ 5.5479e-01, -2.2881e+00, -4.1895e-01,  ..., -9.4320e+02,
           2.3407e+00,  5.4891e+00],
         [ 1.3067e+00, -3.3615e+00, -1.9271e+00,  ..., -1.0148e+03,
           3.0082e+00,  6.0203e+00],
         [-2.3621e+00, -6.1651e+00, -1.9696e+00,  ..., -9.6746e+02,
           2.2044e+00,  4.4250e+00],
         [ 1.9945e+00, -6.8697e+00,  8.5640e-01,  ..., -9.9383e+02,
           1.5745e+00,  8.3419e+00]]], grad_fn=<ViewBackward0>)

# BLoomForCausalLM

In [25]:
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-350m")
model = BloomForCausalLM.from_pretrained("bigscience/bloom-350m")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs, labels=inputs["input_ids"])
loss = outputs.loss
logits = outputs.logits
print(tokenizer.decode(outputs[0]))

TypeError: Can't convert 4.195248126983643 to Sequence

In [9]:
logits

tensor([[[346.8376, 349.7458, 361.8978,  ..., 204.2956, 204.2954, 204.2904],
         [333.0381, 333.2141, 343.7316,  ..., 197.0953, 197.0948, 197.0892],
         [401.7427, 404.6340, 417.7802,  ..., 208.0413, 208.0413, 208.0355],
         [402.8060, 404.7499, 423.0336,  ..., 207.9611, 207.9613, 207.9560],
         [408.2845, 405.2424, 424.2576,  ..., 206.7770, 206.7772, 206.7714],
         [410.1431, 410.2028, 430.8823,  ..., 208.3094, 208.3098, 208.3039]]],
       grad_fn=<UnsafeViewBackward0>)

In [33]:
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-350m")
model = BloomForCausalLM.from_pretrained("bigscience/bloom-350m")

prompt = "The capital of Illinois is"
input_token_ids = tokenizer(prompt, return_tensors='pt')
output = model.generate(**input_token_ids, max_new_tokens=20)
print(tokenizer.decode(output[0]))

The capital of Illinois is located in the city of Chicago. The capital of Illinois is located in the city of Chicago. The


In [34]:
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-350m")
model = BloomForCausalLM.from_pretrained("bigscience/bloom-350m")

prompt = "I love my dog because"
input_token_ids = tokenizer(prompt, return_tensors='pt')
output = model.generate(**input_token_ids, max_new_tokens=20)
print(tokenizer.decode(output[0]))

I love my dog because he is my best friend. He is my best friend because he is my best friend. He is


In [18]:
output[0]

tensor([    39, 216251,   1306,    368,   7733,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0])

# BloomForSequenceClassification

In [11]:
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-350m")
model = BloomForSequenceClassification.from_pretrained("bigscience/bloom-350m")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]


Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'LABEL_0'

In [12]:
logits

tensor([[93.1177, 92.4969]])

# BloomForTokenClassification

In [13]:
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-350m")
model = BloomForTokenClassification.from_pretrained("bigscience/bloom-350m")

inputs = tokenizer(
    "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt"
)

with torch.no_grad():
    logits = model(**inputs).logits

predicted_token_class_ids = logits.argmax(-1)

# Note that tokens are classified rather then input words which means that
# there might be more predicted token classes than words.
# Multiple token classes might account for the same word
predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
predicted_tokens_classes


Some weights of BloomForTokenClassification were not initialized from the model checkpoint at bigscience/bloom-350m and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['LABEL_0',
 'LABEL_0',
 'LABEL_0',
 'LABEL_0',
 'LABEL_0',
 'LABEL_0',
 'LABEL_0',
 'LABEL_0',
 'LABEL_0',
 'LABEL_0',
 'LABEL_0',
 'LABEL_0']

In [14]:
len(predicted_tokens_classes)

12