# transformers: Bidirectional BERT-like LMs

In [None]:
import torch
from transformers import (
    set_seed,
    AutoTokenizer,
    AutoModel, # feature extractors (without task-specific head)
    AutoModelForSequenceClassification # sequence classification (with final head)
    # AutoModelForMaskedLM # bidirectional models (BERT-like, encoder-only)
)

In [None]:
# seed the random number generators so that repeated runs are reproducible
set_seed(seed=123)

## Load base model

In [None]:
# set model name (identifier of the pretrained checkpoint that is passed
# to the tokenizer and model `from_pretrained` calls below)
base_model_name = 'distilbert/distilbert-base-uncased'

In [None]:
# pick the target device: first CUDA GPU if one is available, else the CPU
if torch.cuda.is_available():
    device_map = 'cuda:0'
else:
    device_map = 'cpu'

In [None]:
# instantiate the tokenizer that belongs to the base checkpoint
base_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=base_model_name
)

# print a summary of the tokenizer
print(base_tokenizer)

In [None]:
# load the pretrained base model (feature extractor without task-specific head)
base_model = AutoModel.from_pretrained(
    base_model_name,
    device_map=device_map,
    torch_dtype=torch.bfloat16 # use brain floating point format
)

# switch to evaluation mode for inference
base_model = base_model.eval()

print(f'Model device: {base_model.device}')
print(f'Model dtype: {base_model.dtype}')

# the footprint is divided by 2**30 so that the value matches the 'GiB' label
# (scaling by 1e-9 would give decimal GB instead)
print(f'Memory footprint: {base_model.get_memory_footprint() / 2**30:.2f} GiB')

# print the module structure
print(base_model)

## Compute embeddings

In [None]:
# example sentences to embed (batch of two)
raw_input = [
    'A rabbit goes into the supermarket.',
    'One, two, three, four, five.'
]

# tokenize the batch and move the resulting tensors to the model's device
model_input = base_tokenizer(
    raw_input,
    add_special_tokens=True, # add the tokenizer's special tokens (e.g. CLS)
    padding=True, # pad the sequences in the batch to a common length
    truncation=True, # cut off sequences that exceed the max. length
    return_tensors='pt' # return PyTorch tensors
).to(base_model.device)

print(model_input)

In [None]:
# map every sequence of token IDs back to its token strings and print it
batch_token_ids = model_input['input_ids']
for sequence_ids in batch_token_ids:
    tokens = base_tokenizer.convert_ids_to_tokens(sequence_ids)
    print(tokens)

In [None]:
# compute embeddings; this is inference only, so gradient tracking is
# disabled to avoid building an autograd graph for the forward pass
with torch.no_grad():
    base_out = base_model(**model_input)

last_hidden_state = base_out.last_hidden_state # (batch, sequence, features)

print(f'Embeddings shape: {last_hidden_state.shape}')

## Load classifier

In [None]:
# set model name (identifier of the fine-tuned classification checkpoint that
# is passed to the tokenizer and model `from_pretrained` calls below)
classif_model_name = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english'

In [None]:
# instantiate the tokenizer that belongs to the classifier checkpoint
classif_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=classif_model_name
)

# print a summary of the tokenizer
print(classif_tokenizer)

In [None]:
# load the pretrained classifier (model with sequence classification head)
classif_model = AutoModelForSequenceClassification.from_pretrained(
    classif_model_name,
    device_map=device_map,
    torch_dtype=torch.bfloat16 # use brain floating point format
)

# switch to evaluation mode for inference
classif_model = classif_model.eval()

print(f'Model device: {classif_model.device}')
print(f'Model dtype: {classif_model.dtype}')

# the footprint is divided by 2**30 so that the value matches the 'GiB' label
# (scaling by 1e-9 would give decimal GB instead)
print(f'Memory footprint: {classif_model.get_memory_footprint() / 2**30:.2f} GiB')

# print the module structure
print(classif_model)

## Classify sequences

In [None]:
# example sentences to classify (batch of two)
raw_input = [
    'This is great.',
    'This is awful.'
]

# tokenize the batch and move the resulting tensors to the model's device
model_input = classif_tokenizer(
    raw_input,
    add_special_tokens=True, # add the tokenizer's special tokens (e.g. CLS)
    padding=True, # pad the sequences in the batch to a common length
    truncation=True, # cut off sequences that exceed the max. length
    return_tensors='pt' # return PyTorch tensors
).to(classif_model.device)

print(model_input)

In [None]:
# map every sequence of token IDs back to its token strings and print it
batch_token_ids = model_input['input_ids']
for sequence_ids in batch_token_ids:
    tokens = classif_tokenizer.convert_ids_to_tokens(sequence_ids)
    print(tokens)

In [None]:
# predict logits; this is inference only, so gradient tracking is
# disabled to avoid building an autograd graph for the forward pass
with torch.no_grad():
    classif_out = classif_model(**model_input)

logits = classif_out.logits # (batch, labels)

print(f'Logits shape: {logits.shape}')

In [None]:
# take the index of the largest logit per sequence as the predicted class
label_ids = logits.argmax(dim=-1)

# translate the class indices into label names via the model config
id2label = classif_model.config.id2label
labels = [id2label[class_idx] for class_idx in label_ids.tolist()]

print(labels)