# transformers: Autoregressive GPT-like LMs

In [None]:
import torch
from transformers import (
    set_seed,
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM # autoregressive models (GPT-like, decoder-only)
)

In [None]:
# seed the random number generators so that sampled generations are repeatable
set_seed(seed=123)

## Load model

In [None]:
# set model name (checkpoint identifier used by the tokenizer, model and pipeline loaders)
model_name = 'microsoft/DialoGPT-medium'
# alternative checkpoint (uncomment to use it instead):
# model_name = 'distilbert/distilgpt2'

In [None]:
# build the tokenizer for the selected checkpoint
# left padding (not the default) is requested directly at construction time
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')

# reuse the end-of-sequence token as padding token (for batched inputs)
tokenizer.pad_token = tokenizer.eos_token

print(tokenizer)

In [None]:
# load the causal LM for the selected checkpoint onto the CPU
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='cpu',
    torch_dtype=torch.bfloat16, # use brain floating point format
    # load_in_4bit=True, # use quantization technique
)
# switch to evaluation mode
model = model.eval()

print(f'Model device: {model.device}')
print(f'Model dtype: {model.dtype}')
# the footprint is reported in bytes; one GiB is 2**30 bytes (1e-9 would yield decimal GB)
print(f'Memory footprint: {model.get_memory_footprint() / 2**30:.2f} GiB')

print(model)

In [None]:
# load pipeline (tokenizer and model)
# note that loading by name creates a model instance separate from `model`
pipe = pipeline(
    'text-generation',
    model=model_name,
    device_map='cpu',
    torch_dtype=torch.bfloat16,
)

# use left padding for batched inputs with different lengths
pipe.tokenizer.padding_side = 'left'

# padding a batch requires a pad token; fall back to the EOS token if none is defined
if pipe.tokenizer.pad_token is None:
    pipe.tokenizer.pad_token = pipe.tokenizer.eos_token

print(f'Model device: {pipe.device}')
print(f'Model dtype: {pipe.model.dtype}')
# the footprint is reported in bytes; one GiB is 2**30 bytes (1e-9 would yield decimal GB)
print(f'Memory footprint: {pipe.model.get_memory_footprint() / 2**30:.2f} GiB')

## Run pipeline

In [None]:
# initialize chat (list of messages, each with a role and a content)
chat = [
    {
        'role': 'system',
        'content': 'You are a chatbot equipped with nearly endless wisdom.'
    },
    {
        'role': 'user',
        'content': 'Hey, can you tell me any fun things to do in Munich?'
    }
]

# run pipeline
# the pad token id is read from the pipeline's own tokenizer,
# so this cell does not depend on the separately created `tokenizer`
response = pipe(
    chat,
    max_new_tokens=512, # set max. number of newly generated tokens
    do_sample=True, # turn on sampling
    temperature=1.0, # set softmax temperature
    pad_token_id=pipe.tokenizer.eos_token_id, # set pad token
)

# note that the generated response contains the entire chat
for chat_item in response[0]['generated_text']:
    print(chat_item)

In [None]:
# continue the chat from the previous response
# (this is the same list object as in `response`, so appending extends it in place)
chat = response[0]['generated_text']

chat.append(
    {'role': 'user', 'content': 'Wait, what?'}
)

# run pipeline
# the pad token id is read from the pipeline's own tokenizer,
# so this cell does not depend on the separately created `tokenizer`
response = pipe(
    chat,
    max_new_tokens=512, # set max. number of newly generated tokens
    do_sample=True, # turn on sampling
    temperature=1.0, # set softmax temperature
    pad_token_id=pipe.tokenizer.eos_token_id, # set pad token
)

# the generated response again contains the entire chat
for chat_item in response[0]['generated_text']:
    print(chat_item)

## Run model

In [None]:
# create prompts (a batch of two inputs)
prompt = [
    'How does it feel to be a chatbot?',
    'A list of colors: red, blue'
]

# tokenize
model_input = tokenizer(
    prompt,
    add_special_tokens=True, # add special tokens at beginning/end of sentence (not supported by all tokenizers)
    padding=True, # turn on padding (for batched inputs)
    return_tensors='pt',
)

# transfer to the device of the model
# (this keeps the inputs and the model together if `device_map` is ever changed)
model_input = model_input.to(model.device)

print(model_input)

In [None]:
# show the token strings of every sequence in the batch
for sequence_ids in model_input['input_ids']:
    sequence_tokens = tokenizer.convert_ids_to_tokens(sequence_ids)
    print(sequence_tokens)

In [None]:
# collect the generation settings
generation_kwargs = {
    'max_new_tokens': 100, # set max. number of newly generated tokens
    'do_sample': True, # turn on sampling
    'temperature': 1.0, # set softmax temperature
    'pad_token_id': tokenizer.eos_token_id, # set pad token
}

# generate response for the whole batch
gen_ids = model.generate(**model_input, **generation_kwargs)

# decode all sequences of the batch at once
# (a single sequence could be decoded via tokenizer.decode(gen_ids[0], skip_special_tokens=True))
gen = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)

# note that the generated output contains the input
print(gen_ids)
print(gen)

In [None]:
# extract features from the base model (the network without the LM head)
# `base_model` resolves to the backbone for any architecture, not only GPT-2-style models;
# gradients are not needed here, so no autograd graph is recorded
with torch.no_grad():
    transformer_out = model.base_model(**model_input)

last_hidden_state = transformer_out.last_hidden_state # (batch, sequence, features)

print(f'Features shape: {last_hidden_state.shape}')

In [None]:
# predict logits with the full model (including the LM head)
# gradients are not needed here, so no autograd graph is recorded
with torch.no_grad():
    model_out = model(**model_input)

logits = model_out.logits # (batch, sequence, tokens)

print(f'Logits shape: {logits.shape}')