## Import necessary libraries

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

## Load a  model and tokenizer

In [2]:
model_name = "distilgpt2"  #  Small gpt-2 model

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("\nTokenizer vocabulary size:", len(tokenizer))

# Load model
model = AutoModelForCausalLM.from_pretrained(model_name)
print("\nModel architecture:")
print(model.__class__.__name__)


Tokenizer vocabulary size: 50257

Model architecture:
GPT2LMHeadModel


Tokenization Example

In [3]:
# Example text
text = "Hello, I am learning NLP!"

# Tokenize the text
tokens = tokenizer(text, return_tensors="pt")
print("\nTokenized output:")
print(tokens)

# Decode tokens back to text
decoded_text = tokenizer.decode(tokens["input_ids"][0])
print("\nDecoded text:", decoded_text)



Tokenized output:
{'input_ids': tensor([[15496,    11,   314,   716,  4673,   399, 19930,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

Decoded text: Hello, I am learning NLP!


Look at individual tokens

In [4]:
token_ids = tokens["input_ids"][0].tolist()
print("\nIndividual tokens:")
for token_id in token_ids:
    print(f"ID: {token_id}, Token: {tokenizer.decode([token_id])}")


Individual tokens:
ID: 15496, Token: Hello
ID: 11, Token: ,
ID: 314, Token:  I
ID: 716, Token:  am
ID: 4673, Token:  learning
ID: 399, Token:  N
ID: 19930, Token: LP
ID: 0, Token: !


Forward pass and generation


In [5]:
# Forward pass
with torch.no_grad():
    outputs = model(**tokens)
    print("\nOutput logits shape:", outputs.logits.shape)


Output logits shape: torch.Size([1, 8, 50257])


Text generation input

In [6]:
input_text = "The future of AI is"
inputs = tokenizer(input_text, return_tensors="pt")

generated_ids = model.generate(
    inputs["input_ids"],
    max_length=50,
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [7]:
# Greedy decoding
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(f"\nGenerated text:\n{generated_text}")




Generated text:
The future of AI is not yet clear.











































We will dive deep into more features in huggingface (training) in the following weeks!