# Simple pipeline

In [None]:
import torch
from transformers import pipeline

# Use the first CUDA GPU when one is present; otherwise fall back to the CPU
# (-1) so this cell also runs on machines without a GPU instead of failing on
# a hard-coded device index.
device = 0 if torch.cuda.is_available() else -1

text_generator = pipeline(model="gpt2", framework="pt", device=device)

# Sample a continuation of the prompt "The" (max_length=50, sampling enabled).
print(text_generator("The", max_length=50, do_sample=True))

# Simple loading and using a pre-trained model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# A single checkpoint name keeps the tokenizer and the model in sync.
MODEL_NAME = "gpt2"
# Keyword arguments forwarded to `model.generate`.
GENERATION_KWARGS = {"max_length": 50, "do_sample": True}

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    clean_up_tokenization_spaces=True,
)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Encode the prompt as PyTorch tensors so it can be unpacked into `generate`.
input_text = "The"
input_tokens = tokenizer(input_text, return_tensors="pt")

# Generate one sequence and turn its token ids back into plain text.
output_tokens = model.generate(**input_tokens, **GENERATION_KWARGS)
output_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

print(output_text)

# Evaluate a model on a dataset

In [None]:
import evaluate
from transformers import AutoTokenizer, AutoModelForCausalLM

# The same checkpoint is used for generation and as the perplexity scorer.
MODEL_NAME = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    clean_up_tokenization_spaces=True,
)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Step 1: sample a continuation of the prompt.
input_text = "The"
input_tokens = tokenizer(input_text, return_tensors="pt")
output_tokens = model.generate(
    **input_tokens,
    max_length=50,
    do_sample=True,
)
output_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

# Step 2: score the generated text with the `evaluate` perplexity metric.
# The call is left as the final bare expression so the notebook displays it.
evaluator = evaluate.load("perplexity")
evaluator.compute(model_id=MODEL_NAME, predictions=[output_text])


# Train a model with transformers' Trainer
Unvalidated: the code in this section has not been run end to end. For a verified walkthrough, follow https://huggingface.co/docs/transformers/en/training#evaluate or https://huggingface.co/learn/nlp-course/chapter7/6?fw=pt#initializing-a-new-model

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import evaluate

DATASET_NAME = "yelp_review_full"
TOKENIZER_NAME = "gpt2"

# Expected layout of the loaded object:
#   DatasetDict({
#       'train': Dataset({features: ['label', 'text'], ...}),
#       'test':  Dataset({features: ['label', 'text'], ...}),
#       ...
#   })
dataset = load_dataset(DATASET_NAME)

tokenizer = AutoTokenizer.from_pretrained(
    TOKENIZER_NAME,
    clean_up_tokenization_spaces=True,
)
# Reuse the end-of-sequence token for padding; padding is requested when the
# text is tokenized below.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize one batch of examples for use with `dataset.map(batched=True)`.

    Args:
        examples: Batch mapping whose "text" entry holds the raw strings.

    Returns:
        The tokenizer output for the batch, with every sequence padded or
        truncated to the tokenizer's maximum length.
    """
    # No `return_tensors` here: `map` writes its results to Arrow storage, so
    # building PyTorch tensors first is wasted work. Batching into tensors is
    # left to the data collator.
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Run the tokenizer over the dataset in batches (expected type: DatasetDict).
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
)

# mlm=False: no masked language modeling.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

# Perplexity metric loaded through the `evaluate` library.
metric = evaluate.load("perplexity")

def compute_metrics(eval_pred):
    """Compute token-level perplexity from the Trainer's evaluation output.

    The `evaluate` "perplexity" metric scores raw text strings, but the
    Trainer hands this callback model logits and label ids, so perplexity is
    derived here directly from those arrays instead.

    Args:
        eval_pred: Pair `(predictions, labels)`. `predictions` holds logits of
            shape (batch, sequence, vocab), or a tuple whose first item is
            those logits; `labels` holds token ids of shape (batch, sequence)
            with -100 marking positions to ignore.

    Returns:
        `{"perplexity": value}`, where value is exp(mean negative
        log-likelihood) over all non-ignored target tokens, or NaN when there
        is no token to score.
    """
    # Local import: the surrounding cell does not import numpy itself.
    import numpy as np

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return extra outputs alongside the logits.
        predictions = predictions[0]

    # Causal LM: the logits at position t predict the token at position t + 1,
    # so drop the last logit and the first label to align them.
    logits = np.asarray(predictions)[:, :-1, :]
    targets = np.asarray(labels)[:, 1:]

    valid = targets != -100
    if not valid.any():
        return {"perplexity": float("nan")}

    # Numerically stable log-sum-exp over the vocabulary axis.
    peak = logits.max(axis=-1, keepdims=True)
    log_norm = peak[..., 0] + np.log(np.exp(logits - peak).sum(axis=-1))

    # Ignored positions get a dummy index 0; they are filtered out by `valid`.
    safe_targets = np.where(valid, targets, 0)[..., None]
    target_logits = np.take_along_axis(logits, safe_targets, axis=-1)[..., 0]

    mean_nll = (log_norm - target_logits)[valid].mean()
    return {"perplexity": float(np.exp(mean_nll))}



model = AutoModelForCausalLM.from_pretrained("gpt2")

# Output directory "test_trainer"; evaluation strategy "epoch".
training_args = TrainingArguments(
    output_dir="test_trainer",
    eval_strategy="epoch",
)

# Gather the Trainer inputs in one mapping, then build the Trainer from it.
trainer_kwargs = {
    "model": model,
    "tokenizer": tokenizer,
    "args": training_args,
    "data_collator": data_collator,
    "train_dataset": tokenized_dataset["train"],
    "eval_dataset": tokenized_dataset["test"],
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

