## GPT2 Model Tuning

### Get and pre-process dataset

In [None]:
from datasets import load_dataset

# Hub repository id of the Reuters articles corpus used throughout the notebook.
DATASET_REPO = "shayharding/reuters-articles"

# Later cells index the loaded object by split name ('train', 'validation',
# 'test').
dataset = load_dataset(DATASET_REPO)
dataset  # last expression of the cell: rendered as the cell output

In [None]:
def create_full_article_col(example):
    """Build the single training text for one article.

    Args:
        example: Mapping with at least the keys ``'title'`` and ``'body'``.

    Returns:
        A one-key dict ``{'full_article': ...}`` whose value is the title and
        the body joined as ``TITLE:<title>``, a blank line, ``BODY:<body>``.
    """
    title = example['title']
    body = example['body']
    full_article = f"TITLE:{title}\n\nBODY:{body}"
    return {'full_article': full_article}

# Add the 'full_article' column to the dataset.
dataset = dataset.map(create_full_article_col)

# Sanity check: show the dataset summary, then the combined text of the first
# training article.
print(dataset)
first_train_row = dataset['train'][0]
print(first_train_row['full_article'])

### Get and create the model

In [None]:
from transformers import AutoTokenizer

# Hub repository id of the tokenizer used for the Reuters articles.
TOKENIZER_REPO = "shayharding/gpt2-reuters-tokenizer"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO)

In [None]:
# Maximum number of tokens kept per article; also passed to the model config.
CONTEXT_LENGTH = 512


def tokenize(element):
    """Tokenize the ``full_article`` column of a batch of examples.

    Every text is truncated to ``CONTEXT_LENGTH`` tokens. Overflowing tokens
    are not returned, so whatever lies beyond the limit is dropped.

    Args:
        element: Batch of examples exposing a ``"full_article"`` entry.

    Returns:
        The tokenizer's output for the batch.
    """
    articles = element["full_article"]
    encoding = tokenizer(
        articles,
        max_length=CONTEXT_LENGTH,
        truncation=True,
        return_overflowing_tokens=False,
    )
    return encoding


# Drop every original column so only the tokenizer output remains.
original_columns = dataset['train'].column_names
tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=original_columns,
)
tokenized_dataset

In [None]:
from transformers import GPT2LMHeadModel, AutoConfig, DataCollatorForLanguageModeling

# Start from the stock "gpt2" configuration, then resize it for this project's
# tokenizer and context window. Only the configuration is taken from "gpt2";
# the model below is built from the config, not from pretrained weights.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    # `n_positions` is the GPT2Config field that sizes the position-embedding
    # table. Without it the table keeps the stock "gpt2" length and the
    # CONTEXT_LENGTH limit used for tokenization never reaches the model.
    n_positions=CONTEXT_LENGTH,
    # Legacy field that is no longer a GPT2Config parameter; kept in sync so
    # the saved config does not advertise a different context length.
    n_ctx=CONTEXT_LENGTH,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

model = GPT2LMHeadModel(config)
# Report the total parameter count, in millions.
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

# Reuse the end-of-sequence token for padding so the collator can pad batches.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False selects causal (next-token) language modelling instead of masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

### Train the model

In [None]:
# Log in to the Hugging Face Hub from inside the notebook. The training cells
# below set push_to_hub=True, so this has to run before training starts.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from transformers import TrainingArguments, Trainer
import wandb

# Hub account and repository name of the model being trained. The generation
# cells further down rebuild the same "<user>/<model>" id from these two names.
HF_USER = "shayharding"
FT_MODEL = "gpt2-reuters-textgen"

training_args = TrainingArguments(
    # Output location: a local directory and the Hub repository.
    output_dir=f"./{FT_MODEL}",
    hub_model_id=f"{HF_USER}/{FT_MODEL}",
    push_to_hub=True,
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    # Batch size and precision.
    auto_find_batch_size=True,
    gradient_accumulation_steps=8,
    fp16=True,
    # Evaluation and logging cadence.
    eval_strategy="epoch",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
)

# Start a Weights & Biases run in a project named after the model.
wandb.init(project=FT_MODEL)

In [None]:
# Run training with the Trainer configured in the previous cell
# (2 epochs, evaluation once per epoch, push_to_hub enabled).
trainer.train()

### Use the model

In [None]:
from transformers import pipeline

# Build a text-generation pipeline from the model id "<HF_USER>/<FT_MODEL>",
# i.e. the repository training was configured to push to.
pipe = pipeline(task="text-generation", model=f"{HF_USER}/{FT_MODEL}")

In [None]:
# Pick one article from the test split (index 2) to prompt the model with.
test_split = dataset['test']
sample = test_split[2]
sample

In [None]:
# Prompt in the training layout up to and including the body marker, then
# generate up to 128 new tokens after it.
title = sample['title']
prompt = f"TITLE:{title}\n\nBODY:"
pipe(
    prompt,
    max_new_tokens=128,
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
# Same generation settings, but the prompt stops right after the title: no
# blank line and no body marker are supplied.
title = sample['title']
prompt = f"TITLE:{title}"
pipe(
    prompt,
    max_new_tokens=128,
    pad_token_id=tokenizer.eos_token_id,
)

### Keep training the model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer
import wandb

# This cell continues training from the model and tokenizer stored on the Hub.
# It still needs `tokenized_dataset` from the tokenization cell above.
HF_USER = "shayharding"
FT_MODEL = "gpt2-reuters-textgen"

model = AutoModelForCausalLM.from_pretrained(HF_USER + "/" + FT_MODEL)
tokenizer = AutoTokenizer.from_pretrained(HF_USER + "/" + FT_MODEL)

# Build the collator from the tokenizer that was just reloaded. Otherwise this
# cell would need the model-creation cell to have defined `data_collator`, and
# would pad with a tokenizer object that is no longer the one in use.
if tokenizer.pad_token is None:
    # Same convention as the model-creation cell: pad with end-of-sequence.
    tokenizer.pad_token = tokenizer.eos_token
# mlm=False selects causal (next-token) language modelling instead of masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Same hyper-parameters as the first training run.
training_args = TrainingArguments(
    output_dir="./" + FT_MODEL,
    hub_model_id=HF_USER + "/" + FT_MODEL,
    learning_rate=5e-4,
    num_train_epochs=2,
    gradient_accumulation_steps=8,
    weight_decay=0.01,
    auto_find_batch_size=True,
    fp16=True,
    eval_strategy="epoch",
    lr_scheduler_type="cosine",
    push_to_hub=True,
    logging_steps=10
)

trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"]
)

# Start a Weights & Biases run in a project named after the model.
wandb.init(project=FT_MODEL)

In [None]:
# Train the model reloaded from the Hub for the epochs configured in the
# previous cell.
trainer.train()