In [1]:
!pip install -q transformers torch

In [2]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel, \
    GPT2Tokenizer

In [3]:
def load_dataset(train_path, tokenizer, block_size=128):
    """Build the training dataset and the matching batch collator.

    Args:
        train_path: Path of the text file, forwarded to ``TextDataset`` as
            ``file_path``.
        tokenizer: Tokenizer handed to both the dataset and the collator.
        block_size: Forwarded to ``TextDataset`` as ``block_size``. Defaults
            to 128, the value that was previously hard-coded.

    Returns:
        A ``(train_dataset, data_collator)`` tuple.
    """
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=block_size,
    )

    # mlm=False turns off masked-language-modeling in the collator.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    return train_dataset, data_collator

In [4]:
def train(model, dataset, data_collator, output_dir="./gpt2-viking",
          num_train_epochs=5, per_device_train_batch_size=4):
    """Run a ``Trainer`` over ``dataset`` to train ``model``.

    Args:
        model: Model passed to ``Trainer``.
        dataset: Passed to ``Trainer`` as ``train_dataset``.
        data_collator: Collator passed to ``Trainer``.
        output_dir: Passed to ``TrainingArguments`` as ``output_dir``.
            Defaults to ``"./gpt2-viking"``, the previously hard-coded value.
        num_train_epochs: Passed to ``TrainingArguments``. Defaults to 5.
        per_device_train_batch_size: Passed to ``TrainingArguments``.
            Defaults to 4.

    Returns:
        None. The model object is trained by ``trainer.train()``.
    """
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        # Checkpointing settings are kept fixed: save_steps / save_total_limit.
        save_steps=10_000,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()

In [5]:
# Tokenizer and LM-head model are both loaded from the same pretrained
# checkpoint name.
base_checkpoint = 'distilgpt2'
tokenizer = GPT2Tokenizer.from_pretrained(base_checkpoint)
model = GPT2LMHeadModel.from_pretrained(base_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Text file used for training; build the dataset and its collator from it.
train_path = '/content/ragnar_dialogues.txt'
train_dataset, data_collator = load_dataset(
    train_path=train_path,
    tokenizer=tokenizer,
)



In [7]:
!pip install -q accelerate -U

In [8]:
# Run training on the prepared dataset with the loaded model.
train(
    model=model,
    dataset=train_dataset,
    data_collator=data_collator,
)

Step,Training Loss


In [9]:
# Write the model and the tokenizer into the same directory. The tokenizer
# call stays last so its return value is what the cell displays.
save_dir = "./gpt2-viking"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./gpt2-viking/tokenizer_config.json',
 './gpt2-viking/special_tokens_map.json',
 './gpt2-viking/vocab.json',
 './gpt2-viking/merges.txt',
 './gpt2-viking/added_tokens.json')

In [10]:
# Recursively pack everything under gpt2-viking/ into gpt2-viking.zip.
# NOTE(review): this also picks up the runs/ event logs written into that
# directory during training -- confirm they are wanted in the archive.
!zip -r gpt2-viking.zip gpt2-viking/

  adding: gpt2-viking/ (stored 0%)
  adding: gpt2-viking/merges.txt (deflated 53%)
  adding: gpt2-viking/special_tokens_map.json (deflated 74%)
  adding: gpt2-viking/config.json (deflated 51%)
  adding: gpt2-viking/vocab.json (deflated 68%)
  adding: gpt2-viking/generation_config.json (deflated 24%)
  adding: gpt2-viking/runs/ (stored 0%)
  adding: gpt2-viking/runs/Feb29_20-10-13_9a0f451559d5/ (stored 0%)
  adding: gpt2-viking/runs/Feb29_20-10-13_9a0f451559d5/events.out.tfevents.1709237414.9a0f451559d5.1323.0 (deflated 59%)
  adding: gpt2-viking/tokenizer_config.json (deflated 54%)
  adding: gpt2-viking/model.safetensors (deflated 7%)
