In [12]:
import os
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import pipeline
from datasets import load_dataset
from dataclasses import dataclass

import warnings
warnings.filterwarnings("ignore")

In [2]:
@dataclass
class GPT2Config:
    model_name: str = "gpt2"
    input_file: str = "/kaggle/input/the-bards-best-a-character-modeling-dataset/train.csv"
    block_size: int = 128
    batch_size: int = 32
    num_epochs: int = 5000
    save_steps: int = 1000
    logging_steps: int = 500
    output_dir: str = "./shakespeare-gpt2"
    logging_dir: str = "./logs"
    fp16: bool = torch.cuda.is_available()

In [3]:
config = GPT2Config()

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained(config.model_name)
tokenizer.pad_token = tokenizer.eos_token  

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [5]:
dataset = load_dataset("csv", data_files={"train": config.input_file})

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
def tokenize(data):
    tokens = tokenizer(
        data["text"],
        truncation=True,
        padding="max_length",
        max_length=config.block_size,
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_dataset = dataset.map(tokenize, remove_columns=["text"])

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [7]:
model = GPT2LMHeadModel.from_pretrained(config.model_name)
model.config.pad_token_id = model.config.eos_token_id 

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [8]:
training_args = TrainingArguments(
    output_dir=config.output_dir,
    overwrite_output_dir=True,
    num_train_epochs=config.num_epochs,
    per_device_train_batch_size=config.batch_size,
    save_steps=config.save_steps,
    save_total_limit=2,
    logging_dir=config.logging_dir,
    logging_steps=config.logging_steps,
    prediction_loss_only=True,
    report_to="none", 
    fp16=config.fp16,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=collator,
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,0.1075
1000,0.0024
1500,0.002
2000,0.0006
2500,0.0002
3000,0.0004
3500,0.0002
4000,0.0003
4500,0.0002
5000,0.0


TrainOutput(global_step=5000, training_loss=0.011380765021778642, metrics={'train_runtime': 317.4139, 'train_samples_per_second': 15.752, 'train_steps_per_second': 15.752, 'total_flos': 326615040000000.0, 'train_loss': 0.011380765021778642, 'epoch': 5000.0})

In [9]:
model.save_pretrained(config.output_dir)
tokenizer.save_pretrained(config.output_dir)

print(f"Model saved to: {config.output_dir}")

Model saved to: ./shakespeare-gpt2


In [16]:
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

print(generator("The King has returned ", max_length=150)[0]["generated_text"])

Device set to use cuda:0


The King has returned ****************

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?


