In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from mingpt.model import GPT
from mingpt.utils import set_seed
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
# Fix the seed once, before any data or model is created, using minGPT's
# set_seed helper (presumably seeds the Python/NumPy/torch RNGs -- confirm in
# mingpt.utils).
set_seed(3407)

In [2]:
# Fetch the RedPajama 1T sample (second positional argument 'plain_text'),
# caching downloads under ./datasets, and keep only its 'train' entry.
dataset = load_dataset(
    "togethercomputer/RedPajama-Data-1T-Sample",
    'plain_text',
    cache_dir='datasets',
)['train']

In [3]:
# Custom dataset class for the Red Pajama dataset
class RedPajamaDataset(Dataset):
    """Next-token-prediction view over a collection of text records.

    Every item is one record of ``data`` tokenized with the pretrained GPT-2
    tokenizer, truncated to ``max_length`` tokens and split into an
    ``(inputs, targets)`` pair in which ``targets`` is ``inputs`` shifted left
    by one position.

    Args:
        data: Indexable collection supporting ``len()`` whose items expose the
            raw document under the ``'text'`` key.
        max_length: Maximum number of tokens kept per record; must be at
            least 2 so that one (input, target) position exists. Items have
            at most ``max_length - 1`` positions.

    Raises:
        ValueError: If ``max_length`` is smaller than 2.
    """

    def __init__(self, data, max_length=1024):
        if max_length < 2:
            raise ValueError(
                f"max_length must be >= 2 to build an (input, target) pair, got {max_length}"
            )
        self.data = data
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # GPT-2 has no dedicated padding token; reuse the end-of-text token
        # (id 50256 in the 'gpt2' vocabulary) instead of hard-coding the id.
        self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
        self.max_length = max_length
        self.vocab_size = self.tokenizer.vocab_size

    def __len__(self):
        """Return the number of records in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and return its shifted training pair.

        Returns:
            A tuple ``(inputs, targets)`` of ``torch.long`` tensors, both of
            shape ``[1, T - 1]`` where ``T`` is the (truncated) token count of
            the record, with ``2 <= T <= max_length``.
        """
        text = self.data[idx]['text']
        # `padding=True` was dropped on purpose: with a single sequence it pads
        # to the longest sequence in the "batch" (itself), i.e. it does nothing.
        tokens = self.tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt',
        )
        # An empty encoding may come back as a float (and possibly 1-D) tensor;
        # normalise to integer ids of shape [1, T] before slicing.
        tokens = tokens.to(torch.long).reshape(1, tokens.numel())

        # A record with fewer than two tokens would yield zero-length inputs
        # and targets. Extend it with the pad/end-of-text id so that at least
        # one position remains after the shift.
        missing = 2 - tokens.size(1)
        if missing > 0:
            filler = torch.full((1, missing), self.tokenizer.pad_token_id, dtype=torch.long)
            tokens = torch.cat([tokens, filler], dim=1)

        # GPT-2's tokenizer adds no [CLS]/BOS token by default, so this is a
        # plain one-step shift: position i of the input predicts token i + 1.
        target_tokens = tokens[:, 1:].clone()
        tokens = tokens[:, :-1]

        return tokens, target_tokens

# Wrap the loaded records and sanity-check the first (inputs, targets) pair
red_pajama_dataset = RedPajamaDataset(dataset)
x, y = red_pajama_dataset[0]
print(*(sample.shape for sample in (x, y)))

torch.Size([1, 1023]) torch.Size([1, 1023])


In [4]:
# Build a gpt-nano model whose vocabulary and context match the dataset
from mingpt.model import GPT

model_config = GPT.get_default_config()
# No checkpoint is supplied; the Trainer setup reads this field when it
# chooses max_iters.
model_config.checkpoint = None
# The dataset drops one position when shifting inputs against targets, so the
# longest sequence the model sees is max_length - 1 tokens.
model_config.block_size = red_pajama_dataset.max_length - 1
model_config.vocab_size = red_pajama_dataset.vocab_size
model_config.model_type = 'gpt-nano'
model = GPT(model_config)

number of parameters: 2.55M


In [9]:
# Configure the Trainer and bind it to the model and dataset
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.batch_size = 1
train_config.num_workers = 0
# The model is small enough that a somewhat larger learning rate is fine.
train_config.learning_rate = 5e-4
# Local change relative to stock minGPT (presumably the checkpoint-saving
# interval in iterations -- confirm in mingpt.trainer).
train_config.checkpoint_iters = 100
# Local change: when a checkpoint is configured, run 1000 iterations on top of
# the model's recorded iter_num; otherwise run 1000 iterations in total.
if model_config.checkpoint:
    train_config.max_iters = 1000 + model.iter_num
else:
    train_config.max_iters = 1000
trainer = Trainer(train_config, model, red_pajama_dataset)

running on device cpu


In [10]:
def batch_end_callback(trainer):
    """Print a progress line on every 100th training iteration.

    Args:
        trainer: Object exposing ``iter_num``, ``iter_dt`` (multiplied by 1000
            and reported with an "ms" suffix) and ``loss`` (an object with an
            ``item()`` method).
    """
    if trainer.iter_num % 100 != 0:
        return
    step_ms = trainer.iter_dt * 1000
    loss_value = trainer.loss.item()
    print(f"iter_dt {step_ms:.2f}ms; iter {trainer.iter_num}: train loss {loss_value:.5f}")
# Register the progress printer for the trainer's 'on_batch_end' event.
trainer.set_callback('on_batch_end', batch_end_callback)

# Start training; the captured output below ends in a KeyboardInterrupt, i.e.
# this run was stopped manually.
trainer.run()

iter_dt 0.00ms; iter 0: train loss 9.32173
iter_dt 370.10ms; iter 100: train loss 8.45391
iter_dt 332.86ms; iter 200: train loss 7.82597
iter_dt 746.28ms; iter 300: train loss 7.67600
iter_dt 740.00ms; iter 400: train loss 7.70842


KeyboardInterrupt: 