# Introduction

This notebook is designed to test the hypothesis that the order of data can significantly impact the speed at which a neural network training converges. 

We will be using a simple task of adding integers for this experiment. This task is chosen as it will take less time to train the model.

We will experiment with various data orderings. These include multiple random orderings, and datapoints ranked by GPT 3.5 with respect to the perceived difficulty level of the datapoint. 

In addition, we will also explore a scenario where we construct a hypothetical skill tree required to achieve a low loss on the dataset. The datapoints will then be sorted according to a topological sort of the skills involved. 

Let's begin by importing the necessary libraries and loading the GPT2 model.

In [6]:
from transformers import GPT2Model
from torchtyping import TensorType
import torch
from transformers import GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch import nn, optim


model = GPT2Model.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Example usage:
input_tensor = torch.randint(0, 1000, (1, 10))  # Random tensor for testing. Dimensions represent [batch_size, sequence_length]
output = model(input_tensor)  # Output tensor from the GPT2 model. Dimensions represent [batch_size, sequence_length, hidden_state]
token_ids = input_tensor.tolist()
# Decode the token ids to tokens using batch decode
decoded_output = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
print(decoded_output)

[' gamears su 0 years saithper��']


In [11]:

# Define a custom dataset for integer addition
class AdditionDataset(Dataset):
    def __init__(self, num_samples: int, max_integer: int):
        self.num_samples = num_samples
        self.max_integer = max_integer

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        a = torch.randint(0, self.max_integer, (1,))
        b = torch.randint(0, self.max_integer, (1,))
        sum_ab = a + b
        return a, b, sum_ab

# Create a dataloader
num_samples = 10000
max_integer = 1000
batch_size = 2
addition_dataset = AdditionDataset(num_samples, max_integer)
addition_dataloader = DataLoader(addition_dataset, batch_size=batch_size, shuffle=True)
# Test the dataloader
data_iter = iter(addition_dataloader)
data = next(data_iter)
print(f"Input a: {data[0]}")
print(f"Input b: {data[1]}")
print(f"Sum: {data[2]}")


Input a: tensor([[321],
        [571]])
Input b: tensor([[654],
        [429]])
Sum: tensor([[ 975],
        [1000]])


In [24]:
model

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [31]:
model.wte.weight.shape

torch.Size([50257, 768])

In [52]:
vocab = tokenizer.get_vocab()
atomic_tokens = [token for token in vocab if token.isdigit()]
print(f"Atomic tokens: {atomic_tokens}")


Atomic tokens: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '²', '³', '¹', '00', '01', '000', '10', '12', '50', '19', '11', '20', '201', '30', '15', '14', '16', '13', '25', '18', '17', '24', '80', '40', '22', '60', '23', '29', '27', '26', '28', '99', '33', '70', '200', '45', '35', '0000', '64', '75', '21', '38', '44', '36', '32', '39', '34', '05', '37', '48', '66', '55', '47', '08', '49', '09', '65', '07', '02', '04', '100', '03', '68', '31', '67', '59', '06', '77', '58', '69', '88', '46', '57', '43', '42', '78', '79', '90', '95', '41', '56', '54', '500', '98', '76', '52', '53', '51', '86', '74', '89', '2015', '72', '73', '96', '71', '2014', '63', '62', '2016', '85', '61', '2017', '97', '84', '87', '94', '92', '83', '93', '300', '2013', '91', '82', '81', '2012', '400', '800', '2018', '600', '00000000', '001', '150', '101', '250', '2011', '700', '123', '120', '2010', '2009', '000000', '2000', '003', '110', '2008', '125', '256', '429', '2007', '128', '1000', '900', '130', '2006', '

In [50]:
input_tensor = torch.tensor([[10,12,13],[1,4,2]])
print(model.wte.weight.T.shape) # torch.Size([768, 50257])
output = model(input_tensor)
print(output.last_hidden_state[:,-1,:].shape)  # torch.Size([2, 768])
result = model.wte.weight @ output.last_hidden_state[:,-1,:].T
print(result.shape)


torch.Size([768, 50257])
torch.Size([2, 768])
torch.Size([50257, 2])


In [53]:
# Define the model
class SumPredictor(nn.Module):
    def __init__(self, gpt2_model):
        super().__init__()
        self.gpt2_model = gpt2_model

    def forward(self, input_tensor: TensorType["batch", "sequence"]):
        output = self.gpt2_model(input_tensor)
        return self.gpt2_model.wte.weight @ output.last_hidden_state[:, -1, :].T


# Actually let's use Llama for this task because of the tokenization 

## Define the loss function
#loss_fn = nn.MSELoss()
#
## Define the optimizer
#optimizer = optim.Adam(model.parameters())
#
## Define the training function
#def train(model, dataloader, loss_fn, optimizer, num_epochs):
#    model.train()
#    losses = []
#    for epoch in tqdm(range(num_epochs)):
#        epoch_loss = 0
#        for a, b, sum_ab in dataloader:
#            optimizer.zero_grad()
#            input_tensor = torch.cat([a, b], dim=1)
#            output = model(input_tensor)
#            loss = loss_fn(output, sum_ab)
#            loss.backward()
#            optimizer.step()
#            epoch_loss += loss.item()
#        losses.append(epoch_loss / len(dataloader))
#    return losses
#
## Train the model
#num_epochs = 10
#losses = train(model, addition_dataloader, loss_fn, optimizer, num_epochs)
#
## Plot the loss curve
#plt.plot(losses)
#plt.xlabel('Epoch')
#plt.ylabel('Loss')
#plt.title('Loss curve')
#plt.show()
#

  0%|          | 0/10 [00:00<?, ?it/s]


AttributeError: 'BaseModelOutputWithPastAndCrossAttentions' object has no attribute 'size'