In [1]:
import numpy as np
import torch
import torchvision
import torch.nn as nn
import tqdm

In [2]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-05-16 03:26:57--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-05-16 03:26:57 (21.2 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [3]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [4]:
import sentencepiece as spm

input_file = 'input.txt'
model_prefix = 'tokenizer'
vocab_size = 256

# Train model
spm.SentencePieceTrainer.train(f'--input={input_file} --model_prefix={model_prefix} --vocab_size={vocab_size} --user_defined_symbols=<n>')

In [5]:
sp = spm.SentencePieceProcessor()
sp.load('tokenizer.model')

special_token = '<n>' #Retain new lines
text_new = text.replace('\n', special_token)

encoded_text = sp.encode_as_ids(text_new)

In [None]:
print(sp.decode_ids(encoded_text).replace(special_token, '\n')[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [6]:
context_length = 64
train_test_percent = .9
batch_size = 32
train_data = encoded_text[:int(train_test_percent * len(encoded_text))]
test_data = encoded_text[int(train_test_percent * len(encoded_text)):]
eval_steps = 200

torch.manual_seed(20810581374109)

d_model = 256
dropout_percent = 0.2
num_heads = 8
num_layers = 6
device = 'cuda' if torch.cuda.is_available() else 'cpu'
learning_rate = 1e-4
training_steps = 5000


In [7]:


class FeedForward(nn.Module):
  #Feedfoward consisting of two linear with a relu inbetween as described in the paper

  def __init__(self):
    super().__init__()
    self.ffn = nn.Sequential(
        #We have bias in these following the paper
        nn.Linear(d_model, 4*d_model, bias=True), #4x as used in the paper,
        nn.ReLU(),
        nn.Linear(4*d_model, d_model, bias=True),
        nn.Dropout(dropout_percent)
    )

  def forward(self, x):
    return self.ffn(x)

class ScaledDotProductAttention(nn.Module):

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(d_model, head_size, bias=False)
    self.query = nn.Linear(d_model, head_size, bias=False)
    self.value = nn.Linear(d_model, head_size, bias=False)
    self.dropout = nn.Dropout(dropout_percent)

  def forward(self, x):
    q = self.query(x)
    k = self.key(x)
    v = self.value(x)

    temp = q @ k.transpose(-2,-1) * (d_model ** -.5)

    mask =  torch.tril(torch.ones(context_length, context_length)).to(device)
    #We need to mask since it's decoder only transformer and we don't have future context
    temp = temp.masked_fill(mask[:x.shape[1], :x.shape[1]] == 0, float('-inf'))
    temp = torch.nn.functional.softmax(temp, dim = -1)
    temp = self.dropout(temp)
    out = temp @ v
    return out

class MultiheadedAttention(nn.Module):

  #MHA based off the paper

  def __init__(self):
    super().__init__()
    head_size = d_model // num_heads
    self.w_O = nn.Linear(d_model, d_model, bias=False)
    self.dropout = nn.Dropout(dropout_percent)
    self.multihead = nn.ModuleList([ScaledDotProductAttention(head_size) for _ in range(num_heads)])
  def forward(self, x):
    tensors = [attention(x) for attention in self.multihead]

    out = self.dropout(self.w_O(torch.cat(tensors, dim = -1)))
    return out


class TransformerLayer(nn.Module):
  #Decoder Only Transformer Layer
  #Will take input from the embedding throgh a SA layer and then a FF layer

  def __init__(self):
    super().__init__()
    #Multiheaded attention

    self.mha = MultiheadedAttention()
    self.ff = FeedForward()
    self.sa_norm = nn.LayerNorm(d_model)
    self.ff_norm = nn.LayerNorm(d_model)

  def forward(self, x):
    out = x + self.mha(self.sa_norm(x))
    out = out + self.ff(self.ff_norm(out))
    return out

class Transformer(nn.Module):
  #Transformer
  #Will handle the input embeddings and creating the transformer layers

  def __init__(self):
    super().__init__()
    self.tok = nn.Embedding(vocab_size, d_model)
    self.pos = nn.Embedding(context_length, d_model)
    self.final_linear = nn.Linear(d_model, vocab_size) #Final linear layer that converts to output tokens
    self.final_norm = nn.LayerNorm(d_model) #Final normalization before final_linear gets called
    self.layers = nn.Sequential(*[TransformerLayer() for _ in range(num_layers)]) #unpacks the list

  def forward(self, input, targets=None):
    tok = self.tok(input)
    pos = self.pos(torch.arange(input.shape[1], device=device))
    new_input = tok + pos
    new_input = self.layers(new_input)
    new_input = self.final_norm(new_input)
    new_input = self.final_linear(new_input)
    if (targets is not None):
      b,t,c = new_input.shape
      logits = new_input.view(b*t, c)
      targets = targets.view(b*t)
      loss = nn.functional.cross_entropy(logits, targets)
    else:
      loss = None

    return new_input, loss

@torch.no_grad()
def getloss():
  model.eval()
  out = {}

  for split in [True, False]:
    losses = []
    for _ in range(200):
      eval_x, eval_y = create_batch(training=split)
      _, loss = model(eval_x, eval_y)
      losses.append(loss.item())
    if (split):
      arr = np.array(losses)
      out['train'] = np.mean(arr)
    else:
      out['val'] = np.mean(np.array(losses))
  model.train()
  return out

def create_batch(training=True):
  data = train_data if training else train_data
  #Generate from index 0 to len(data)-context
  #as we will take index:index+context_length+1 sized data
  ix = torch.randint(len(data) - context_length, (batch_size,))
  x = []
  y = []
  for i in ix:
    x.append(data[i:i+context_length])
    y.append(data[i+1:i+context_length+1])
  x = torch.tensor(x)
  y = torch.tensor(y)
  x = x.to(device)
  y = y.to(device)
  return x,y


model = Transformer()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(training_steps):
  if (iter % 100 == 0) or iter == training_steps - 1:
    losses = getloss()
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    # sample a batch of data
  x, y = create_batch(training=True)

  # evaluate the loss
  logits, loss = model(x, y)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()


#context = torch.zeros((1, 1), dtype=torch.long, device=device)
#print(sp.decode_ids((m.generate(context, max_new_tokens=100)[0].tolist())).replace(special_token, '\n'))




4.88064 M parameters
step 0: train loss 5.7033, val loss 5.7028
step 100: train loss 4.2330, val loss 4.2375
step 200: train loss 3.8143, val loss 3.8171
step 300: train loss 3.6535, val loss 3.6512
step 400: train loss 3.5487, val loss 3.5458
step 500: train loss 3.4884, val loss 3.4882
step 600: train loss 3.4372, val loss 3.4248
step 700: train loss 3.3790, val loss 3.3874
step 800: train loss 3.3441, val loss 3.3455
step 900: train loss 3.3081, val loss 3.3099
step 1000: train loss 3.2757, val loss 3.2796
step 1100: train loss 3.2432, val loss 3.2440
step 1200: train loss 3.2210, val loss 3.2181
step 1300: train loss 3.1807, val loss 3.1863
step 1400: train loss 3.1498, val loss 3.1565
step 1500: train loss 3.1206, val loss 3.1172
step 1600: train loss 3.0860, val loss 3.0852
step 1700: train loss 3.0476, val loss 3.0540
step 1800: train loss 3.0113, val loss 3.0191
step 1900: train loss 2.9885, val loss 2.9826
step 2000: train loss 2.9492, val loss 2.9520
step 2100: train loss 2.9

In [8]:
def generate_text(model, context, max_new_tokens):
  model.eval()
  for i in range(max_new_tokens):
    contextge = context[:, -context_length:]
    logits, loss = model(contextge, None)
    prob_dist = nn.functional.softmax(logits, dim=-1)
    sample = torch.multinomial(prob_dist[:,-1,:], num_samples=1)
    context = torch.cat((context, sample), dim=1)
  model.train()
  return context

context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(sp.decode_ids(generate_text(m, context, max_new_tokens=500)[0].tolist()).replace(special_token, '\n'))


 ⁇ ET:
Where, my lord: if you do, no lose:
And I take ofk or emselved bushay. usive thy baps,
And kind and my came-forrow and me seen speedful
Thou may deed
Of what had to speak with me: in our need,
And high we hear, I'll tail we, speak not whiss,
FelfooH's good nature's stale.

LUCIO:
My Lordio, why, then we speedied,
There violer wars in vircaster us;
And to an another.
Why are thee to begue, the wins stander? O,
Have graced hot lives you,
So is so clix with queen times hus with me kill
As conduaty'd greatends let?
We speak joyal 'tign aught joy'd,
Yet, with meet he shall sade for your ends:
He was I will so tears that do has it well?
If is it set that were no out?

BENVOLIO:
Salk death, is Camillio: thou word, he shall have is,
Should foolve by his hence against Sent.

Bengger nelds! Chen too, this sent them it son,
Nake tellign of this


In [49]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)

len(generate_text(m, context, max_new_tokens=300)[0].tolist())

65