<a href="https://colab.research.google.com/github/hackdavid/LLM-model-using-torch-from-scratch-Build-GPT-/blob/main/LLM_with_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
### Building a LLM model from scratch with bigram and transformer

**Building a GPT**

In [2]:
# get dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-11-13 19:22:51--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2023-11-13 19:22:51 (5.91 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [4]:
dataset_file_path = '/content/input.txt'

In [5]:
p = torch.randint(1003854 - 8, (4,))
p

tensor([633090, 146108, 498870, 947033])

In [6]:
class Dataset:
  '''
  this class contains each preproessing methods that will be used for
  nornal transformation like encoding ,decoding ,load_dataset etc
  and also hyperparamter also

  '''
  def __init__(self,file_path,batch_size=4,block_size=8):
    self.dataset_file_path = file_path
    self.vocab_size = None
    self.batch_size = batch_size
    self.block_size = block_size
    self.encoder_mapping = {}
    self.decoder_mapping = {}
  def load_dataset(self):
    with open(self.dataset_file_path,'r',encoding='utf-8') as f:
      text = f.read()
    # prepare the vocab_size
    chars_vocab = list(set(text))
    self.vocab_size = len(chars_vocab)
    self.encoder_mapping = {ch:i for i,ch in enumerate(chars_vocab)}
    self.decoder_mapping = {i:ch for i,ch in enumerate(chars_vocab)}
    # now convert the text to tensor using torch library
    data = self.encode(text)
    data = torch.tensor(data, dtype=torch.long)
    print(f'total vocab_size : {self.vocab_size} \n vocab set : {chars_vocab}')
    return data,self.vocab_size

  def split_dataset(self,data=[],ratio=0.9):
    # 90 % for traning and 10 % for valiation
    n = int(ratio*len(data))
    train = data[:n]
    valid = data[n+1:]
    return train,valid

  def get_data_batch(self,data):
    random_index = torch.randint(0,len(data)- self.block_size,(self.batch_size,))
    x = torch.stack([data[i:i+self.block_size] for i in random_index])
    y = torch.stack([data[i+1:i+self.block_size+1] for i in random_index])
    return x,y
  def encode(self,text):
    return [self.encoder_mapping.get(ch) for ch in text]
  def decode(self,text):
    return [self.decoder_mapping.get(ch) for ch in text]





In [7]:
dataset = Dataset(dataset_file_path)
data,vocab_size = dataset.load_dataset()

total vocab_size : 65 
 vocab set : ['B', '.', 'c', 'l', 'E', 'G', '!', 'j', 't', 'f', 'L', '?', ';', 's', 'O', 'Z', 'Q', 'J', 'C', 'q', ' ', 'U', 'T', '$', 'x', 'g', 'd', ',', 'I', 'M', 'D', 'u', 'e', 'Y', '3', 'y', ':', 'v', 'W', 'H', 'w', '\n', 'k', 'R', 'K', 'X', 'b', 'm', 'P', 'a', 'A', 'V', 'N', 'z', 'i', 'h', 'S', '-', 'n', "'", 'r', 'F', 'p', 'o', '&']


In [8]:
train_data,valid = dataset.split_dataset(data)

In [9]:
train_data

tensor([61, 54, 60,  ..., 32, 60, 32])

In [10]:
x_train,y_train = dataset.get_data_batch(train_data)
x_train,y_train


(tensor([[26, 32, 60, 13, 20, 63,  9, 20],
         [61, 43, 28, 50, 43, 20, 22, 39],
         [43, 28, 50, 43, 20, 10, 50, 21],
         [ 2,  3, 63,  2, 42, 27, 20,  8]]),
 tensor([[32, 60, 13, 20, 63,  9, 20, 54],
         [43, 28, 50, 43, 20, 22, 39, 14],
         [28, 50, 43, 20, 10, 50, 21, 43],
         [ 3, 63,  2, 42, 27, 20,  8, 55]]))

In [11]:
# just for visualization purpose otherwiase ignore this code
for b in range(4): # batch dimension
    for t in range(8): # time dimension
        context = x_train[b, :t+1]
        target = y_train[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

when input is [26] the target: 32
when input is [26, 32] the target: 60
when input is [26, 32, 60] the target: 13
when input is [26, 32, 60, 13] the target: 20
when input is [26, 32, 60, 13, 20] the target: 63
when input is [26, 32, 60, 13, 20, 63] the target: 9
when input is [26, 32, 60, 13, 20, 63, 9] the target: 20
when input is [26, 32, 60, 13, 20, 63, 9, 20] the target: 54
when input is [61] the target: 43
when input is [61, 43] the target: 28
when input is [61, 43, 28] the target: 50
when input is [61, 43, 28, 50] the target: 43
when input is [61, 43, 28, 50, 43] the target: 20
when input is [61, 43, 28, 50, 43, 20] the target: 22
when input is [61, 43, 28, 50, 43, 20, 22] the target: 39
when input is [61, 43, 28, 50, 43, 20, 22, 39] the target: 14
when input is [43] the target: 28
when input is [43, 28] the target: 50
when input is [43, 28, 50] the target: 43
when input is [43, 28, 50, 43] the target: 20
when input is [43, 28, 50, 43, 20] the target: 10
when input is [43, 28, 50

In [12]:
# till now our data_preprocessing is done

**Lets Build the bigrammodel so we can train your model with our data**

In [13]:
class BigramModel(nn.Module):
  def __init__(self,vocab_size):
    super().__init__()
    self.vocab_size = vocab_size
    # there is formula to calculate the embedibg table which is mention on torch.nn.embeddig documentation so please go through this to understand the maths
    self.token_embedding_table = nn.Embedding(self.vocab_size,self.vocab_size)

  def forward(self,x_train,y_train=None):
    # shape of x_train = y_train = 4, 8
    prediction = self.token_embedding_table(x_train)
    # becuase at starting there may be not context so not previous information for first token
    if y_train is None:
      loss = None
    else:
      # prediction is a table having 3-dimention i.e (batch_size,block_size,vocab_size)
      # but the y_train shape is 4,8 which is not same as prediction so we have to
      # change the shape so we can calculate the loss using cross_entropy
      # view method is used to change the shape having same data ,it takes paramter as dimentation
      (d_batch_size,d_block_size,d_vocab_size) = prediction.shape
      prediction = prediction.view(d_batch_size*d_block_size,d_vocab_size) # new dimention will be 32,65 in current example
      y_train = y_train.view(d_batch_size*d_block_size)                     # new dimention will be 32 in current example
      loss = F.cross_entropy(prediction,y_train)
    return prediction,loss

  def generate(self,token_size=100,input=None):
    '''
    this function will generate the new token based on learning
    Note: first we will feed the random data at starting contest and generate the new token and concat to torch
    '''
    if input is None:
      input = torch.zeros((1,1),dtype=torch.long)
    for _ in range(token_size):
      prediction,loss = self(input)
      # focus only on the last time step
      prediction = prediction[:, -1, :] # becomes (d_batch_size, d_vocab_size)
      # apply softmax to get probabilities
      probs = F.softmax(prediction, dim=-1) # (d_batch_size, d_vocab_size)
      # sample from the distribution
      next_prediction = torch.multinomial(probs, num_samples=1) # (d_batch_size, 1)
      # append sampled index to the running sequence
      input = torch.cat((input, next_prediction), dim=1) # (d_batch_size, d_block_size+1)
    new_tokens = input[0]
    return new_tokens






In [14]:
model = BigramModel(vocab_size)

In [15]:
prediction,loss = model(x_train,y_train)

In [16]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [17]:
train_data

tensor([61, 54, 60,  ..., 32, 60, 32])

In [18]:
def train(epoch=1000):
  for step in range(epoch):
    x_train,y_train = dataset.get_data_batch(train_data)
    prediction,loss = model(x_train,y_train)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    # print(f'Epoch - {step} , loss - {loss} ')
  print(f'After Epoch - {epoch} , loss - {loss} ')

In [19]:
train(epoch=10000)

After Epoch - 10000 , loss - 2.651421546936035 


In [20]:
tokens = model.generate()

In [21]:
tokens

tensor([ 0,  4,  4, 30, 54, 49, 60, 49, 26, 20,  0, 53, 33,  3, 20,  2, 32,  6,
        41, 29, 54, 31, 58, 59,  3, 20, 55, 25, 20, 47, 49, 37, 54, 13, 13,  8,
        55, 32,  3,  9, 63, 11, 41, 22, 34, 12, 20, 63, 31, 27, 20, 25, 32, 20,
        62, 54, 63, 31,  8, 41, 41, 30, 49,  3, 36, 20, 58,  8, 55, 49,  8, 32,
        49, 37, 32, 26, 20,  9,  9, 32, 60, 26, 12, 25,  3, 49, 47, 14, 36, 27,
        20, 47, 49, 58, 20,  9, 20, 13,  8, 20, 55])

In [22]:
# now decode this token
print(dataset.decode(tokens.tolist()))

['B', 'E', 'E', 'D', 'i', 'a', 'r', 'a', 'd', ' ', 'B', 'z', 'Y', 'l', ' ', 'c', 'e', '!', '\n', 'M', 'i', 'u', 'n', "'", 'l', ' ', 'h', 'g', ' ', 'm', 'a', 'v', 'i', 's', 's', 't', 'h', 'e', 'l', 'f', 'o', '?', '\n', 'T', '3', ';', ' ', 'o', 'u', ',', ' ', 'g', 'e', ' ', 'p', 'i', 'o', 'u', 't', '\n', '\n', 'D', 'a', 'l', ':', ' ', 'n', 't', 'h', 'a', 't', 'e', 'a', 'v', 'e', 'd', ' ', 'f', 'f', 'e', 'r', 'd', ';', 'g', 'l', 'a', 'm', 'O', ':', ',', ' ', 'm', 'a', 'n', ' ', 'f', ' ', 's', 't', ' ', 'h']
