In [2]:
import tiktoken

class GPT_Tokenizer:
    """Thin wrapper around tiktoken's "gpt2" encoding.

    Besides returning its results, the wrapper remembers the most recent ones
    on the instance: ``encoding`` holds the value last returned by
    :meth:`encode` and ``decoding`` the value last returned by :meth:`decode`.
    """

    def __init__(self):
        # Look up the "gpt2" encoding once and reuse it for every call.
        self.enc = tiktoken.get_encoding("gpt2")

    def encode(self, text):
        """Encode ``text`` with the wrapped encoder, store and return the result."""
        token_ids = self.enc.encode(text)
        self.encoding = token_ids
        return token_ids

    def decode(self, token):
        """Decode ``token`` (the ids to turn back into text), store and return the result."""
        restored = self.enc.decode(token)
        self.decoding = restored
        return restored

GPT-2 tokenizer implementation using OpenAI's tiktoken library.
Provides encoding and decoding functionality compatible with OpenAI's GPT models.

In [4]:
# Read the whole corpus into memory as one string. The encoding is given
# explicitly so the result does not depend on the platform default.
with open("iLoveMerge.txt", "r", encoding="utf-8") as file:
   raw_data = file.read()

Read the text file containing training data for tokenization and model training

In [5]:
# Keep only the first 100 characters so the demo output below stays short.
text_data = raw_data[:100]
print(text_data)

**Welcome To The World of Free Plain Vanilla Electronic Texts**

**Etexts Readable By Both Humans an


### For demonstration, using the first 100 characters which will be encoded later

In [6]:
# Encode the 100-character sample into a list of token ids.
tokenizer = GPT_Tokenizer()
encode_text = tokenizer.encode(text_data)
print(encode_text)

[1174, 14618, 1675, 383, 2159, 286, 3232, 28847, 33897, 19508, 8255, 82, 1174, 198, 198, 1174, 36, 5239, 82, 4149, 540, 2750, 5747, 27411, 281]


### Tokenization complete: the text has been converted to token IDs using tiktoken

In [7]:
# Round trip: decoding the ids should give back the sample text.
decode_text = tokenizer.decode(encode_text)
print(decode_text)

**Welcome To The World of Free Plain Vanilla Electronic Texts**

**Etexts Readable By Both Humans an


### Decoding reproduces the exact original text that was encoded

## Prepares the dataset for LLM training using PyTorch's DataLoader.

### This structures the data into batches, enabling efficient GPU utilization
### and shuffling. For next-token prediction (causal LM), the target sequence
### is the input sequence shifted one token ahead: target[i] is the token that follows input[i].

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader

class Implement_Dataset(Dataset):
    """Sliding-window dataset of (input, target) token-id pairs.

    The text is tokenized once. A window of ``max_length`` tokens is then
    moved over the token ids in steps of ``stride``; each window is an input
    and the same window moved one token ahead is its target, so
    ``target[k]`` is the token that follows ``input[k]``.

    Args:
        text: Raw text to tokenize.
        max_length: Number of tokens in every input/target sequence (> 0).
        stride: Distance in tokens between the starts of consecutive
            windows (> 0). A stride smaller than ``max_length`` produces
            overlapping windows.
        tokenizer: Optional object with an ``encode(text)`` method returning
            a sequence of token ids. When omitted a ``GPT_Tokenizer`` is
            created, which is what earlier versions of this class always did.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.

    Note:
        If the text has ``max_length`` tokens or fewer, no full window with a
        target fits and the dataset is empty.
    """

    def __init__(self, text, max_length, stride, tokenizer=None):
        if max_length <= 0:
            raise ValueError("max_length must be a positive integer")
        if stride <= 0:
            raise ValueError("stride must be a positive integer")

        if tokenizer is None:
            tokenizer = GPT_Tokenizer()
        token_id = tokenizer.encode(text)

        self.input_id = []
        self.target_id = []

        # The last valid start is len - max_length - 1, because the target
        # window needs one extra token beyond the end of the input window.
        for start in range(0, len(token_id) - max_length, stride):
            end = start + max_length
            self.input_id.append(token_id[start:end])
            self.target_id.append(token_id[start + 1 : end + 1])

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_id)

    def __getitem__(self, idx):
        """Return window ``idx`` as a pair of tensors ``(input_ids, target_ids)``."""
        return torch.tensor(self.input_id[idx]), torch.tensor(self.target_id[idx])


This code implements a custom dataset class for our LLM training. Next, we’ll create a DataLoader to fetch the data in batches and iterate over it during training.

In [9]:
def Implement_DataLoader(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a ``DataLoader`` over sliding-window samples taken from ``txt``.

    ``txt``, ``max_length`` and ``stride`` are handed to ``Implement_Dataset``;
    ``batch_size``, ``shuffle``, ``drop_last`` and ``num_workers`` are handed
    unchanged to ``torch.utils.data.DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    windows = Implement_Dataset(txt, max_length, stride)
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

In [10]:
# Small demo: windows of 8 tokens that advance 4 tokens at a time, so
# neighbouring windows overlap by 4 tokens. shuffle is left at its default
# (True) and no seed has been set yet, so which windows land in the first
# batch can differ between runs.
dataloader = Implement_DataLoader(text_data, batch_size=4, max_length=8, stride=4)

data_iter = iter(dataloader)
# Each batch is an (inputs, targets) pair; unpack the first one.
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[33897, 19508,  8255,    82,  1174,   198,   198,  1174],
        [   36,  5239,    82,  4149,   540,  2750,  5747, 27411],
        [ 1174, 14618,  1675,   383,  2159,   286,  3232, 28847],
        [ 1174,   198,   198,  1174,    36,  5239,    82,  4149]])

Targets:
 tensor([[19508,  8255,    82,  1174,   198,   198,  1174,    36],
        [ 5239,    82,  4149,   540,  2750,  5747, 27411,   281],
        [14618,  1675,   383,  2159,   286,  3232, 28847, 33897],
        [  198,   198,  1174,    36,  5239,    82,  4149,   540]])


### Implementing Embeddings
### Combines token embeddings with positional encodings to give the model
### information about both word identity and sequence position.

In [11]:
import torch.nn as nn

class Embedding(nn.Module):
    """Token embedding plus learned absolute position embedding.

    Args:
        emb_dim: Size of each embedding vector.
        vocab_size: Number of rows in the token embedding table.
        max_length: Number of rows in the position embedding table, i.e. the
            longest sequence ``forward`` can handle.
    """

    def __init__(self, emb_dim, vocab_size, max_length):
        # nn.Module must be initialised before submodules are assigned;
        # without this call the assignments below raise AttributeError.
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(max_length, emb_dim)

    def forward(self, x):
        """Embed a batch of token ids.

        Args:
            x: Integer tensor indexed as ``(batch, seq_len)``; ``x.size(1)``
                is used as the sequence length.

        Returns:
            Sum of the token embeddings of ``x`` and the embeddings of the
            positions ``0 .. seq_len - 1``.
        """
        # Shape (1, seq_len) so the position embeddings broadcast over the batch.
        positions = torch.arange(0, x.size(1), device=x.device).unsqueeze(0)
        return self.token_emb(x) + self.pos_emb(positions)

### The core of GPT is the attention mechanism inside the Transformer. In the code below, I implement it step by step: first by initializing the query, key, and value projection weights, then by adding a causal mask that prevents tokens from attending to later positions. I also include dropout to improve generalization and reduce overfitting.

In [12]:

class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the output; split evenly
            across the heads.
        context_length: Side length of the pre-built causal mask, i.e. the
            longest sequence the mask covers.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads; must divide ``d_out``.
        qkv_bias: Whether the query/key/value projections have a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()

        # Each head gets an equal slice of d_out.
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        # Query / key / value projections, then the projection applied to the merged heads.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)

        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the "future" positions to hide.
        future_positions = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer("mask", future_positions)

    def _split_heads(self, projected, b, seq_len):
        """Reshape (b, seq_len, d_out) into (b, num_heads, seq_len, head_dim)."""
        per_head = projected.view(b, seq_len, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Apply masked self-attention to ``x`` of shape (b, seq_len, d_in).

        Returns a tensor of shape (b, seq_len, d_out).
        """
        b, seq_len, _ = x.shape

        queries = self._split_heads(self.W_query(x), b, seq_len)
        keys = self._split_heads(self.W_key(x), b, seq_len)
        values = self._split_heads(self.W_value(x), b, seq_len)

        # Raw scores per head: (b, num_heads, seq_len, seq_len).
        scores = queries @ keys.transpose(2, 3)

        # Hide later positions; -inf turns into weight 0 after the softmax.
        hide = self.mask.bool()[:seq_len, :seq_len]
        scores = scores.masked_fill(hide, float("-inf"))

        # Scale by sqrt(head_dim), normalise over the key dimension, then apply dropout.
        weights = self.dropout(torch.softmax(scores / (self.head_dim ** 0.5), dim=-1))

        # Weighted sum of values, then merge the heads back into d_out.
        per_head_context = weights @ values
        merged = per_head_context.transpose(1, 2).contiguous().view(b, seq_len, self.d_out)

        return self.out_proj(merged)


In [13]:
import torch
# Seed the RNG so the randomly initialised attention weights, and therefore
# the printed numbers, are reproducible.
torch.manual_seed(123)


# Toy input: 3 rows of 6 values. NOTE: this rebinds `inputs`, which earlier
# held the token-id batch taken from the dataloader.
inputs = torch.tensor(
    [[0.43, 0.15, 0.89, 0.55, 0.87, 0.66],  # Row 1
     [0.57, 0.85, 0.64, 0.22, 0.58, 0.33],  # Row 2
     [0.77, 0.25, 0.10, 0.05, 0.80, 0.55]]  # Row 3
)

# Stack two copies to get a batch of size 2: shape (2, 3, 6).
batch = torch.stack((inputs, inputs), dim=0)
print(batch.shape)

# batch_size is unpacked only to name the dimension; it is not used below.
batch_size, context_length, d_in = batch.shape
d_out = 6
# Dropout is 0.0 so the result is deterministic; 2 heads of size d_out // 2 = 3.
mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads=2)
context_vecs = mha(batch)
print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

torch.Size([2, 3, 6])
tensor([[[ 0.1569, -0.0873,  0.0210,  0.0215, -0.3243, -0.2518],
         [ 0.1117, -0.0547,  0.0406, -0.0213, -0.3251, -0.2993],
         [ 0.1196, -0.0491,  0.0318, -0.0635, -0.2788, -0.2578]],

        [[ 0.1569, -0.0873,  0.0210,  0.0215, -0.3243, -0.2518],
         [ 0.1117, -0.0547,  0.0406, -0.0213, -0.3251, -0.2993],
         [ 0.1196, -0.0491,  0.0318, -0.0635, -0.2788, -0.2578]]],
       grad_fn=<ViewBackward0>)
context_vecs.shape: torch.Size([2, 3, 6])


In [14]:
# Quick look at the token ids produced for a short phrase.
text_demo = "Hello Every one"
tokenizer = GPT_Tokenizer()
tokenized = tokenizer.encode(text_demo)
print(tokenized)

[15496, 3887, 530]


In [15]:
# Hyperparameters read (as `cfg`) by FeedForward, TransformerBlock and GPTModel.
# NOTE(review): the "124M" suffix presumably refers to the parameter count of
# the smallest GPT-2 model — confirm.
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Rows of the token embedding table and width of the output logits
    "context_length": 1024, # Rows of the position embedding table and side length of the causal mask
    "emb_dim": 768,         # Embedding dimension used throughout the model
    "n_heads": 12,          # Number of attention heads in every transformer block
    "n_layers": 12,         # Number of stacked transformer blocks
    "drop_rate": 0.1,       # Dropout probability (embeddings, attention weights, shortcut branches)
    "qkv_bias": False       # Whether the query/key/value projections have a bias term
}

Now let's implement layer normalization

In [16]:
import torch.nn as nn

class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and shift.

    Args:
        emb_dim: Size of the last dimension of the inputs.
    """

    def __init__(self, emb_dim):
        super().__init__()
        # Added to the variance so the division below never hits zero.
        self.eps = 1e-5
        # Start as the identity transform: scale 1, shift 0.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then apply scale and shift."""
        mu = x.mean(dim=-1, keepdim=True)
        # unbiased=False: divide by N rather than N - 1.
        sigma_sq = x.var(dim=-1, keepdim=True, unbiased=False)
        centered = x - mu
        std = torch.sqrt(sigma_sq + self.eps)
        return self.scale * (centered / std) + self.shift


In [17]:
import torch
batch_example = torch.randn(2, 5)  # 2 rows of 5 random values to feed through LayerNorm

In [18]:
# Show the raw values before normalisation.
print(batch_example)

tensor([[-0.6669,  0.5074, -1.1026, -0.3533, -1.7799],
        [ 0.6474,  0.5460,  0.8050, -1.3467, -0.6418]])


In [19]:
import torch.nn as nn

# Sanity check: after LayerNorm every row should have mean ~0 and variance ~1.
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)
mean = out_ln.mean(dim=-1, keepdim=True)
# unbiased=False matches the variance estimator used inside LayerNorm.forward.
var = out_ln.var(dim=-1, unbiased=False, keepdim=True)

print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[-5.9605e-09],
        [-4.7684e-08]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [20]:

class GELU(nn.Module):
    """GELU activation computed with the tanh approximation.

    ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    """

    def forward(self, x):
        """Apply the activation element-wise to ``x``."""
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        inner = coeff * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

In [21]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: expand to 4x width, GELU, project back.

    Args:
        cfg: Mapping that provides ``"emb_dim"``, the input and output width.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * cfg["emb_dim"]
        self.layers = nn.Sequential(
            nn.Linear(width, hidden),  # expansion
            GELU(),                    # non-linearity
            nn.Linear(hidden, width),  # contraction
        )

    def forward(self, x):
        """Run ``x`` through the three layers; the last dimension stays ``emb_dim``."""
        return self.layers(x)

In [22]:
class TransformerBlock(nn.Module):
    """One transformer block: attention and feed-forward, each with a shortcut.

    Both sub-layers follow the same pattern: LayerNorm -> sub-layer ->
    dropout -> add the sub-layer's input back.

    Args:
        cfg: Mapping with ``"emb_dim"``, ``"context_length"``, ``"n_heads"``,
            ``"drop_rate"`` and ``"qkv_bias"``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        attention_options = {
            "d_in": width,
            "d_out": width,
            "context_length": cfg["context_length"],
            "num_heads": cfg["n_heads"],
            "dropout": cfg["drop_rate"],
            "qkv_bias": cfg["qkv_bias"],
        }
        self.att = MultiHeadAttention(**attention_options)
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(width)
        self.norm2 = LayerNorm(width)
        # A single dropout module is shared by both shortcut branches.
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Transform ``x`` of shape [batch_size, num_tokens, emb_size]; shape is preserved."""
        # Attention sub-layer with shortcut connection.
        residual = x
        attended = self.att(self.norm1(x))
        x = self.drop_shortcut(attended) + residual

        # Feed-forward sub-layer with shortcut connection.
        residual = x
        transformed = self.ff(self.norm2(x))
        x = self.drop_shortcut(transformed) + residual

        return x

In [23]:
class GPTModel(nn.Module):
    """Decoder-only language model producing next-token logits.

    Token and position embeddings are summed, passed through dropout and
    ``cfg["n_layers"]`` transformer blocks, normalised once more, and
    projected to vocabulary-sized logits.

    Args:
        cfg: Mapping with ``"vocab_size"``, ``"context_length"``,
            ``"emb_dim"``, ``"n_layers"`` and ``"drop_rate"``, plus whatever
            ``TransformerBlock`` reads from it.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        vocab = cfg["vocab_size"]

        self.tok_emb = nn.Embedding(vocab, width)
        self.pos_emb = nn.Embedding(cfg["context_length"], width)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(width)
        # Bias-free projection from hidden states to vocabulary logits.
        self.out_head = nn.Linear(width, vocab, bias=False)

    def forward(self, in_idx):
        """Map token ids of shape (batch, seq_len) to logits of shape (batch, seq_len, vocab_size)."""
        _, seq_len = in_idx.shape
        positions = torch.arange(seq_len, device=in_idx.device)

        # Position embeddings have shape (seq_len, emb_dim) and broadcast over the batch.
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

In [26]:
import tiktoken
# Build a batch of two prompts by hand. torch.stack needs tensors of equal
# length; both prompts encode to 5 token ids (see the printed batch).
# NOTE: this rebinds `batch`, which earlier held the float attention demo input.
tokenizer = GPT_Tokenizer()
batch = []
txt1 = "Your hard work leads to"
txt2 = "hard work is a key"
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)
print(batch)

tensor([[ 7120,  1327,   670,  5983,   284],
        [10424,   670,   318,   257,  1994]])


In [27]:
# Seed before building the model so the random initial weights are reproducible.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# model.eval() has not been called at this point, so the dropout layers are
# presumably still active during this forward pass — confirm if exact
# reproducibility of the printed logits matters.
out = model(batch)
print("Input batch:\n", batch)
# One logit per vocabulary entry for every token position: (batch, seq_len, vocab_size).
print("\nOutput shape:", out.shape)
print(out)

Input batch:
 tensor([[ 7120,  1327,   670,  5983,   284],
        [10424,   670,   318,   257,  1994]])

Output shape: torch.Size([2, 5, 50257])
tensor([[[ 0.8715,  0.7359, -0.4033,  ...,  0.3783,  0.0976, -1.1895],
         [ 0.6399, -0.3078,  0.5744,  ...,  0.2013,  0.4935, -0.1959],
         [ 1.1128,  0.4172, -0.7705,  ...,  0.4772, -0.7034, -0.9059],
         [-0.2190,  0.1247,  0.4820,  ...,  0.6499, -0.5742, -0.4945],
         [ 0.8533, -0.3853, -0.5923,  ..., -0.6437,  0.0973, -0.7483]],

        [[-0.3444, -0.3724,  0.0283,  ..., -0.1598,  0.0271,  0.0101],
         [ 0.0273, -0.5396, -1.3966,  ..., -0.1147, -0.5108, -0.5886],
         [ 1.3964, -0.7803, -0.2001,  ...,  0.5523,  0.2406, -0.1905],
         [-0.3339,  0.0140,  0.4530,  ...,  1.0095, -0.5696, -0.1425],
         [ 0.2232, -0.0840, -0.0050,  ...,  0.2683,  0.1999,  0.4608]]],
       grad_fn=<UnsafeViewBackward0>)


In [29]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping a (batch, n_tokens) tensor of token ids to
            logits of shape (batch, n_tokens, vocab_size).
        idx: (batch, n_tokens) tensor of token ids forming the prompt.
        max_new_tokens: Number of tokens to append.
        context_size: Maximum number of trailing tokens fed to the model at
            each step.

    Returns:
        Tensor of shape (batch, n_tokens + max_new_tokens): the prompt
        followed by the generated token ids.
    """
    for _step in range(max_new_tokens):
        # Feed at most the last `context_size` tokens to the model.
        window = idx[:, -context_size:]

        # Inference only: no gradients are needed for the forward pass.
        with torch.no_grad():
            logits = model(window)

        # Keep only the prediction for the final position: (batch, vocab_size).
        last_step_logits = logits[:, -1, :]
        probabilities = torch.softmax(last_step_logits, dim=-1)

        # Greedy choice: the most probable vocabulary entry, shape (batch, 1).
        next_token = torch.argmax(probabilities, dim=-1, keepdim=True)

        # Grow the running sequence by one token: (batch, n_tokens + 1).
        idx = torch.cat((idx, next_token), dim=1)

    return idx

In [30]:
# Encode the prompt that generation will start from.
start_context = "Hello,I'm a developer"
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)
encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # add a leading batch dimension: (n_tokens,) -> (1, n_tokens)
print("encoded_tensor.shape:", encoded_tensor.shape)

encoded: [15496, 11, 40, 1101, 257, 8517]
encoded_tensor.shape: torch.Size([1, 6])


In [32]:
model.eval()  # switch to evaluation mode so the dropout layers are inactive during generation
# `model` is the instance created above; no training happens in the cells
# shown here, so the generated continuation is not expected to be meaningful.
out = generate_text_simple(
model=model,
idx=encoded_tensor,
max_new_tokens=10,
context_size=GPT_CONFIG_124M["context_length"]
)
print("Output:", out)
# Prompt tokens plus the newly generated ones.
print("Output length:", len(out[0]))

Output: tensor([[15496,    11,    40,  1101,   257,  8517,  3271, 46935, 17060, 39258,
           297, 49965,  7385, 38855,  2046,  4344]])
Output length: 16


In [33]:
# Drop the batch dimension and convert to a plain list of ids before decoding to text.
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

Hello,I'm a developer David vaPutinれll GAM KirAccept fire adop
