<a href="https://colab.research.google.com/github/kumar-sanchay/llm-model-code/blob/main/gpt2_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import urllib.request
url = ("https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt")
file_path = "the-verdict.txt"

urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x7ef51c0bd400>)

In [2]:
import tiktoken

In [3]:
tokenizer = tiktoken.get_encoding('gpt2')

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
  def __init__(self, txt, tokenizer, max_length, stride):
    self.input_ids = []
    self.target_ids = []

    token_ids = tokenizer.encode(txt)

    for i in range(0, len(token_ids) - max_length, stride):
      input_chunk = token_ids[i: i + max_length]
      target_chunk = token_ids[i + 1: i + max_length + 1]
      self.input_ids.append(torch.tensor(input_chunk))
      self.target_ids.append(torch.tensor(target_chunk))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.target_ids[idx]

In [5]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
  tokenizer = tiktoken.get_encoding('gpt2')
  dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
  dataloader = DataLoader(
      dataset,
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers,
  )

  return dataloader

In [6]:
raw_text = None
with open('the-verdict.txt', 'r', encoding='utf-8') as f:
  raw_text = f.read()

dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

In [7]:
vocab_size = 50257
output_dim = 256

In [8]:
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [9]:
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [10]:
token_embeddings = embedding_layer(inputs)

In [11]:
token_embeddings.shape

torch.Size([8, 4, 256])

In [12]:
context_length = 4
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))

In [13]:
pos_embeddings.shape

torch.Size([4, 256])

In [14]:
input_embeddings = token_embeddings + pos_embeddings

In [15]:
input_embeddings.shape

torch.Size([8, 4, 256])

In [16]:
inputs[0]

tensor([  40,  367, 2885, 1464])

In [17]:
token_embeddings.shape

torch.Size([8, 4, 256])

In [18]:
token_embeddings[0][0]

tensor([-6.3964e-02,  3.3174e-01,  1.0698e-01, -1.4281e-01, -3.0141e-01,
        -6.3647e-01, -2.2577e+00,  8.0754e-01,  1.2420e+00, -7.9977e-01,
         1.3265e+00, -1.3640e+00, -1.4752e+00, -1.0095e-01, -1.5406e+00,
        -9.0417e-01, -1.2689e-01,  9.2804e-01,  5.6544e-01, -1.8898e-01,
        -1.2679e-01,  4.5306e-01, -7.6017e-01, -1.0059e+00, -9.2078e-01,
         8.7217e-01,  1.8234e+00, -1.9416e+00,  1.0657e+00,  3.9532e-01,
         5.7309e-01, -7.7677e-01, -1.3191e+00, -6.6007e-01,  4.9130e-01,
         1.1239e+00,  1.4588e+00, -3.6530e-01, -4.0372e-02,  1.9042e-01,
        -1.1572e+00, -1.1329e+00,  2.7644e-01,  6.3706e-01, -1.5947e+00,
        -2.6210e-02, -1.4804e+00, -1.0301e+00, -1.3259e+00, -8.9009e-01,
         1.8119e+00,  1.9705e+00,  1.6373e-01, -5.1866e-01,  5.2069e-01,
        -1.0986e+00,  1.3451e+00, -2.5956e-01,  1.2858e+00, -1.9670e+00,
         9.2041e-02,  4.5699e-01, -1.4753e+00, -4.4934e-01,  1.8373e+00,
        -1.1507e+00, -6.0601e-01, -5.3533e-01, -2.9

In [19]:
pos_embeddings[0]

tensor([-1.4150, -0.3142,  0.2827, -1.3217,  0.1522,  1.4696, -0.7081, -0.8260,
         0.9868, -1.0706, -1.9527,  0.3350,  0.7528,  1.2890, -1.8265,  1.3226,
         0.7963,  0.7234,  0.2391,  0.5925,  1.9121,  1.6565,  0.3007, -0.9644,
         1.4145, -0.6995,  1.8648,  1.8325,  2.4375,  0.4970,  0.9365, -0.2091,
         0.7504,  0.7353,  1.7375, -0.5620, -0.6303, -0.4848, -0.1366,  1.7588,
        -1.2587, -0.4871, -0.8335,  1.0413, -0.7001,  0.2432,  0.1836, -0.5901,
         0.0174, -0.7633,  0.8106, -0.7827, -0.7718,  1.2198, -0.4126,  1.6117,
        -0.0736,  0.1487, -0.6539,  1.8390,  1.3469, -0.3277,  0.8704, -0.3255,
        -1.4792, -0.1963, -0.6116,  0.6067, -1.1067,  1.0557,  0.6327,  0.6573,
        -0.5459, -0.5453,  0.9371, -0.4495,  0.3675, -0.2097,  1.1318,  1.6343,
        -1.5092, -0.0614, -2.6343,  1.1810,  0.1611, -0.3978, -0.5137,  0.3546,
         0.1013,  0.5953,  0.6264, -0.1469,  1.4578, -1.0714, -0.4406,  0.5838,
        -1.6059, -1.3471,  1.1953,  0.32

In [20]:
import torch.nn as nn

In [21]:
class SelfAttention_v1(nn.Module):
  def __init__(self, d_in, d_out):
    super().__init__()
    self.w_query = nn.Parameter(torch.rand(d_in, d_out))
    self.w_key = nn.Parameter(torch.rand(d_in, d_out))
    self.w_value = nn.Parameter(torch.rand(d_in, d_out))

  def forward(self, x):
    keys = x @ self.w_key
    query = x @ self.w_query
    value = x @ self.w_value

    attention_score = query @ keys.T
    attention_weights = torch.softmax(attention_score / keys.shape[-1] ** 0.5, dim=-1)
    context_vector = attention_weights @ value
    return context_vector

In [22]:
torch.manual_seed(123)
inputs = torch.rand(3, 2)
self_attn = SelfAttention_v1(2, 3)
print(inputs)
print(self_attn(inputs))

tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])
tensor([[0.4905, 0.5777, 0.8576],
        [0.4909, 0.5780, 0.8582],
        [0.4913, 0.5782, 0.8587]], grad_fn=<MmBackward0>)


In [23]:
class SelfAttention_v2(nn.Module):
  def __init__(self, d_in, d_out):
    super().__init__()
    self.w_query = nn.Linear(d_in, d_out, bias=False)
    self.w_key = nn.Linear(d_in, d_out, bias=False)
    self.w_value = nn.Linear(d_in, d_out, bias=False)

  def forward(self, x):
    keys = self.w_key(x)
    query = self.w_query(x)
    value = self.w_value(x)

    attn_score = query @ keys.T
    attn_weights = torch.softmax(attn_score / keys.shape[-1] ** 0.5, dim=-1)
    context_vector = attn_weights @ value
    return context_vector

In [24]:
torch.manual_seed(123)
inputs = torch.rand(3, 2)
self_attn = SelfAttention_v2(2, 3)
print(inputs)
print(self_attn(inputs))

tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])
tensor([[0.1224, 0.1945, 0.5119],
        [0.1226, 0.1944, 0.5122],
        [0.1229, 0.1942, 0.5127]], grad_fn=<MmBackward0>)


In [25]:
class CasualAttention(nn.Module):
  def __init__(self, d_in, d_out, context_length, dropout, qkv_bias=False):
    super().__init__()
    self.w_query = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.w_key = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.w_value = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.dropout = nn.Dropout(dropout)
    self.mask = torch.triu(torch.ones(context_length, context_length))

  def forward(self, x):
    b, num_tokens, d_in = x.shape
    keys = self.w_key(x)
    query = self.w_query(x)
    value = self.w_value(x)

    attn_score = query @ keys.transpose(1, 2)
    attn_score.masked_fill_(self.mask[:num_tokens, :num_tokens] == 0, float('-inf'))
    attn_weights = torch.softmax(attn_score / keys.shape[-1] ** 0.5, dim=-1)
    att_weights = self.dropout(attn_weights)
    context_vector = attn_weights @ value
    return context_vector

In [26]:
torch.manual_seed(123)
inputs = torch.stack([torch.rand(3, 2) for _ in range(2)])
self_attn = CasualAttention(2, 3, 3, 0.5)
print(inputs)
print(self_attn(inputs))

tensor([[[0.2961, 0.5166],
         [0.2517, 0.6886],
         [0.0740, 0.8665]],

        [[0.1366, 0.1025],
         [0.1841, 0.7264],
         [0.3153, 0.6871]]])
tensor([[[ 0.0883,  0.2832, -0.3324],
         [ 0.1222,  0.3416, -0.4148],
         [ 0.1705,  0.4146, -0.5231]],

        [[ 0.0452,  0.1875, -0.2083],
         [ 0.0787,  0.2786, -0.3197],
         [ 0.0534,  0.2480, -0.2699]]], grad_fn=<UnsafeViewBackward0>)


In [27]:
class MultiHeadAttentionWrapper(nn.Module):
  def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
    super().__init__()
    self.heads = [CasualAttention(d_in, d_out, context_length, dropout, qkv_bias) for _ in range(num_heads)]

  def forward(self, x):
    return torch.cat([head(x) for head in self.heads], dim=-1)

In [28]:
torch.manual_seed(123)
inputs = torch.stack([torch.rand(3, 2) for _ in range(2)])
self_attn = MultiHeadAttentionWrapper(2, 3, 3, 0.5, 2)
print(inputs)
print(self_attn(inputs))

tensor([[[0.2961, 0.5166],
         [0.2517, 0.6886],
         [0.0740, 0.8665]],

        [[0.1366, 0.1025],
         [0.1841, 0.7264],
         [0.3153, 0.6871]]])
tensor([[[ 8.8264e-02,  2.8325e-01, -3.3242e-01,  4.5025e-01,  4.0326e-03,
           5.4171e-02],
         [ 1.2216e-01,  3.4159e-01, -4.1480e-01,  5.1136e-01,  8.4759e-03,
           6.8762e-02],
         [ 1.7049e-01,  4.1461e-01, -5.2314e-01,  5.7574e-01,  1.5328e-02,
           8.8167e-02]],

        [[ 4.5161e-02,  1.8753e-01, -2.0833e-01,  3.2283e-01, -5.0505e-04,
           3.2531e-02],
         [ 7.8657e-02,  2.7858e-01, -3.1973e-01,  4.5765e-01,  1.9947e-03,
           5.1152e-02],
         [ 5.3358e-02,  2.4804e-01, -2.6991e-01,  4.4004e-01, -2.1567e-03,
           4.1614e-02]]], grad_fn=<CatBackward0>)


In [29]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [30]:
class DummyGPTModel(nn.Module):

  def __init__(self, cfg):
    super().__init__()
    self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
    self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
    self.drop_emb = nn.Dropout(cfg["drop_rate"])
    self.trf_blocks = nn.Sequential(
        [DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
    )
    self.final_norm = DummyLayerNorm(cfg["emb_dim"])
    self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=True)

  def forward(self, in_idx):
    batch_size, seq_len = in_idx.shape
    tok_embeds = self.tok_emb(in_idx)
    pos_embeds = self.pos_emb(
        torch.arange(seq_len, device=in_idx.device)
    )
    x = tok_embeds + pos_embeds
    x = self.drop_emb(x)
    x = self.trf_blocks(x)
    x = self.final_norm(x)
    logits = self.out_head(x)
    return logits


In [31]:
class DummyTransformerBlock(nn.Module):
  def __init__(self, cfg):
    super().__init__()

  def forward(self, x):
    return x

In [32]:
class DummyLayerNorm(nn.Module):
  def __init__(self, normalized_shape, eps=1e-5):
    super().__init__()

  def forward(self, x):
    return x

In [33]:
class LayerNorm(nn.Module):
  def __init__(self, emb_dim):
    super().__init__()
    self.eps = 1e-5
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

  def forward(self, x):
    mean = x.mean(keepdim=True, dim=-1)
    var = x.var(keepdim=True, dim=-1, unbiased=False)
    norm_x = (x - mean) / torch.sqrt(var + self.eps)
    return self.scale * norm_x + self.shift

In [34]:
batch_example = torch.rand(2, 5)

In [35]:
batch_example

tensor([[0.3443, 0.6800, 0.9998, 0.2855, 0.9753],
        [0.2518, 0.7204, 0.6959, 0.6397, 0.8954]])

In [36]:
ln = LayerNorm(5)
ln_out = ln(batch_example)

In [37]:
ln_out

tensor([[-1.0363,  0.0763,  1.1361, -1.2312,  1.0550],
        [-1.8307,  0.3756,  0.2601, -0.0045,  1.1995]], grad_fn=<AddBackward0>)

In [38]:
ln_out.mean(keepdim=True, dim=-1)

tensor([[-4.7684e-08],
        [-1.0775e-07]], grad_fn=<MeanBackward1>)

In [39]:
ln_out.var(keepdim=True, dim=-1, unbiased=False)

tensor([[0.9999],
        [0.9998]], grad_fn=<VarBackward0>)

Let's first look at Batch Normalization. Batch normalization normalizes the activations across the batch dimension.

In [40]:
import torch
import torch.nn as nn

# Example with Batch Normalization
batch_norm_example = torch.randn(4, 3, 2) # Batch size = 4, features = 3, sequence length = 2
batch_norm_layer = nn.BatchNorm1d(3) # Normalize across the feature dimension
batch_norm_out = batch_norm_layer(batch_norm_example) # Apply BatchNorm1d across features

print("Batch Norm Input Shape:", batch_norm_example.shape)
print("Batch Norm Output Shape:", batch_norm_out.shape)
print("\nBatch Norm Input:\n", batch_norm_example)
print("\nBatch Norm Output:\n", batch_norm_out)

Batch Norm Input Shape: torch.Size([4, 3, 2])
Batch Norm Output Shape: torch.Size([4, 3, 2])

Batch Norm Input:
 tensor([[[-0.8016, -0.8183],
         [-1.1820, -0.2877],
         [-0.6043,  0.6002]],

        [[-1.4205, -0.2238],
         [ 0.9474,  1.6834],
         [ 0.5663,  1.0306]],

        [[-0.3047,  1.6873],
         [ 0.6851,  2.0024],
         [ 1.2118, -1.2076]],

        [[-0.3016, -0.7074],
         [-0.1465, -0.4943],
         [-1.1766, -2.0524]]])

Batch Norm Output:
 tensor([[[-0.5141, -0.5336],
         [-1.5206, -0.6615],
         [-0.3528,  0.7088]],

        [[-1.2368,  0.1606],
         [ 0.5249,  1.2319],
         [ 0.6789,  1.0881]],

        [[ 0.0661,  2.3922],
         [ 0.2729,  1.5383],
         [ 1.2478, -0.8845]],

        [[ 0.0697, -0.4042],
         [-0.5259, -0.8600],
         [-0.8572, -1.6291]]], grad_fn=<NativeBatchNormBackward0>)


Now let's look at Layer Normalization. Layer normalization normalizes the activations across the feature dimension.

In [41]:
import torch
import torch.nn as nn

# Example with Layer Normalization
layer_norm_example = torch.randn(4, 3, 2) # Batch size = 4, features = 3, sequence length = 2
layer_norm_layer = nn.LayerNorm(2) # Normalize across the sequence length dimension
layer_norm_out = layer_norm_layer(layer_norm_example)

print("Layer Norm Input Shape:", layer_norm_example.shape)
print("Layer Norm Output Shape:", layer_norm_out.shape)
print("\nLayer Norm Input:\n", layer_norm_example)
print("\nLayer Norm Output:\n", layer_norm_out)

Layer Norm Input Shape: torch.Size([4, 3, 2])
Layer Norm Output Shape: torch.Size([4, 3, 2])

Layer Norm Input:
 tensor([[[ 1.0318, -0.4272],
         [-1.1454, -1.3316],
         [ 0.2230,  0.6463]],

        [[ 0.1538, -0.4452],
         [-0.2721, -0.3510],
         [ 1.1152, -0.6172]],

        [[-2.2708, -1.3819],
         [-0.8484,  0.5323],
         [-0.4053,  0.7086]],

        [[ 0.9533, -0.0130],
         [-0.1301, -0.0877],
         [ 0.4187, -1.1123]]])

Layer Norm Output:
 tensor([[[ 1.0000, -1.0000],
         [ 0.9994, -0.9994],
         [-0.9999,  0.9999]],

        [[ 0.9999, -0.9999],
         [ 0.9968, -0.9968],
         [ 1.0000, -1.0000]],

        [[-1.0000,  1.0000],
         [-1.0000,  1.0000],
         [-1.0000,  1.0000]],

        [[ 1.0000, -1.0000],
         [-0.9891,  0.9891],
         [ 1.0000, -1.0000]]], grad_fn=<NativeLayerNormBackward0>)


Here's a breakdown of the differences:

In [42]:
class GELU(nn.Module):
  def __init__(self):
    super().__init__()

  def forward(self, x):
    return 0.5 * x * (1 + torch.tanh(
        torch.sqrt(torch.tensor(2 / torch.pi)) *
        (x + 0.044715 * torch.pow(x, 3))
    ))

In [43]:
gelu = GELU()
batch_out = gelu(batch_example)

In [44]:
batch_out

tensor([[0.2185, 0.5112, 0.8409, 0.1748, 0.8145],
        [0.1509, 0.5506, 0.5265, 0.4725, 0.7294]])

In [45]:
batch_example

tensor([[0.3443, 0.6800, 0.9998, 0.2855, 0.9753],
        [0.2518, 0.7204, 0.6959, 0.6397, 0.8954]])

In [46]:
relu = nn.ReLU()
relu_batch = relu(batch_example)
relu_batch

tensor([[0.3443, 0.6800, 0.9998, 0.2855, 0.9753],
        [0.2518, 0.7204, 0.6959, 0.6397, 0.8954]])

In [47]:
class FeedForward(nn.Module):
  def __init__(self, cfg):
    super().__init__()
    self.layers = nn.Sequential(
        nn.Linear(cfg['emb_dim'], 4 * cfg['emb_dim']),
        GELU(),
        nn.Linear(4 * cfg['emb_dim'], cfg['emb_dim'])
    )

  def forward(self, x):
    return self.layers(x)

In [48]:
class MultiHeadAttention(nn.Module):

  def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
    super().__init__()
    assert (d_out % num_heads) == 0, "d_out must be divisible by num_heads"

    self.d_out = d_out
    self.num_heads = num_heads
    self.head_dim = d_out // num_heads
    self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.out_proj = nn.Linear(d_out, d_out)
    self.dropout = nn.Dropout(dropout)
    self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

  def forward(self, x):
    b, num_tokens, d_in = x.shape
    keys = self.W_key(x)
    queries = self.W_query(x)
    values = self.W_value(x)

    keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
    queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
    values = values.view(b, num_tokens, self.num_heads, self.head_dim)

    keys = keys.transpose(1, 2)
    queries = queries.transpose(1, 2)
    values = values.transpose(1, 2)

    attn_score = queries @ keys.transpose(2, 3)
    mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
    attn_score.masked_fill_(mask_bool, float('-inf'))
    attn_weights = torch.softmax(attn_score / keys.shape[-1] ** 0.5, dim=-1)

    attn_weights = self.dropout(attn_weights)
    context_vector = (attn_weights @ values).transpose(1, 2)
    context_vector = context_vector.contiguous().view(b, num_tokens, self.d_out)
    context_vector = self.out_proj(context_vector)
    return context_vector

In [49]:
batch_example = torch.rand(2, 5, 10)

batch_size, context_length, d_in = batch_example.shape
d_out = 2

mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, 2)
context_vector = mha(batch_example)

context_vector

tensor([[[-0.3639,  0.2417],
         [-0.3602,  0.1461],
         [-0.3934,  0.1237],
         [-0.3770,  0.2026],
         [-0.3503,  0.2383]],

        [[-0.4807,  0.2758],
         [-0.4407,  0.2398],
         [-0.3629,  0.3368],
         [-0.3285,  0.3899],
         [-0.3268,  0.3660]]], grad_fn=<ViewBackward0>)

In [50]:
class TransformerBlock(nn.Module):
  def __init__(self, cfg):
    super().__init__()
    self.att = MultiHeadAttention(
        cfg['emb_dim'],
        cfg['emb_dim'],
        cfg['context_length'],
        cfg['drop_rate'],
        cfg['n_heads'],
        qkv_bias=True
    )
    self.ff = FeedForward(cfg)
    self.norm1 = LayerNorm(cfg["emb_dim"])
    self.norm2 = LayerNorm(cfg["emb_dim"])
    self.drop_shortcut = nn.Dropout(cfg['drop_rate'])

  def forward(self, x):
    shortcut = x
    x = self.norm1(x)
    x = self.att(x)
    x = self.drop_shortcut(x)
    x = x + shortcut

    shortcut = x
    x = self.norm2(x)
    x = self.ff(x)
    x = self.drop_shortcut(x)
    x = x + shortcut
    return x


In [51]:
x = torch.rand(2, 4, 768)

In [52]:
block = TransformerBlock(GPT_CONFIG_124M)
output = block(x)

In [53]:
output.shape

torch.Size([2, 4, 768])

In [54]:
class GPTModel(nn.Module):
  def __init__(self, cfg):
    super().__init__()
    self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
    self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
    self.drop_emb = nn.Dropout(cfg["drop_rate"])
    self.trf_blocks = nn.Sequential(
        *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
    )
    self.final_norm = LayerNorm(cfg["emb_dim"])
    self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=True)

  def forward(self, in_idx):
    batch_size, seq_len = in_idx.shape
    tok_embeds = self.tok_emb(in_idx)
    pos_embeds = self.pos_emb(
        torch.arange(seq_len, device=in_idx.device)
    )
    x = tok_embeds + pos_embeds
    x = self.drop_emb(x)
    x = self.trf_blocks(x)
    x = self.final_norm(x)
    logits = self.out_head(x)
    return logits

In [55]:
batch = torch.rand(2, 4)
batch

tensor([[0.3670, 0.8510, 0.0659, 0.5853],
        [0.4178, 0.4050, 0.0044, 0.6963]])

In [56]:
import torch

batch = torch.randint(0, GPT_CONFIG_124M['vocab_size'], (2, 4), dtype=torch.long)
print(batch)

tensor([[12535, 10628,  9273, 39066],
        [20359, 13568,  4977,  1049]])


In [57]:
model = GPTModel(GPT_CONFIG_124M)
out = model(batch)
print(out.shape)

torch.Size([2, 4, 50257])


In [58]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
  for _ in range(max_new_tokens):
    idx_cond = idx[:, -context_size:]
    with torch.no_grad():
      logits = model(idx_cond)
    logits = logits[:, -1, :]
    probs = torch.softmax(logits, dim=-1)
    idx_next = torch.argmax(probs, dim=-1, keepdim=True)
    idx = torch.cat((idx, idx_next), dim=1)
  return idx

In [59]:
start_context = "Hello I'm a"
encoded = tokenizer.encode(start_context)
encoded_tensor = torch.tensor(encoded).unsqueeze(0)
encoded_tensor

tensor([[15496,   314,  1101,   257]])

In [60]:
model.eval()
out = generate_text_simple(model, encoded_tensor, 6, context_size=GPT_CONFIG_124M['context_length'])
out

tensor([[15496,   314,  1101,   257, 41565, 10069, 17722, 13310,  9956, 39131]])

In [61]:
decoded_text = tokenizer.decode(out[0].tolist())
print(decoded_text)

Hello I'm aPod reveals Mik pending Eth Crom


In [62]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [63]:
def text_to_token_ids(text, tokenizer):
  encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
  encoded_tensor = torch.tensor(encoded).unsqueeze(0)
  return encoded_tensor

In [64]:
def token_ids_to_text(token_ids, tokenizer):
  flat = token_ids.squeeze(0)
  return tokenizer.decode(flat.tolist())

In [65]:
start_context = "Every effort moves you"
token_ids = generate_text_simple(
    model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M['context_length']
)
print('Output: ' + token_ids_to_text(token_ids, tokenizer))

Output: Every effort moves you Whateveribility horrified babrog422kind likelihood reinstated meta


In [66]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
  text_data = f.read()

In [67]:
print(len(text_data))

20479


In [68]:
train_ratio = 0.9
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

In [69]:
print(len(train_data))
print(len(val_data))

18431
2048


In [70]:
train_loader = create_dataloader_v1(
    train_data,
    2,
    GPT_CONFIG_124M['context_length'],
    GPT_CONFIG_124M['context_length'],
    True,
    True,
    0
)

val_loader = create_dataloader_v1(
    val_data,
    2,
    GPT_CONFIG_124M['context_length'],
    GPT_CONFIG_124M['context_length'],
    False,
    False,
    0
)

In [71]:
for x, y in train_loader:
  print(x.shape)
  print(y.shape)

torch.Size([2, 256])
torch.Size([2, 256])
torch.Size([2, 256])
torch.Size([2, 256])
torch.Size([2, 256])
torch.Size([2, 256])
torch.Size([2, 256])
torch.Size([2, 256])
torch.Size([2, 256])
torch.Size([2, 256])
torch.Size([2, 256])
torch.Size([2, 256])
torch.Size([2, 256])
torch.Size([2, 256])
torch.Size([2, 256])
torch.Size([2, 256])
torch.Size([2, 256])
torch.Size([2, 256])


In [72]:
def calc_loss_batch(input_batch, target_batch, model, device):
  input_batch = input_batch.to(device)
  target_batch = target_batch.to(device)
  logits = model(input_batch)
  loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
  return loss

In [73]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
  total_loss = 0
  if len(data_loader) == 0:
    return float('nan')
  elif num_batches is None:
    num_batches = len(data_loader)
  else:
    num_batches = min(num_batches, len(data_loader))

  for i, (input, target) in enumerate(data_loader):
    if i > num_batches:
      break
    loss = calc_loss_batch(input, target, model, device)
    total_loss += loss.item()
  return total_loss / num_batches

In [74]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [None]:
model.to(device)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [None]:
with torch.no_grad():
  train_loss = calc_loss_loader(train_loader, model, device)
  val_loss = calc_loss_loader(val_loader, model, device)
print(train_loss)
print(val_loss)

11.02022616068522
11.024858474731445


In [None]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
  train_losses, val_losses, track_tokens_seen = [], [], []
  tokens_seen, global_step = 0, -1

  for epoch in range(num_epochs):
    model.train()
    for input_batch, target_batch in train_loader:
      optimizer.zero_grad()
      loss = calc_loss_batch(input_batch, target_batch, model, device)
      loss.backward()
      optimizer.step()

      tokens_seen += input_batch.numel()
      global_step += 1

      if global_step % eval_freq == 0:
        train_loss, val_loss = evaluate_model(model, train_loader, val_loader,
                                              device, eval_iter)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        track_tokens_seen.append(tokens_seen)

        print(f"Epoch: {epoch}, Step: {global_step}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
  return train_losses, val_losses, track_tokens_seen

In [None]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
  model.eval()

  with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, eval_iter)
    val_loss = calc_loss_loader(val_loader, model, device, eval_iter)
    model.train()
  return train_loss, val_loss

In [None]:
# model = GPTModel(GPT_CONFIG_124M)
# model.to(device)
# optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

# num_epochs = 10
# train_losses, val_losses, tokens_seen = train_model_simple(model, train_loader, val_loader,
#                                                            optimizer, device, num_epochs, 5, 5,
#                                                            "Every effort moves you", tokenizer=tokenizer)

In [None]:
# torch.save({
#     "model_state_dict": model.state_dict(),
#     "optimizer_state_dict": optimizer.state_dict()
# }, "gpt_model_and_optimizer.pth"
# # )

In [None]:
# torch.save(model.state_dict(), "gpt_model.pth")

In [None]:
# checkpoint = torch.load("/content/drive/MyDrive/gpt_model_and_optimizer.pth")
# model = GPTModel(GPT_CONFIG_124M)
# model.load_state_dict(checkpoint["model_state_dict"])
# optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)
# optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
# model.train()

In [None]:
# tokenizer = tiktoken.get_encoding("gpt2")
# token_ids = generate_text_simple(
#     model = model,
#     idx = text_to_token_ids("Every effort moves you", tokenizer),
#     max_new_tokens = 25,
#     context_size = GPT_CONFIG_124M['context_length']
# )

In [None]:
# print(token_ids_to_text(token_ids, tokenizer))

In [75]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
  for _ in range(max_new_tokens):
    idx_cond = idx[:, -context_size:]
    with torch.no_grad():
      logits = model(idx_cond)
    logits = logits[:, -1, :]

    if top_k is not None:
      top_logits, _ = torch.topk(logits, top_k)
      min_val = top_logits[:, -1]
      logits = torch.where(logits < min_val,
                           torch.tensor(float('-inf')).to(logits.device),
                           logits)

      if temperature > 0.0:
        logits = logits / temperature
        probs = torch.softmax(logits, dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)
      else:
        idx_next = torch.argmax(logits, dim=-1, keepdim=True)

      if idx_next == eos_id:
        break
      idx = torch.cat((idx, idx_next), dim=1)
  return idx


In [76]:
token_ids = generate(
    model = model,
    idx=text_to_token_ids("Every effort moves you", tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M['context_length'],
    temperature=1.2,
    top_k=25
)

In [77]:
print(token_ids_to_text(token_ids, tokenizer))

Every effort moves you trafficï InteriorcularVAL statist ecosystemstheirrf Attempt


Let's install pre-train gpt 2 model

In [79]:
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch05/01_main-chapter-code/gpt_download.py"
)

filename = url.split('/')[-1]
urllib.request.urlretrieve(url, filename)

('gpt_download.py', <http.client.HTTPMessage at 0x7ef51ca62630>)

In [None]:
# pip install tensorflow==2.15.0

In [None]:
# pip install tqdm >= 4.66

In [80]:
from gpt_download import download_and_load_gpt2

In [81]:
settings, params = download_and_load_gpt2(
    model_size="124M", models_dir="gpt2"
)

checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 111kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 2.80MiB/s]
hparams.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 164kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 498M/498M [00:35<00:00, 14.0MiB/s]
model.ckpt.index: 100%|██████████| 5.21k/5.21k [00:00<00:00, 7.07MiB/s]
model.ckpt.meta: 100%|██████████| 471k/471k [00:00<00:00, 2.01MiB/s]
vocab.bpe: 100%|██████████| 456k/456k [00:00<00:00, 1.97MiB/s]


In [82]:
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25}
}

In [83]:
model_name = "gpt2-small (124M)"
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])

In [84]:
NEW_CONFIG.update({"context_length": 1024})

In [85]:
NEW_CONFIG.update({"qkv_bias": True})

In [163]:
gpt = GPTModel(NEW_CONFIG)
gpt.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [87]:
import numpy as np

In [88]:
def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch: {left.shape} {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))

In [89]:
def load_weights_into_gpt(gpt, pramas):
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for b in range(len(params['blocks'])):
        q_w, k_w, v_w = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1
        )
        gpt.trf_blocks[b].att.W_query.weight = assign(
            gpt.trf_blocks[b].att.W_query.weight, q_w.T
        )
        gpt.trf_blocks[b].att.W_key.weight = assign(
            gpt.trf_blocks[b].att.W_key.weight, k_w.T
        )
        gpt.trf_blocks[b].att.W_value.weight = assign(
            gpt.trf_blocks[b].att.W_value.weight, v_w.T
        )

        q_b, k_b, v_b = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1
        )
        gpt.trf_blocks[b].att.W_query.bias = assign(
            gpt.trf_blocks[b].att.W_query.bias, q_b
        )
        gpt.trf_blocks[b].att.W_key.bias = assign(
            gpt.trf_blocks[b].att.W_key.bias, k_b
        )
        gpt.trf_blocks[b].att.W_value.bias = assign(
            gpt.trf_blocks[b].att.W_value.bias, v_b
        )

        gpt.trf_blocks[b].att.out_proj.weight = assign(
            gpt.trf_blocks[b].att.out_proj.weight,
            params["blocks"][b]["attn"]["c_proj"]["w"].T
        )
        gpt.trf_blocks[b].att.out_proj.bias = assign(
            gpt.trf_blocks[b].att.out_proj.bias,
            params["blocks"][b]["attn"]["c_proj"]["b"]
        )

        gpt.trf_blocks[b].ff.layers[0].weight = assign(
            gpt.trf_blocks[b].ff.layers[0].weight,
            params["blocks"][b]["mlp"]["c_fc"]["w"].T
        )
        gpt.trf_blocks[b].ff.layers[0].bias = assign(
            gpt.trf_blocks[b].ff.layers[0].bias,
            params["blocks"][b]["mlp"]["c_fc"]["b"]
        )

        gpt.trf_blocks[b].ff.layers[2].weight = assign(
            gpt.trf_blocks[b].ff.layers[2].weight,
            params["blocks"][b]["mlp"]["c_proj"]["w"].T
        )
        gpt.trf_blocks[b].ff.layers[2].bias = assign(
            gpt.trf_blocks[b].ff.layers[2].bias,
            params["blocks"][b]["mlp"]["c_proj"]["b"]
        )

        gpt.trf_blocks[b].norm1.scale = assign(
            gpt.trf_blocks[b].norm1.scale,
            params["blocks"][b]["ln_1"]["g"]
        )
        gpt.trf_blocks[b].norm1.shift = assign(
            gpt.trf_blocks[b].norm1.shift,
            params["blocks"][b]["ln_1"]["b"]
        )
        gpt.trf_blocks[b].norm2.scale = assign(
            gpt.trf_blocks[b].norm2.scale,
            params["blocks"][b]["ln_2"]["g"]
        )
        gpt.trf_blocks[b].norm2.shift = assign(
            gpt.trf_blocks[b].norm2.shift,
            params["blocks"][b]["ln_2"]["b"]
        )

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])



In [164]:
load_weights_into_gpt(gpt, params)

In [165]:
gpt.to(device)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [166]:
token_ids = generate(
    model=gpt,
    idx=text_to_token_ids("How are you?", tokenizer).to(device),
    max_new_tokens=50,
    context_size=NEW_CONFIG["context_length"],
    top_k=10,
    temperature=1.3
)
token_ids_to_text(token_ids, tokenizer)

"How are you?\n\nWhat's your life like today? What are the challenges of doing your work on your own? Are your children happy? What would a good teacher do to help you become a better person?\n\nI hope you find this useful and if"

In [95]:
import pandas as pd

In [171]:
df = pd.read_csv('/content/owasp_log_maturity_dataset_2000_realistic.csv')

In [172]:
df

Unnamed: 0,input,label
0,[ERROR] 2025-10-16 20:58:58 AuthService - Toke...,0
1,[INFO] 2025-10-16 20:58:58 LoginController - R...,1
2,[AUDIT] 2025-10-16 20:58:58 AuthService - Unau...,0
3,[ERROR] 2025-10-16 20:58:58 IAM - Password upd...,1
4,[ALERT] 2025-10-16 20:58:58 IAM - Session expi...,0
...,...,...
1995,[SECURITY] 2025-10-16 20:58:58 IAM - Token inv...,0
1996,[DEBUG] 2025-10-16 20:58:58 LoginController - ...,0
1997,[ALERT] 2025-10-16 20:58:58 AuthService - Logi...,1
1998,[WARN] 2025-10-16 20:58:58 AuthService - Unkno...,0


In [173]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,1000
1,1000


In [100]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [101]:
df

Unnamed: 0,input,label
0,[AUDIT] 2025-10-16 19:03:39 Gateway - Invalid ...,0
1,[DEBUG] 2025-10-16 19:03:39 AuthService - Role...,1
2,[DEBUG] 2025-10-16 19:03:39 SecurityModule - U...,0
3,[SECURITY] 2025-10-16 19:03:39 IAM - API token...,1
4,[WARN] 2025-10-16 19:03:39 UserAPI - Login failed,0
...,...,...
1995,[ALERT] 2025-10-16 19:03:39 IAM - Password cha...,0
1996,[ALERT] 2025-10-16 19:03:39 Gateway - Login fa...,0
1997,[INFO] 2025-10-16 19:03:39 Gateway - Login fai...,1
1998,[ERROR] 2025-10-16 19:03:39 Gateway - Login su...,0


In [102]:
def random_split(df, train_frac, validation_frac):

  df = df.sample(
      frac=1, random_state=42
  ).reset_index(drop=True)

  train_end = int(len(df) * train_frac)
  val_end = train_end + int(len(df) * validation_frac)

  train_df = df[:train_end]
  val_df = df[train_end: val_end]
  test_df = df[val_end:]

  return train_df, val_df, test_df

In [174]:
train_df, validation_df, test_df = random_split(df, 0.7, 0.15)

In [175]:
train_df

Unnamed: 0,input,label
0,[INFO] 2025-10-16 20:58:58 UserAPI - Role chan...,0
1,[SECURITY] 2025-10-16 20:58:58 IAM - Role chan...,0
2,[WARN] 2025-10-16 20:58:58 LoginController - A...,1
3,[DEBUG] 2025-10-16 20:58:58 IAM - Permission d...,0
4,[SECURITY] 2025-10-16 20:58:58 SecurityModule ...,1
...,...,...
1395,[ERROR] 2025-10-16 20:58:58 UserAPI - Unauthor...,1
1396,[INFO] 2025-10-16 20:58:58 AuthService - API t...,1
1397,[WARN] 2025-10-16 20:58:58 UserAPI - Unauthori...,0
1398,[ERROR] 2025-10-16 20:58:58 UserAPI - Invalid ...,0


In [105]:
validation_df

Unnamed: 0,input,label
1400,[SECURITY] 2025-10-16 19:03:39 SecurityModule ...,0
1401,[ERROR] 2025-10-16 19:03:39 AuthService - Perm...,0
1402,[ALERT] 2025-10-16 19:03:39 Gateway - Login su...,0
1403,[SECURITY] 2025-10-16 19:03:39 UserAPI - Acces...,0
1404,[ERROR] 2025-10-16 19:03:39 SecurityModule - U...,0
...,...,...
1695,[ERROR] 2025-10-16 19:03:39 Gateway - Token de...,1
1696,[SECURITY] 2025-10-16 19:03:39 Gateway - Unkno...,0
1697,[ALERT] 2025-10-16 19:03:39 Gateway - User 'ch...,1
1698,[INFO] 2025-10-16 19:03:39 Gateway - Invalid auth,0


In [176]:
train_df.to_csv('train.csv', index=False)
validation_df.to_csv('validation.csv', index=False)
test_df.to_csv('test.csv', index=False)

In [107]:
import torch
from torch.utils.data import Dataset

In [108]:
class LogDataSet(Dataset):

  def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):

    self.data = pd.read_csv(csv_file)

    self.encoded_texts = [
        tokenizer.encode(text) for text in self.data['input']
    ]

    if max_length is None:
      self.max_length = max(len(encoded_text) for encoded_text in self.encoded_texts)
    else:
      self.max_length = max_length

    self.encoded_texts = [
        encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
        for encoded_text in self.encoded_texts
    ]

  def __getitem__(self, index):

    encoded = self.encoded_texts[index]
    label = self.data.iloc[index]['label']
    return (
        torch.tensor(encoded, dtype=torch.long),
        torch.tensor(label, dtype=torch.long)
    )

  def __len__(self):
    return len(self.data)

In [177]:
train_dataset = LogDataSet('train.csv', tokenizer, max_length=None)
validation_dataset = LogDataSet('validation.csv', tokenizer, max_length=None)
test_dataset = LogDataSet('test.csv', tokenizer, max_length=None)

In [178]:
len(train_dataset)

1400

In [179]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8
torch.manual_seed(123)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, drop_last=True)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, drop_last=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, drop_last=False)

In [180]:
for input_val, target_val in train_loader:
  pass
print('Input batch dim: ', input_val.shape)
print('Label batch dim: ', target_val.shape)

Input batch dim:  torch.Size([8, 147])
Label batch dim:  torch.Size([8])


In [114]:
len(train_loader)

175

In [115]:
text = (
    "Is the following log owasp complient ? Answer with Yes and No:"
    "'[DEBUG] 2025-10-16 19:03:39 UserAPI - Unauthorized access attempt by userID=629 to resource /admin/config'"
)

In [124]:
token_ids = generate(
    model=gpt,
    idx=text_to_token_ids(text, tokenizer).to(device),
    max_new_tokens=50,
    context_size=NEW_CONFIG["context_length"],
    top_k=1,
    temperature=0.2
)

In [125]:
print(token_ids_to_text(token_ids, tokenizer))

Is the following log owasp complient ? Answer with Yes and No:'[DEBUG] 2025-10-16 19:03:39 UserAPI - Unauthorized access attempt by userID=629 to resource /admin/config' [DEBUG] 2025-10-16 19:03:39 UserAPI - Unauthorized access attempt by userID=629 to resource /admin/config' [DEBUG] 2025-10-16 19:03:39 UserAPI - Unauthorized


In [167]:
for param in gpt.parameters():
  param.requires_grad = False

In [168]:
torch.manual_seed(123)
num_classes = 2

gpt.out_head = torch.nn.Linear(NEW_CONFIG['emb_dim'], num_classes)

In [128]:
NEW_CONFIG

{'vocab_size': 50257,
 'context_length': 1024,
 'emb_dim': 768,
 'n_heads': 12,
 'n_layers': 12,
 'drop_rate': 0.1,
 'qkv_bias': True}

In [169]:
for param in gpt.trf_blocks[-1].parameters():
  param.requires_grad = True

for param in gpt.final_norm.parameters():
  param.requires_grad = True

In [156]:
inputs = tokenizer.encode("Do you have time ?")
inputs = torch.tensor(inputs, dtype=torch.long).unsqueeze(0)

In [157]:
with torch.no_grad():
  outputs = gpt(inputs)

In [158]:
outputs.shape

torch.Size([1, 5, 2])

In [159]:
outputs[:, -1, :]

tensor([[0.0627, 0.8014]])

In [160]:
probas = torch.softmax(outputs[:, -1, :], dim=-1)
label = torch.argmax(probas)

In [161]:
label.item()

1

In [136]:
def cal_accuracy_loader(data_loader, model, device, num_batches=None):
  model.eval()
  correct_predictions, num_examples = 0, 0

  if num_batches is None:
    num_batches = len(data_loader)
  else:
    num_batches = min(num_batches, len(data_loader))

  for i, (input_batch, target_batch) in enumerate(data_loader):
    if i >= num_batches:
      break

    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)

    with torch.no_grad():
      logits = model(input_batch)[:, -1, :]
    predicted_labels = torch.argmax(logits, dim=-1)

    num_examples += predicted_labels.shape[0]
    correct_predictions += (predicted_labels == target_batch).sum().item()

  accuracy = correct_predictions / num_examples
  return accuracy

In [181]:
torch.manual_seed(123)

train_accuracy = cal_accuracy_loader(train_loader, gpt, device, num_batches=10)
validation_accuracy = cal_accuracy_loader(validation_loader, gpt, device, num_batches=10)
test_accuracy = cal_accuracy_loader(test_loader, gpt, device, num_batches=10)

print(f'Train Accuracy: {train_accuracy * 100:.2f}')
print(f'Validation Accuracy: {validation_accuracy * 100:.2f}')
print(f'Test Accuracy: {test_accuracy * 100:.2f}')

Train Accuracy: 41.25
Validation Accuracy: 42.50
Test Accuracy: 51.25


In [139]:
def cal_loss_batch(input_batch, target_batch, model, device):
  input_batch = input_batch.to(device)
  target_batch = target_batch.to(device)
  logits = model(input_batch)[:, -1, :]
  loss = torch.nn.functional.cross_entropy(logits, target_batch)
  return loss

In [140]:
def cal_loss_loader(data_loader, model, device, num_batches=None):
  total_loss = 0

  if len(data_loader) == 0:
    return float('nan')

  elif num_batches is None:
    num_batches = len(data_loader)
  else:
    num_batches = min(num_batches, len(data_loader))

  for i, (input_batch, target_batch) in enumerate(data_loader):
    if i >= num_batches:
      break

    loss = cal_loss_batch(input_batch, target_batch, model, device)
    total_loss += loss.item()

  return total_loss / num_batches

In [149]:
def train_classifier_simple(model, optimizer, train_loader,
                            validation_loader, num_epochs, device, eval_freq, eval_iter):

  train_losses, val_losses, train_accs, val_accs = [], [], [], []

  example_seen, global_step = 0, -1

  for epoch in range(num_epochs):
    print(f"Startig Epoch : {epoch}")
    model.train()

    for input_batch, target_batch in train_loader:
      optimizer.zero_grad()
      loss = cal_loss_batch(input_batch, target_batch, model, device)
      loss.backward()
      optimizer.step()
      example_seen += input_batch.shape[0]
      global_step += 1

      if global_step % eval_freq == 0:
        train_loss, val_loss = evaluate_model(
            model, train_loader, validation_loader, device, eval_iter
        )
        train_losses.append(train_loss)
        val_losses.append(val_loss)

        print(f"Epoch {epoch + 1} (Step {global_step:6d})")
        print(f"Train Loss: {train_loss:.4f} | Validation Loss: {val_loss:.4f}")
    train_accuracy = cal_accuracy_loader(train_loader, model, device, num_batches=eval_iter)
    val_accuracy = cal_accuracy_loader(validation_loader, model, device, num_batches=eval_iter)

    print(f"Train Accuracy: {train_accuracy * 100:.2f}", end=' ')
    print(f"Validation Accuracy: {val_accuracy * 100:.2f}")

    train_accs.append(train_accuracy)
    val_accs.append(val_accuracy)
  return train_losses, val_losses, train_accs, val_accs, example_seen

In [170]:
def evaluate_model(model, train_loader, validation_loader, device, eval_iter):
  model.eval()

  with torch.no_grad():
    train_loss = cal_loss_loader(train_loader, model, device, num_batches=eval_iter)
    val_loss = cal_loss_loader(validation_loader, model, device, num_batches=eval_iter)

  model.train()
  return train_loss, val_loss

In [182]:
import time

start_time = time.time()
torch.manual_seed(123)

optimizer = torch.optim.AdamW(gpt.parameters(), lr=1e-3, weight_decay=0.1)
num_epochs = 1

train_losses, val_losses, train_accs, val_accs, example_seen = train_classifier_simple(
    gpt, optimizer, train_loader, validation_loader, num_epochs, device, eval_freq=50, eval_iter=5
)

end_time = time.time()
execution_time = (end_time - start_time) / 60
print(f"Execution time: {execution_time:.2f} minutes")

Startig Epoch : 0
Epoch 1 (Step      0)
Train Loss: 1.5416 | Validation Loss: 1.4197
Epoch 1 (Step     50)
Train Loss: 0.1179 | Validation Loss: 0.0562
Epoch 1 (Step    100)
Train Loss: 0.0005 | Validation Loss: 0.0002
Epoch 1 (Step    150)
Train Loss: 0.0000 | Validation Loss: 0.0001
Train Accuracy: 100.00 Validation Accuracy: 100.00
Execution time: 22.49 minutes


In [186]:
def classify_review(text, model, tokenizer, device, max_length=None, pad_token_id=50256):
  model.eval()

  input_ids = tokenizer.encode(text)
  supported_context_length = model.pos_emb.weight.shape[0]

  input_ids = input_ids[:min(max_length, supported_context_length)]

  input_ids = input_ids + [pad_token_id] * (max_length - len(input_ids))
  input_tensor = torch.tensor(input_ids).unsqueeze(0).to(device)

  with torch.no_grad():
    logits = model(input_tensor)[:, -1, :]

  probas = torch.argmax(logits, dim=-1).item()
  return label.item()

In [184]:
text

"Is the following log owasp complient ? Answer with Yes and No:'[DEBUG] 2025-10-16 19:03:39 UserAPI - Unauthorized access attempt by userID=629 to resource /admin/config'"

In [187]:
classify_review(text, gpt, tokenizer, device, max_length=train_dataset.max_length)

1

In [188]:
classify_review("How are you?", gpt, tokenizer, device, max_length=train_dataset.max_length)

1

In [189]:
test_accuracy = cal_accuracy_loader(test_loader, gpt, device)
print(f'Test Accuracy: {test_accuracy * 100:.2f}')

Test Accuracy: 100.00
