<a href="https://colab.research.google.com/github/kiran3429/LLM_based-on_gpt_2/blob/main/llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn

In [None]:
# Hyper-parameters read by every model component in this notebook.
config = dict(
    vacbsize=50257,   # vocabulary size (rows of the token-embedding table)
    contlen=1024,     # context length; the outputs recorded further down were produced with 256
    embdim=768,       # embedding width
    nohead=12,        # attention heads per transformer block
    nolayers=12,      # number of transformer blocks
    droprate=0.1,     # dropout probability
    qkv_bias=False,   # whether the query/key/value projections carry a bias
)

In [None]:
class Layernorm(nn.Module):
  """Layer normalisation over the last dimension with a learnable scale and shift.

  Args:
    embdim: size of the last dimension of the tensors passed to ``forward``.
  """
  def __init__(self, embdim):
    super().__init__()
    # Small constant added to the variance so the division never hits zero.
    self.eps = 1e-5
    self.scale = nn.Parameter(torch.ones(embdim))
    self.shift = nn.Parameter(torch.zeros(embdim))

  def forward(self, x):
    """Normalise ``x`` along its last dimension, then apply scale and shift."""
    mu = x.mean(dim=-1, keepdim=True)
    # Biased (population) variance: divide by N, not N-1.
    sigma2 = x.var(dim=-1, keepdim=True, unbiased=False)
    centred = x - mu
    normalised = centred / torch.sqrt(sigma2 + self.eps)
    return self.scale * normalised + self.shift


In [None]:
class gelu(nn.Module):
  """GELU activation computed with the tanh approximation."""
  def __init__(self):
    super().__init__()

  def forward(self, x):
    """Apply the activation element-wise to ``x``."""
    coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
    inner = coeff * (x + 0.044715 * torch.pow(x, 3))
    return 0.5 * x * (1 + torch.tanh(inner))

In [None]:
class forwar(nn.Module):
  """Feed-forward sub-layer: widen to 4x the embedding width, apply gelu, project back.

  Args:
    cfg: configuration mapping; only ``cfg["embdim"]`` is read.
  """
  def __init__(self, cfg):
    super().__init__()
    width = cfg["embdim"]
    hidden = 4 * width
    self.layers = nn.Sequential(
        nn.Linear(width, hidden),
        gelu(),
        nn.Linear(hidden, width),
    )

  def forward(self, x):
    """Run ``x`` through the two linear layers and the activation between them."""
    return self.layers(x)


In [None]:
class multiheadattention(nn.Module):
  """Causal multi-head self-attention.

  Args:
    din: size of the last dimension of the input.
    dout: total output width; it is split evenly across the heads.
    conlen: longest sequence supported; sizes the causal mask buffer.
    nofhead: number of attention heads; must divide ``dout``.
    dprate: dropout probability applied to the attention weights.
    qkv_bias: whether the query/key/value projections carry a bias term.

  Raises:
    ValueError: from the constructor when ``dout`` is not divisible by
      ``nofhead``; from ``forward`` when the input has more tokens than
      the mask was built for.
  """
  def __init__(self, din, dout, conlen, nofhead, dprate, qkv_bias=False):
    super().__init__()
    # Without this check the per-head reshape in forward() fails with an
    # unrelated-looking "invalid shape" error.
    if dout % nofhead != 0:
      raise ValueError(
          f"dout ({dout}) must be divisible by nofhead ({nofhead})")
    self.dout = dout
    self.nufhead = nofhead
    self.head_dim = dout // nofhead
    self.W_query = nn.Linear(din, dout, bias=qkv_bias)
    self.W_key = nn.Linear(din, dout, bias=qkv_bias)
    self.W_value = nn.Linear(din, dout, bias=qkv_bias)
    self.out_proj = nn.Linear(dout, dout)
    self.dropout = nn.Dropout(dprate)
    # Ones strictly above the diagonal mark the "future" positions to hide.
    self.register_buffer("mask", torch.triu(torch.ones(conlen, conlen), diagonal=1))

  def forward(self, x):
    """Return the attention output for ``x`` of shape (batch, tokens, din)."""
    b, numtoken, din = x.shape
    if numtoken > self.mask.shape[0]:
      raise ValueError(
          f"sequence length {numtoken} exceeds the supported context length "
          f"{self.mask.shape[0]}")

    # Project, then split the last dimension into heads:
    # (b, tokens, dout) -> (b, heads, tokens, head_dim)
    query = self.W_query(x).view(b, numtoken, self.nufhead, self.head_dim).transpose(1, 2)
    key = self.W_key(x).view(b, numtoken, self.nufhead, self.head_dim).transpose(1, 2)
    value = self.W_value(x).view(b, numtoken, self.nufhead, self.head_dim).transpose(1, 2)

    attn_scores = query @ key.transpose(2, 3)
    # Hide future positions before the softmax so they receive zero weight.
    mask_bool = self.mask.bool()[:numtoken, :numtoken]
    attn_scores = attn_scores.masked_fill(mask_bool, -torch.inf)
    attn_weights = torch.softmax(attn_scores / self.head_dim**0.5, dim=-1)
    attn_weights = self.dropout(attn_weights)

    # Merge the heads back: (b, heads, tokens, head_dim) -> (b, tokens, dout)
    context_vec = (attn_weights @ value).transpose(1, 2)
    context_vec = context_vec.contiguous().view(b, numtoken, self.dout)
    return self.out_proj(context_vec)



In [None]:
class transformer(nn.Module):
  """One transformer block: attention and feed-forward sub-layers, each
  preceded by a layer norm and wrapped in a residual connection.

  Args:
    cfg: configuration mapping; reads ``embdim``, ``contlen``, ``nohead``,
      ``droprate`` and ``qkv_bias``.
  """
  def __init__(self, cfg):
    super().__init__()
    width = cfg["embdim"]
    self.att = multiheadattention(
        din=width,
        dout=width,
        conlen=cfg["contlen"],
        nofhead=cfg["nohead"],
        dprate=cfg["droprate"],
        qkv_bias=cfg["qkv_bias"],
    )
    self.ff = forwar(cfg)
    self.norm1 = Layernorm(width)
    self.norm2 = Layernorm(width)
    self.drop = nn.Dropout(cfg["droprate"])

  def forward(self, x):
    """Apply both sub-layers to ``x`` and return a tensor of the same shape."""
    # Attention sub-layer; the right-hand side still sees the un-normalised x.
    x = self.drop(self.att(self.norm1(x))) + x
    # Feed-forward sub-layer, same pattern.
    x = self.drop(self.ff(self.norm2(x))) + x
    return x






In [None]:
class kirllm(nn.Module):
  """Language model: token plus learned positional embeddings, a stack of
  transformer blocks, a final layer norm and a linear head producing
  one logit per vocabulary entry.

  Args:
    cfg: configuration mapping; reads ``vacbsize``, ``embdim``, ``contlen``,
      ``droprate`` and ``nolayers`` (plus whatever ``transformer`` reads).
  """
  def __init__(self, cfg):
    super().__init__()
    vocab, width = cfg["vacbsize"], cfg["embdim"]
    self.tokemd = nn.Embedding(vocab, width)
    self.posemd = nn.Embedding(cfg["contlen"], width)
    self.drop = nn.Dropout(cfg["droprate"])
    blocks = []
    for _ in range(cfg["nolayers"]):
      blocks.append(transformer(cfg))
    self.tf = nn.Sequential(*blocks)
    self.norm = Layernorm(width)
    self.head = nn.Linear(width, vocab, bias=False)

  def forward(self, indx):
    """Map token ids of shape (batch, tokens) to logits of shape (batch, tokens, vocab)."""
    # Unpacking also enforces that the input is exactly 2-D.
    _, sqlen = indx.shape
    positions = torch.arange(sqlen, device=indx.device)
    x = self.drop(self.tokemd(indx) + self.posemd(positions))
    return self.head(self.norm(self.tf(x)))










In [None]:
! pip install tiktoken


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/commands/install.py", line 362, in run
    resolver = self.make_resolver(
               ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/req_command.py", line 177, in make_resolver
    return pip._internal.resolution.resolvelib.resolver.Resolver(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/resolution/resolvelib/resolver.py", line 58, in __init__
    self.factory = Factory(
                   ^^^^^^^^
  File "/usr/local/lib/py

In [None]:
import tiktoken

# GPT-2 byte-pair tokenizer used throughout the notebook.
tknizer = tiktoken.get_encoding("gpt2")

txt1 = "Every effort moves you"
txt2 = "Every day holds a"
txt3 = "cool is bad for"

# Encode each prompt and stack the results into one tensor;
# torch.stack requires every prompt to encode to the same number of tokens.
batch = torch.stack(
    [torch.tensor(tknizer.encode(text)) for text in (txt1, txt2, txt3)],
    dim=0,
)
print(batch)

tensor([[ 6109,  3626,  6100,   345],
        [ 6109,  1110,  6622,   257],
        [24494,   318,  2089,   329]])


In [None]:
# Build a model from `config` under a fixed seed and run the sample batch through it.
torch.manual_seed(1337)
m=kirllm(config)
logits=m(batch)
# Print the shape of the returned logits tensor.
print(logits.shape)

torch.Size([3, 4, 50257])


In [None]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Extend ``idx`` greedily, one token per step.

    Args:
        model: callable mapping (batch, n_tokens) token ids to
            (batch, n_tokens, vocab_size) logits.
        idx: (batch, n_tokens) tensor of token ids to continue.
        max_new_tokens: number of tokens to append.
        context_size: only the last ``context_size`` tokens are fed to the model.

    Returns:
        Tensor of shape (batch, n_tokens + max_new_tokens).
    """
    sequence = idx
    for _step in range(max_new_tokens):
        # Feed the model at most `context_size` trailing tokens.
        window = sequence[:, -context_size:]

        with torch.no_grad():
            all_logits = model(window)

        # Only the prediction for the final position is needed.
        last_logits = all_logits[:, -1, :]
        probas = torch.softmax(last_logits, dim=-1)

        # Greedy choice: the most probable vocabulary entry, kept as (batch, 1).
        next_token = torch.argmax(probas, dim=-1, keepdim=True)
        sequence = torch.cat((sequence, next_token), dim=1)

    return sequence

In [None]:
# Encode a short prompt with the tokenizer created earlier and wrap it as a tensor.
start_context = "Hello, I am"
encoded = tknizer.encode(start_context)
print("encoded:", encoded)
encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add a leading batch dimension
print("encoded_tensor.shape:", encoded_tensor.shape)

encoded: [15496, 11, 314, 716]
encoded_tensor.shape: torch.Size([1, 4])


In [None]:
# Create the untrained model under a fixed seed and show its raw logits for the sample batch.
torch.manual_seed(123)
model = kirllm(config)
out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)
print(out)

Input batch:
 tensor([[ 6109,  3626,  6100,   345],
        [ 6109,  1110,  6622,   257],
        [24494,   318,  2089,   329]])

Output shape: torch.Size([3, 4, 50257])
tensor([[[ 0.2551,  0.6409, -0.5117,  ...,  0.7818,  0.8821, -0.2491],
         [ 0.1425, -0.5497, -0.4706,  ..., -0.1258,  0.4164, -0.5564],
         [ 0.9396,  0.0197, -0.0166,  ...,  0.0430, -0.5293, -0.2677],
         [-0.9100,  0.3030, -0.0057,  ...,  0.5892,  0.2688, -0.2421]],

        [[ 0.2038,  0.1096, -0.1833,  ...,  0.2755,  0.1482, -0.6731],
         [-0.0055,  0.3732, -0.2718,  ...,  0.7349,  0.1742,  0.4797],
         [ 1.0394,  0.9303, -0.3502,  ...,  0.7602,  0.4402, -0.1752],
         [ 0.2076,  0.3561,  0.5209,  ...,  1.0054, -0.1290,  0.0395]],

        [[-0.5690, -0.4463,  0.3602,  ..., -0.3482,  0.2385, -0.3862],
         [ 1.0988, -0.6857, -0.7154,  ..., -0.1282,  0.3962, -0.4203],
         [ 0.8610,  0.5270,  0.1779,  ..., -0.5269,  0.1734, -0.5291],
         [ 0.5596, -0.5239,  0.1703,  ...,  1

In [None]:
# Greedy generation of 6 tokens from the still-untrained model.
model.eval() # switch to evaluation mode before generating
out = generate_text_simple(
model=model,
idx=encoded_tensor,
max_new_tokens=6,
context_size=config["contlen"]
)
print("Output:", out)
# Prompt tokens plus the newly generated ones.
print("Output length:", len(out[0]))

Output: tensor([[15496,    11,   314,   716, 27018, 24086, 47843, 30961, 42348,  7267]])
Output length: 10


In [None]:
# Drop the batch dimension and turn the token ids back into text.
decoded_text = tknizer.decode(out.squeeze(0).tolist())
print(decoded_text)

Hello, I am Featureiman Byeswickattribute argue


Load the training text

In [None]:
# NOTE(review): `os` and `urllib.request` are imported but not used in this cell.
import os
import urllib.request

# Path under /content/drive/MyDrive; presumably requires Google Drive to be mounted first (TODO confirm).
file_path = "/content/drive/MyDrive/the-verdict.txt"

# Read the whole file into memory as a single string.
with open(file_path, "r", encoding="utf-8") as file:
  text_data = file.read()

In [None]:
# Second handle on the same "gpt2" encoding that `tknizer` was created from earlier.
import tiktoken
tock = tiktoken.get_encoding("gpt2")

In [None]:
# Size of the corpus in characters and in GPT-2 tokens.
total_characters = len(text_data)
total_tokens = len(tock.encode(text_data))

print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 20479
Tokens: 5145


Implement the dataset and data loader that split the text into input/target windows

In [None]:
from torch.utils.data import Dataset, DataLoader
class GPTV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    Each target window is its input window shifted one token to the right.

    Args:
        txt: text to tokenize.
        tokenizer: object whose ``encode(txt, allowed_special=...)`` returns
            a sequence of token ids.
        max_length: number of tokens in every window.
        stride: distance in tokens between the starts of consecutive windows.
    """
    def __init__(self, txt, tokenizer, max_length, stride):
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # A window is only started where a full shifted target still fits.
        starts = range(0, len(token_ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length]) for start in starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1]) for start in starts
        ]

    def __len__(self):
        """Number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th (input, target) pair."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Tokenize ``txt`` with the GPT-2 encoding and wrap it in a DataLoader.

    Args:
        txt: raw text to split into windows.
        batch_size: samples per batch.
        max_length: tokens per window.
        stride: step in tokens between consecutive windows.
        shuffle: passed through to ``DataLoader``.
        drop_last: passed through to ``DataLoader``.
        num_workers: passed through to ``DataLoader``.

    Returns:
        A ``DataLoader`` over a ``GPTV1`` dataset built from ``txt``.
    """
    dataset = GPTV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [None]:
# Share of the text (by characters) that goes to training; the rest is validation.
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data, val_data = text_data[:split_idx], text_data[split_idx:]

torch.manual_seed(123)

# Windows do not overlap: the stride equals the window length.
train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=config["contlen"],
    stride=config["contlen"],
    shuffle=True,
    drop_last=True,
    num_workers=0,
)

# Validation keeps its order and also keeps a final partial batch.
val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=config["contlen"],
    stride=config["contlen"],
    shuffle=False,
    drop_last=False,
    num_workers=0,
)

In [None]:
# Sanity check: warn when a split holds fewer tokens than one context window.
# The messages name the variables this notebook actually defines.

if total_tokens * (train_ratio) < config["contlen"]:
    print("Not enough tokens for the training loader. "
          "Try to lower `config['contlen']` or "
          "increase the `train_ratio`")

if total_tokens * (1-train_ratio) < config["contlen"]:
    print("Not enough tokens for the validation loader. "
          "Try to lower `config['contlen']` or "
          "decrease the `train_ratio`")

In [None]:
# Show the shape of every (input, target) batch each loader yields.
print("Train loader:")
for x, y in train_loader:
    print(x.shape, y.shape)

print("\nValidation loader:")
for x, y in val_loader:
    print(x.shape, y.shape)

# Number of batches per loader.
print(len(train_loader))
print(len(val_loader))


Train loader:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])

Validation loader:
torch.Size([2, 256]) torch.Size([2, 256])
9
1


Check that the data was loaded correctly by counting the tokens in each loader

In [None]:
# Count the input tokens each loader delivers in one full pass.
train_tokens = 0
for input_batch, target_batch in train_loader:
    train_tokens += input_batch.numel()

val_tokens = 0
for input_batch, target_batch in val_loader:
    val_tokens += input_batch.numel()

print("Training tokens:", train_tokens)
print("Validation tokens:", val_tokens)
print("All tokens:", train_tokens + val_tokens)

Training tokens: 4608
Validation tokens: 512
All tokens: 5120


Loss function calculation

In [None]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Cross-entropy loss of ``model`` on one batch.

    Args:
        input_batch: tensor of token ids fed to the model.
        target_batch: tensor of target token ids, one per input position.
        model: callable returning logits with batch and token dimensions first.
        device: device both tensors are moved to before the forward pass.

    Returns:
        Scalar loss tensor.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)
    logits = model(inputs)
    # Merge the batch and token dimensions so every position is one sample.
    return torch.nn.functional.cross_entropy(logits.flatten(0, 1), targets.flatten())


def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Mean per-batch loss over the first ``num_batches`` batches of a loader.

    Args:
        data_loader: loader yielding (input_batch, target_batch) pairs.
        model: model passed on to ``calc_loss_batch``.
        device: device passed on to ``calc_loss_batch``.
        num_batches: how many batches to average; ``None`` means all, and
            larger values are capped at the loader's length.

    Returns:
        The average loss as a float, or NaN when the loader has no batches.
    """
    available = len(data_loader)
    if available == 0:
        return float("nan")

    # Never ask for more batches than the loader can deliver.
    num_batches = available if num_batches is None else min(num_batches, available)

    total_loss = 0.
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        total_loss += calc_loss_batch(input_batch, target_batch, model, device).item()
    return total_loss / num_batches

In [None]:
# Baseline losses of the current `model` on both loaders before any training.
torch.manual_seed(123) # For reproducibility due to the shuffling in the data loader
device = torch.device("cpu")
with torch.no_grad(): # Disable gradient tracking for efficiency because we are not training, yet
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 10.987583372328016
Validation loss: 10.98110580444336


This function returns the training loss and the validation loss

In [None]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Compute training and validation loss over at most ``eval_iter`` batches each.

    The model is put in evaluation mode for the measurement and switched
    back to training mode before returning.

    Returns:
        Tuple ``(train_loss, val_loss)``.
    """
    model.eval()
    with torch.no_grad():
        losses = [
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        ]
    model.train()
    return losses[0], losses[1]

This function generates and prints a sample text; the training loop calls it after every epoch

In [None]:
def generate_and_print_sample(model, tokenizer, device, start_context):
    """Print 50 greedily generated tokens continuing ``start_context``.

    Args:
        model: model exposing ``posemd``; the number of rows of its weight is
            used as the context size.
        tokenizer: object providing ``encode`` and ``decode``.
        device: device the prompt tensor is moved to.
        start_context: prompt text.

    The model is left in training mode afterwards.
    """
    model.eval()
    context_size = model.posemd.weight.shape[0]
    # Token ids of the prompt as a (1, n_tokens) tensor on the target device.
    prompt_ids = torch.tensor(tokenizer.encode(start_context)).unsqueeze(0).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model,
            idx=prompt_ids,
            max_new_tokens=50,
            context_size=context_size,
        )
    sample = tokenizer.decode(token_ids.squeeze(0).tolist())
    # Newlines are flattened so the sample stays on a single log line.
    print(sample.replace("\n", " "))
    model.train()

In [None]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    """Train ``model`` for ``num_epochs`` epochs with periodic evaluation.

    Args:
        model: model to optimise.
        train_loader: loader yielding (input_batch, target_batch) pairs.
        val_loader: loader used only for evaluation.
        optimizer: optimizer stepping the model's parameters.
        device: device passed on to the loss helpers.
        num_epochs: number of passes over ``train_loader``.
        eval_freq: evaluate every ``eval_freq`` optimisation steps.
        eval_iter: maximum number of batches used per evaluation.
        start_context: prompt for the sample printed after each epoch.
        tokenizer: tokenizer used for that sample.

    Returns:
        Three lists with one entry per evaluation: training losses,
        validation losses, and the number of tokens seen so far.
    """
    history_train, history_val, history_tokens = [], [], []
    tokens_seen = 0
    # Starts at -1 so the first optimisation step is step 0.
    global_step = -1

    for epoch in range(num_epochs):
        model.train()

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
            batch_loss.backward()
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step += 1

            # Skip evaluation on steps that are not a multiple of eval_freq.
            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter)
            history_train.append(train_loss)
            history_val.append(val_loss)
            history_tokens.append(tokens_seen)
            print(f"Ep {epoch+1} (Step {global_step:06d}): "
                  f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # One generated sample per epoch.
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

    return history_train, history_val, history_tokens

In [None]:
# Train a freshly initialised model for 10 epochs and time the run.
import time
start_time = time.time()

torch.manual_seed(123)
model = kirllm(config)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="Every effort moves you", tokenizer=tknizer
)

# Report the wall-clock time of the whole cell in minutes.
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 9.781, Val loss 9.933
Ep 1 (Step 000005): Train loss 8.111, Val loss 8.339
Every effort moves you,,,,,,,,,,,,.                                     
Ep 2 (Step 000010): Train loss 6.661, Val loss 7.048
Ep 2 (Step 000015): Train loss 5.961, Val loss 6.616
Every effort moves you, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and,, and, and,
Ep 3 (Step 000020): Train loss 5.726, Val loss 6.600
Ep 3 (Step 000025): Train loss 5.201, Val loss 6.348
Every effort moves you, and I had been.                                            
Ep 4 (Step 000030): Train loss 4.417, Val loss 6.278
Ep 4 (Step 000035): Train loss 4.069, Val loss 6.226
Every effort moves you know the                          "I he had the donkey and I had the and I had the donkey and down the room, I had
Ep 5 (Step 000040): Train loss 3.732, Val loss 6.160
Every effort moves you know it was not that the picture--I had the fact by the last

In [None]:
# Switch the trained model to evaluation mode; as the cell's last expression the module is also displayed.
model.eval()

kirllm(
  (tokemd): Embedding(50257, 768)
  (posemd): Embedding(256, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (tf): Sequential(
    (0): transformer(
      (att): multiheadattention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): forwar(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): gelu()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): Layernorm()
      (norm2): Layernorm()
      (drop): Dropout(p=0.1, inplace=False)
    )
    (1): transformer(
      (att): multiheadattention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
    

In [None]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    """Generate tokens with optional top-k filtering and temperature sampling.

    Args:
        model: callable mapping (batch, n_tokens) token ids to
            (batch, n_tokens, vocab_size) logits.
        idx: (batch, n_tokens) tensor of token ids to continue.
        max_new_tokens: upper bound on the number of appended tokens.
        context_size: only the last ``context_size`` tokens are fed to the model.
        temperature: values above 0 divide the logits and enable sampling;
            0 selects the highest-scoring token instead.
        top_k: when given, only the ``top_k`` highest logits of each row stay eligible.
        eos_id: when given, generation stops as soon as every sequence in the
            batch produces this token in the same step; that token is not appended.

    Returns:
        Tensor of shape (batch, n_tokens + generated), generated <= max_new_tokens.
    """
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        # Only the prediction for the last position matters.
        logits = logits[:, -1, :]

        if top_k is not None:
            top_logits, _ = torch.topk(logits, top_k)
            # Keep the trailing dimension, (batch, 1), so each row is compared
            # with its own threshold; a (batch,) threshold does not broadcast
            # against (batch, vocab) when batch > 1.
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        if temperature > 0.0:
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)  # (batch, vocab_size)
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch, 1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch, 1)

        # Reduce to a single bool: a multi-element comparison cannot be used
        # directly as an `if` condition.
        if eos_id is not None and bool((idx_next == eos_id).all()):
            break

        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens + 1)

    return idx

In [None]:
# Sample 15 tokens from `model` with top-k filtering and temperature scaling.
torch.manual_seed(123)

token_ids = generate(
    model=model,
    idx=torch.tensor(tknizer.encode("Every effort moves you")).unsqueeze(0), # Encode the text and convert to tensor
    max_new_tokens=15,
    context_size=config["contlen"],
    top_k=25,
    temperature=1.4
)

print("Output text:\n", tknizer.decode(token_ids.squeeze(0).tolist())) # Decode the generated token IDs

Output text:
 Every effort moves you stand to work on surprise, a one of us had gone with random-


In [None]:
# NOTE(review): this rebinds `model` to a freshly initialised network, so any trained
# weights held under that name are no longer reachable — confirm this is intended
# before the checkpoint is written.
model = kirllm(config)

NameError: name 'kirllm' is not defined

In [None]:
import torch
# NOTE(review): a new optimizer is created right before saving, so the optimizer
# state written below belongs to this new instance, not to one used for training — confirm.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

# Write model and optimizer state dicts into one checkpoint file.
torch.save({
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    },
    "/content/drive/MyDrive/model.pth"
)

NameError: name 'model' is not defined

In [None]:
import torch
# map_location="cpu" lets a checkpoint written on a GPU runtime load on a CPU-only one;
# the tensors are copied into the model's own parameters by load_state_dict below.
checkpoint = torch.load("/content/drive/MyDrive/model.pth", map_location="cpu")
model = kirllm(config)
model.load_state_dict(checkpoint["model_state_dict"])
# NOTE(review): lr here (5e-4) differs from the 4e-4 used when the checkpoint was saved;
# loading the optimizer state is expected to restore the saved setting — confirm.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
# Back to training mode so training can resume.
model.train();

In [None]:
pip install tensorflow>=2.15.0 tqdm>=4.66

In [None]:
# Confirm both packages import and report the installed versions.
import tensorflow as tf
import tqdm

print("TensorFlow version:", tf.__version__)
print("tqdm version:", tqdm.__version__)

TensorFlow version: 2.19.0
tqdm version: 4.67.1


In [None]:
%%writefile gpt_download3.py
import os
import requests
import json
import numpy as np
import tensorflow as tf
from tqdm import tqdm

def download_file(url, destination):
    """Stream ``url`` to ``destination`` with a progress bar.

    The download is skipped when ``destination`` already exists with the
    size announced in the response's content-length header. Network and
    HTTP errors are reported on stdout instead of being raised.

    Args:
        url: address of the file to fetch.
        destination: local path to write to.
    """
    try:
        # TLS verification stays at the library default (enabled). The context
        # manager releases the streamed connection on every exit path, and the
        # timeout keeps a stalled server from hanging the cell.
        with requests.get(url, stream=True, timeout=30) as response:
            # Fail on 4xx/5xx instead of saving an error page as the model file.
            response.raise_for_status()
            file_size = int(response.headers.get("content-length", 0))
            if os.path.exists(destination) and os.path.getsize(destination) == file_size:
                print(f"✅ File already exists: {destination}")
                return
            with tqdm(total=file_size, unit="iB", unit_scale=True, desc=os.path.basename(url)) as pbar:
                with open(destination, "wb") as f:
                    for chunk in response.iter_content(1024):
                        pbar.update(len(chunk))
                        f.write(chunk)
    except requests.exceptions.RequestException as e:
        print(f"⚠️ Error downloading {url}: {e}")

def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
    """Read every variable of a TensorFlow checkpoint into nested dicts.

    Variable names are split on "/" and the first component is dropped.
    Variables whose next component starts with "h" go into
    ``params["blocks"][<number after the h>]``; all others go into
    ``params`` itself. The remaining components become nested keys.

    Args:
        ckpt_path: checkpoint path understood by ``tf.train``.
        settings: mapping whose ``"n_layer"`` entry sizes the ``"blocks"`` list.

    Returns:
        Dict with a ``"blocks"`` list plus the top-level arrays.
    """
    params = {"blocks": [{} for _ in range(settings["n_layer"])]}

    for name, _ in tf.train.list_variables(ckpt_path):
        # Squeeze removes size-1 dimensions from the stored array.
        array = np.squeeze(tf.train.load_variable(ckpt_path, name))
        parts = name.split("/")[1:]

        first = parts[0]
        if first.startswith("h"):
            node = params["blocks"][int(first[1:])]
        else:
            node = params

        # Walk (and create) the intermediate dicts, then store under the last part.
        for key in parts[1:-1]:
            node = node.setdefault(key, {})
        node[parts[-1]] = array

    return params

def download_and_load_gpt2(model_size, models_dir):
    """Download a GPT-2 checkpoint and load its hyper-parameters and weights.

    Args:
        model_size: one of "124M", "355M", "774M", "1558M".
        models_dir: directory under which a ``model_size`` sub-directory is
            created for the downloaded files.

    Returns:
        Tuple ``(settings, params)``: the parsed ``hparams.json`` and the
        nested parameter dict built from the checkpoint.

    Raises:
        ValueError: if ``model_size`` is not one of the allowed sizes.
    """
    allowed_sizes = ("124M", "355M", "774M", "1558M")
    if model_size not in allowed_sizes:
        raise ValueError(f"❌ Model size must be one of {allowed_sizes}")
    model_dir = os.path.join(models_dir, model_size)
    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
    filenames = [
        "checkpoint", "encoder.json", "hparams.json",
        "model.ckpt.data-00000-of-00001", "model.ckpt.index",
        "model.ckpt.meta", "vocab.bpe"
    ]
    os.makedirs(model_dir, exist_ok=True)
    for filename in filenames:
        file_url = f"{base_url}/{model_size}/{filename}"
        file_path = os.path.join(model_dir, filename)
        download_file(file_url, file_path)
    print(f"\n✅ All files for {model_size} downloaded successfully!")
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    # Context manager closes the file handle instead of leaving it to the garbage collector.
    with open(os.path.join(model_dir, "hparams.json"), "r", encoding="utf-8") as hparams_file:
        settings = json.load(hparams_file)
    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)
    print(f"✅ Loaded GPT-2 ({model_size}) parameters successfully.")
    return settings, params


Overwriting gpt_download3.py


In [None]:
from gpt_download3 import download_and_load_gpt2

In [None]:
# Fetch the 124M GPT-2 files into ./gpt2/124M and load hyper-parameters and weights.
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")



✅ File already exists: gpt2/124M/checkpoint




✅ File already exists: gpt2/124M/encoder.json




✅ File already exists: gpt2/124M/hparams.json




✅ File already exists: gpt2/124M/model.ckpt.data-00000-of-00001




✅ File already exists: gpt2/124M/model.ckpt.index




✅ File already exists: gpt2/124M/model.ckpt.meta




✅ File already exists: gpt2/124M/vocab.bpe

✅ All files for 124M downloaded successfully!
✅ Loaded GPT-2 (124M) parameters successfully.


In [None]:
# Inspect the loaded hyper-parameters and the top-level keys of the weight dict.
print("Settings:", settings)
print("Parameter dictionary keys:", params.keys())

Settings: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}
Parameter dictionary keys: dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


In [None]:
# Token-embedding matrix of the pretrained model and its dimensions.
print(params["wte"])
print("Token embedding weight tensor dimensions:", params["wte"].shape)


[[-0.11010301 -0.03926672  0.03310751 ... -0.1363697   0.01506208
   0.04531523]
 [ 0.04034033 -0.04861503  0.04624869 ...  0.08605453  0.00253983
   0.04318958]
 [-0.12746179  0.04793796  0.18410145 ...  0.08991534 -0.12972379
  -0.08785918]
 ...
 [-0.04453601 -0.05483596  0.01225674 ...  0.10435229  0.09783269
  -0.06952604]
 [ 0.1860082   0.01665728  0.04611587 ... -0.09625227  0.07847701
  -0.02245961]
 [ 0.05135201 -0.02768905  0.0499369  ...  0.00704835  0.15519823
   0.12067825]]
Token embedding weight tensor dimensions: (50257, 768)


In [None]:
# Per-size overrides, keyed with the names kirllm actually reads
# ("embdim", "nolayers", "nohead") so that choosing a size takes effect.
model_configs = {
    "gpt2-small (124M)": {"embdim": 768, "nolayers": 12, "nohead": 12},
    "gpt2-medium (355M)": {"embdim": 1024, "nolayers": 24, "nohead": 16},
    "gpt2-large (774M)": {"embdim": 1280, "nolayers": 36, "nohead": 20},
    "gpt2-xl (1558M)": {"embdim": 1600, "nolayers": 48, "nohead": 25},
}

# Copy the base configuration and update with specific model settings
model_name = "gpt2-small (124M)"  # Example model name
NEW_CONFIG = config.copy()
NEW_CONFIG.update(model_configs[model_name])

In [None]:
# kirllm reads the context length from "contlen", so set it explicitly;
# "context_length" is kept as well for any code that still reads that key.
NEW_CONFIG.update({"contlen": 1024, "context_length": 1024, "qkv_bias": True})
gpt = kirllm(NEW_CONFIG)
gpt.eval();

In [None]:
def assign(left, right):
    """Return ``right`` wrapped as a trainable parameter after a shape check.

    Args:
        left: existing parameter; only its ``shape`` is inspected.
        right: array-like holding the new values.

    Raises:
        ValueError: if the two shapes differ.
    """
    if left.shape == right.shape:
        return torch.nn.Parameter(torch.tensor(right))
    raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")

In [None]:
import numpy as np

def load_weights_into_gpt(gpt, params):
    """Replace the parameters of ``gpt`` with the arrays stored in ``params``.

    Args:
        gpt: model exposing ``posemd``, ``tokemd``, ``tf`` (indexable blocks
            with ``att``, ``ff``, ``norm1``, ``norm2``), ``norm`` and ``head``.
        params: nested dict with keys ``"wpe"``, ``"wte"``, ``"g"``, ``"b"``
            and a ``"blocks"`` list, as printed earlier in this notebook.

    Every assignment goes through ``assign``, which raises ``ValueError`` on
    a shape mismatch. Weight matrices of linear layers are transposed first.
    """
    gpt.posemd.weight = assign(gpt.posemd.weight, params['wpe'])
    gpt.tokemd.weight = assign(gpt.tokemd.weight, params['wte'])

    for b, src in enumerate(params["blocks"]):
        block = gpt.tf[b]
        att = block.att
        fc_in, fc_out = block.ff.layers[0], block.ff.layers[2]

        # The checkpoint stores query, key and value fused along the last axis.
        c_attn = src["attn"]["c_attn"]
        q_w, k_w, v_w = np.split(c_attn["w"], 3, axis=-1)
        att.W_query.weight = assign(att.W_query.weight, q_w.T)
        att.W_key.weight = assign(att.W_key.weight, k_w.T)
        att.W_value.weight = assign(att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(c_attn["b"], 3, axis=-1)
        att.W_query.bias = assign(att.W_query.bias, q_b)
        att.W_key.bias = assign(att.W_key.bias, k_b)
        att.W_value.bias = assign(att.W_value.bias, v_b)

        # Attention output projection.
        attn_proj = src["attn"]["c_proj"]
        att.out_proj.weight = assign(att.out_proj.weight, attn_proj["w"].T)
        att.out_proj.bias = assign(att.out_proj.bias, attn_proj["b"])

        # Feed-forward layers.
        mlp = src["mlp"]
        fc_in.weight = assign(fc_in.weight, mlp["c_fc"]["w"].T)
        fc_in.bias = assign(fc_in.bias, mlp["c_fc"]["b"])
        fc_out.weight = assign(fc_out.weight, mlp["c_proj"]["w"].T)
        fc_out.bias = assign(fc_out.bias, mlp["c_proj"]["b"])

        # Layer norms of the block.
        block.norm1.scale = assign(block.norm1.scale, src["ln_1"]["g"])
        block.norm1.shift = assign(block.norm1.shift, src["ln_1"]["b"])
        block.norm2.scale = assign(block.norm2.scale, src["ln_2"]["g"])
        block.norm2.shift = assign(block.norm2.shift, src["ln_2"]["b"])

    gpt.norm.scale = assign(gpt.norm.scale, params["g"])
    gpt.norm.shift = assign(gpt.norm.shift, params["b"])
    # The output head takes the token-embedding matrix as its weight.
    gpt.head.weight = assign(gpt.head.weight, params["wte"])

In [None]:
import torch

# Automatically selects GPU if available, otherwise CPU
# (this rebinds the `device` name assigned earlier in the notebook).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cpu


In [None]:
# Copy the downloaded GPT-2 weights into `gpt`, then move the model to the selected device.
load_weights_into_gpt(gpt, params)
gpt.to(device);

In [None]:
torch.manual_seed(123)

# Encode and decode inline with `tknizer`: no helpers named text_to_token_ids /
# token_ids_to_text are defined in the visible part of this notebook.
prompt_ids = torch.tensor(tknizer.encode("Every effort moves you")).unsqueeze(0).to(device)

token_ids = generate(
    model=gpt,
    idx=prompt_ids,
    max_new_tokens=25,
    # "contlen" is the key the model itself was built from.
    context_size=NEW_CONFIG["contlen"],
    top_k=50,
    temperature=1.5
)

print("Output text:\n", tknizer.decode(token_ids.squeeze(0).tolist()))

Output text:
 Every effort moves you toward finding an ideal new way to practice something!

What makes us want to be on top of that?


