In [None]:
from importlib.metadata import version

# Report the installed version of every listed dependency, one per line,
# so the environment that produced the outputs can be reproduced.
pkgs = ["matplotlib", "numpy", "tiktoken"]
for pkg in pkgs:
    print(pkg, version(pkg), sep=": ")

In [None]:
from gpt_download import BASE_CONFIG, model_configs
from gpt_model import GPTModel
import torch
import torch.nn as nn
import torch.nn.functional as F
# Seed PyTorch's global RNG so that later random draws (presumably including
# the weight initialisation below -- TODO confirm GPTModel initialises randomly)
# are repeatable across runs.
torch.manual_seed(123)

In [None]:
# Start from a copy of the shared base settings and overlay the entries for
# the "gpt2-small (124M)" variant; working on a copy leaves BASE_CONFIG intact.
config = BASE_CONFIG.copy()
config.update(model_configs["gpt2-small (124M)"])
model = GPTModel(config)
# Switch to inference mode; the trailing `;` suppresses the module repr that
# the notebook would otherwise display for the cell's last expression.
model.eval();

In [None]:
import tiktoken
from gpt_model import generate_text_simple, token_ids_to_text, text_to_token_ids

In [None]:
# Load the GPT-2 BPE tokenizer and show the token ids for a sample sentence.
tokenizer = tiktoken.get_encoding("gpt2")
ids = tokenizer.encode("How are you doing today?")
print(ids)

In [None]:
# Sanity check: extend a prompt with the freshly constructed (untrained) model.
# NOTE(review): the positional `20` is presumably max_new_tokens -- the keyword
# call to generate_text_simple later in this notebook uses that parameter name.
token_ids = generate_text_simple(
    model,
    text_to_token_ids("Every effort moves you", tokenizer),
    20,
    context_size=config['context_length']
)
# Last expression of the cell: the decoded text is shown as the cell output.
token_ids_to_text(token_ids, tokenizer)

In [None]:
# Two short example sentences used to walk through the loss computation by hand.
texts = ["every effort moves you", "I really like chocolate"]
inputs, targets = [], []

for text in texts:
    ids = text_to_token_ids(text, tokenizer).squeeze(0)
    # Next-token objective: the target sequence is the input shifted left by one.
    inputs += [ids[:-1]]
    targets += [ids[1:]]

# Stacking into a batch only works because every row has the same length.
inputs, targets = torch.stack(inputs, dim=0), torch.stack(targets, dim=0)
print(inputs)
print(targets, targets.shape)

In [None]:
# Forward pass with autograd disabled; no gradients are needed for inspection.
with torch.no_grad():
    logits = model(inputs)

# Normalise over the last axis to turn the logits into probabilities.
probas = torch.softmax(logits, dim=-1)
probas.shape

In [None]:
# Most likely token id at every position; keepdim=True keeps a trailing size-1 axis.
predicted_tokens = torch.argmax(probas, dim=-1, keepdim=True)
# NOTE: a cell only displays its last expression, so this first `.shape` is not shown.
predicted_tokens.shape
predicted_tokens[0].flatten().shape

In [None]:
# NOTE: the decoded target text below is computed but not displayed, because
# only the final expression (`targets[0].shape`) becomes the cell output.
token_ids_to_text(targets[0], tokenizer)
targets[0].shape

In [None]:
# Decode the model's predictions for the first example, for comparison with its targets.
token_ids_to_text(predicted_tokens[0].flatten(), tokenizer)

In [None]:
# Re-display the shape of the probability tensor before indexing into it.
probas.shape

In [None]:
# For every example, pick out the probability the model assigned to each
# correct next token. Batch size and sequence length are read from `targets`
# instead of being hard-coded, so the cell keeps working if `texts` changes.
positions = torch.arange(targets.shape[1], device=targets.device)
probs = []
for text_idx in range(targets.shape[0]):
    # Advanced indexing: (example, position t, target token id at t) for every t.
    probs.append(probas[text_idx, positions, targets[text_idx]])

In [None]:
# Concatenate the per-example target probabilities into one 1-D tensor and
# take the natural log. Despite its name, `probs_1` holds log-probabilities.
probs_1 = torch.log(torch.cat(probs, dim=0))
probs_1

In [None]:
# Mean log-probability of the correct tokens.
avg_log_probs = torch.mean(probs_1)
print(avg_log_probs)

In [None]:
# Negating the mean log-probability gives the average negative log-likelihood,
# which is the quantity F.cross_entropy computes with its default mean reduction.
net_avg_log_probas = avg_log_probs * -1
print(net_avg_log_probas)

In [None]:
# Same loss computed directly from the logits. Batch and position axes are
# merged so each row is one prediction; the class count is read from the
# logits rather than hard-coding the GPT-2 vocabulary size (50257).
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.flatten())
loss

In [None]:
# Perplexity is the exponential of the cross-entropy loss.
perplexity = loss.exp()
perplexity

In [None]:
import os
import requests

In [None]:
# NOTE(review): this is an absolute, machine-specific location; consider
# deriving it from a configurable data directory.
file_path = "/home/htkumar/llms/rasbt_llms_from_scratch/the-verdict.txt"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

# Download the dataset only if it doesn't already exist, so the notebook
# still runs on a machine where the file has not been placed by hand.
if not os.path.exists(file_path):
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    response = requests.get(url, timeout=30)
    # Fail loudly instead of silently saving an HTTP error page as the dataset.
    response.raise_for_status()
    with open(file_path, "wb") as f:
        f.write(response.content)

# Read the dataset with an explicit encoding so the result does not depend
# on the platform's default locale.
with open(file_path, "r", encoding="utf-8") as f:
    text_data = f.read()

In [None]:
# Peek at the first 99 characters to confirm the file was read correctly.
print(text_data[:99])

In [None]:
# Corpus size as a (character count, GPT-2 token count) tuple.
len(text_data), len(tokenizer.encode(text_data))

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
from gpt_dataloader import GPTDatasetV1, create_dataloader_v1

In [None]:
# Train/validation split: the first 90% of the raw text (measured in
# characters, not tokens) is used for training, the remainder for validation.
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data, val_data = text_data[:split_idx], text_data[split_idx:]

# Seed before the loaders are created so the setup is repeatable.
torch.manual_seed(123)
# Training batches are shuffled and an incomplete final batch is dropped.
train_loader = create_dataloader_v1(
    train_data, batch_size=2, max_length=256, shuffle=True, drop_last=True
)
# Validation keeps the original order and every batch, including a short last one.
val_loader = create_dataloader_v1(
    val_data, batch_size=2, max_length=256, shuffle=False, drop_last=False
)

In [None]:
# Build the dataset directly to inspect a single item.
# NOTE(review): no max_length is passed here, so this presumably uses
# GPTDatasetV1's defaults rather than the 256 given to the loaders -- confirm.
dataset = GPTDatasetV1(train_data, tokenizer)
dataset[0]

In [None]:
# Look at the first training batch only: tensor shapes and element count.
for x, y in train_loader:
    print(x.shape, y.shape)
    print(x.numel())
    break

In [None]:
# Print the shapes of every validation batch.
for x, y in val_loader:
    print(x.shape, y.shape)

In [None]:
# NOTE: `x` is the loop variable leaked from the previous cell, i.e. the last
# validation batch; this cell fails if that loop has not been run first.
x.size(-1)

In [None]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Compute the mean cross-entropy loss of ``model`` on one batch.

    Both tensors are moved to ``device`` and the model output is flattened so
    that every row is one prediction over the last (class) axis.

    The forward pass is NOT wrapped in ``torch.no_grad()``: the returned loss
    follows the caller's gradient mode. During training it is attached to the
    autograd graph so ``loss.backward()`` works; for evaluation, call this
    function inside a ``with torch.no_grad():`` block.

    Args:
        input_batch: Tensor passed to ``model`` after moving to ``device``.
        target_batch: Class indices; flattened to one index per prediction.
        model: Callable returning logits whose last axis indexes the classes.
        device: Device (or device string) the batch is moved to.

    Returns:
        A scalar tensor holding the mean cross-entropy.
    """
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    # Merge all leading axes so logits and targets line up row by row.
    loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target_batch.flatten())
    return loss

In [None]:
from gpt_model import calc_loss_loader, train_model_simple

In [None]:
# Prefer the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Baseline losses of the current model: training set first, then validation.
torch.manual_seed(123)
train_loss, val_loss = (
    calc_loss_loader(loader, model, device) for loader in (train_loader, val_loader)
)
print(train_loss, val_loss)

In [None]:
# Rebind `model` to a brand-new instance; the one evaluated above is discarded.
# NOTE(review): no seed is set in this cell, so the initial weights depend on
# the RNG state left behind by earlier cells.
model = GPTModel(config)
model.to(device)
# AdamW over all model parameters, created after the model is on its device.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

In [None]:
# Run the training loop imported from gpt_model. It returns three values that
# are unpacked into the names below and later passed to plot_losses.
# NOTE(review): eval_freq / eval_iter semantics live in train_model_simple,
# which is not visible here -- confirm before tuning them.
num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="Every effort moves you", tokenizer=tokenizer, max_new_tokens=10
)

In [None]:
# Evenly spaced values from 0 to num_epochs, one per recorded training loss,
# used as the epoch coordinate of each loss value when plotting.
epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
epochs_tensor

In [None]:
from gpt_model import plot_losses

In [None]:
# Plot the recorded training and validation losses.
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)

In [None]:
# Put the trained model into inference mode before generating text.
model.eval()
# Re-creating the tokenizer is redundant with the earlier cell, but it makes
# this cell independent of that cell having been run.
tokenizer = tiktoken.get_encoding("gpt2")
# The prompt must be on the same device as the model.
idx = text_to_token_ids("Every effort moves you", tokenizer).to(device)

token_ids = generate_text_simple(
    model=model,
    idx=idx,
    max_new_tokens=25,
    context_size=config['context_length']
)

print(f"Output text is {token_ids_to_text(token_ids, tokenizer)}\n")


In [None]:
# Toy nine-word vocabulary: each word's id is its position in the list.
vocab = {
    word: token_id
    for token_id, word in enumerate(
        ["closer", "every", "effort", "forward", "inches", "moves", "pizza", "toward", "you"]
    )
}
# Reverse lookup from token id back to word.
inverse_vocab = dict(zip(vocab.values(), vocab.keys()))

# Hand-written logits for the next token, one entry per vocabulary word.
next_token_logits = torch.tensor(
    [4.51, 0.89, -1.90, 6.75, 1.63, -1.62, -1.89, 6.28, 1.79]
)
next_token_probs = next_token_logits.softmax(dim=0)
# Draw one token id at random, weighted by its probability.
next_token = torch.multinomial(next_token_probs, num_samples=1)
next_token

In [None]:
# torch.bincount??

In [None]:
# Small bincount example: counts how often each non-negative integer occurs.
# Here 0 appears four times, 1 once and 2 twice, giving tensor([4, 1, 2]).
a = torch.tensor([0, 0, 0, 2, 2, 0, 1])
torch.bincount(a)

In [None]:
def print_sampled_tokens(probas):
    """Sample 1000 token ids from ``probas`` and print how often each was drawn.

    The global torch RNG is re-seeded with 123 on every call, so repeated
    calls with the same distribution print the same counts. One line is
    printed per token id from 0 up to the largest id that was sampled, in the
    form ``"<id> ... <count> <word>"`` with the word looked up in the
    notebook-level ``inverse_vocab``.
    """
    torch.manual_seed(123)
    drawn_ids = []
    for _ in range(1000):
        # One draw per call, so the RNG is consumed one sample at a time.
        drawn_ids.append(torch.multinomial(probas, 1).item())
    frequencies = torch.bincount(torch.tensor(drawn_ids))
    for token_id, frequency in enumerate(frequencies):
        print(f"{token_id} ... {frequency} {inverse_vocab[token_id]}")

In [None]:
# Sampling frequencies for the unscaled next-token distribution.
print_sampled_tokens(next_token_probs)

In [None]:
def softmax_with_temperature(logits, temperature):
    """Return the softmax over the last axis of ``logits / temperature``.

    Temperatures below 1 sharpen the distribution, temperatures above 1
    flatten it, and a temperature of 1 leaves the plain softmax unchanged.
    """
    scaled_logits = logits / temperature
    return scaled_logits.softmax(dim=-1)

# Temperatures to compare: the unscaled baseline (1), one sharper (0.1) and
# two flatter (5, 10).
temperatures = [1, 0.1, 5, 10]
# One probability vector per temperature, in the same order as `temperatures`.
scaled_probas = [softmax_with_temperature(next_token_logits, T) for T in temperatures]
scaled_probas

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Grouped bar chart: for every vocabulary word, one bar per temperature.
# NOTE: this rebinds `x`, which earlier cells used for a data-loader batch.
x = torch.arange(len(vocab))
bar_width = 0.15

fig, ax = plt.subplots(figsize=(5, 3))
for i, t in enumerate(temperatures):
    # Shift each temperature's bars by one bar width so they sit side by side.
    rects = ax.bar(x + i * bar_width, scaled_probas[i], bar_width, label=f"Temperature = {t}")

ax.set_ylabel("Probability")
# Ticks sit at the unshifted positions, i.e. under the first bar of each group.
ax.set_xticks(x)
ax.set_xticklabels(vocab.keys(), rotation=90)
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Sampling frequencies at temperature 0.1 (index 1 of `temperatures`).
print_sampled_tokens(scaled_probas[1])

In [None]:
# Sampling frequencies at temperature 5 (index 2 of `temperatures`).
print_sampled_tokens(scaled_probas[2])

In [None]:
# Sampling frequencies at temperature 10 (index 3 of `temperatures`).
print_sampled_tokens(scaled_probas[3])

In [None]:
# Re-display the raw logits for reference.
next_token_logits

In [None]:
def generate(
    model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None
):
    """Autoregressively extend ``idx`` by up to ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping token ids of shape (B, T) to logits of shape
            (B, T, V).
        idx: Integer tensor of shape (B, T) holding the prompt token ids.
        max_new_tokens: Maximum number of tokens to append.
        context_size: Only the last ``context_size`` tokens are fed to the model.
        temperature: If > 0, sample from ``softmax(logits / temperature)``;
            otherwise (default 0.0) pick the arg-max token greedily.
        top_k: If given, logits below each row's k-th largest value are set to
            -inf before sampling / arg-max. Values larger than the vocabulary
            size are clamped to it.
        eos_id: If given, generation stops (without appending the token) as
            soon as every sequence in the batch produces this id in the same
            step. For a batch of one this is the usual "stop at EOS" rule.

    Returns:
        Tensor of shape (B, T + n) with n <= max_new_tokens appended tokens.
    """
    # TODO: Verify that this generates the same outputs as in master repo
    for _ in range(max_new_tokens):
        # Crop the running sequence to the most recent `context_size` tokens.
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)

        # Keep only the prediction for the last position: (B, T, V) -> (B, V)
        logits = logits[:, -1, :]

        if top_k is not None:
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Slice with `-1:` to keep shape (B, 1): the threshold then
            # broadcasts per row. A (B,) threshold would be matched against
            # the vocabulary axis and break for batch sizes other than 1.
            min_val = top_logits[:, -1:]
            logits = torch.where(
                logits < min_val,
                torch.tensor(-float("inf")).to(logits.device),
                logits,
            )

        if temperature > 0.0:
            logits = logits / temperature

            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (B, 1)

        # Reduce with .all() so the check is well defined for B > 1; a bare
        # `if idx_next == eos_id` raises on multi-element tensors.
        if eos_id is not None and bool((idx_next == eos_id).all()):
            break

        idx = torch.cat((idx, idx_next), dim=1)  # (B, T + 1)

    return idx

In [None]:
# Sample a continuation from the trained model with temperature scaling and
# top-k filtering. NOTE(review): no seed is set in this cell, so the sampled
# text depends on the RNG state left behind by earlier cells.
token_ids = generate(
    model=model,
    idx=idx,
    max_new_tokens=25,
    context_size=config['context_length'],
    temperature=5,
    top_k=15,
)

print(f"Output text is {token_ids_to_text(token_ids, tokenizer)}\n")

In [None]:
# List the parameter/buffer names that will be written to the checkpoint.
model.state_dict().keys()

In [None]:
# NOTE(review): absolute, machine-specific directory used for the checkpoint
# and the GPT-2 weights below; consider making it configurable.
model_dir = "/home/htkumar/llms/rasbt_llms_from_scratch"

In [None]:
# Save model weights and optimizer state together so training can be resumed.
torch.save({
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict()
    },
    f"{model_dir}/model_and_optimizer.pth"
)

In [None]:
# Restore the checkpoint. `map_location` remaps the saved tensors onto the
# active device, so a checkpoint written on a GPU also loads on a CPU-only host.
checkpoint = torch.load(
    f"{model_dir}/model_and_optimizer.pth", map_location=device, weights_only=True
)
model_new = GPTModel(config)
model_new.load_state_dict(checkpoint['model_state_dict'])
# Move the model BEFORE the optimizer is built and its state is loaded:
# load_state_dict places the optimizer state on the device the parameters
# live on at that moment, and it would be stranded there if the model were
# moved afterwards.
model_new.to(device)

# NOTE: this rebinds `optimizer`. The hyperparameters passed here are
# overwritten by the loaded state (which was saved with lr=0.0004), so the
# constructor now uses the same value instead of a misleading 0.0005.
optimizer = torch.optim.AdamW(model_new.parameters(), lr=0.0004, weight_decay=0.1)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# Training mode, ready to resume training; call .eval() before generating text.
model_new.train();

In [None]:
torch.manual_seed(123)
model_new.to(device)
# The checkpoint-loading cell leaves `model_new` in training mode. Switch to
# inference mode before sampling so train-only layers such as dropout are
# disabled, matching how every other model in this notebook is used.
model_new.eval()
token_ids = generate(
    model=model_new,
    idx=idx,
    max_new_tokens=25,
    context_size=config['context_length'],
    temperature=5,
    top_k=15,
)

print(f"Output text is {token_ids_to_text(token_ids, tokenizer)}\n")

In [None]:
from gpt_download import download_and_load_gpt2, load_weights_into_gpt, load_gpt2

In [None]:
# Load the GPT-2 124M settings and parameters from the local directory.
# NOTE(review): `settings` is not used anywhere in the visible notebook.
settings, params = load_gpt2(f"{model_dir}/gpt2/124M")

In [None]:
# Display the model configuration that the weights are about to be loaded into.
config

In [None]:
# Fresh model instance to receive the pretrained weights, in inference mode.
gpt = GPTModel(config)
gpt.eval();

In [None]:
# Copy the loaded parameters into the model, then move it to the active device.
load_weights_into_gpt(gpt, params)
gpt.to(device)

In [None]:
# Fixed seed so the sampled continuation is repeatable.
torch.manual_seed(123)

# Sample 50 new tokens from the model holding the pretrained GPT-2 weights.
idx = text_to_token_ids("Every effort moves you", tokenizer).to(device)
token_ids = generate(
    model=gpt,
    idx=idx,
    max_new_tokens=50,
    context_size=config['context_length'],
    temperature=5,
    top_k=15,
)

print(f"Output text is {token_ids_to_text(token_ids, tokenizer)}\n")

In [None]:
# Same prompt, seed and sampling settings as the pretrained-GPT-2 cell, but
# using the model restored from the local checkpoint, for a side-by-side
# comparison.
torch.manual_seed(123)

# `model_new` was put into training mode when the checkpoint was loaded.
# Make sure it is in inference mode so train-only layers such as dropout do
# not perturb the comparison (idempotent if eval() was already called).
model_new.eval()
idx = text_to_token_ids("Every effort moves you", tokenizer).to(device)
token_ids = generate(
    model=model_new,
    idx=idx,
    max_new_tokens=50,
    context_size=config['context_length'],
    temperature=5,
    top_k=15,
)

print(f"Output text is {token_ids_to_text(token_ids, tokenizer)}\n")