In [35]:
from importlib.metadata import version

pkgs = ["matplotlib", "numpy", "tiktoken", "torch", "tensorflow", "pandas"]

for p in pkgs:
    print(f"{p} version: {version(p)}")

matplotlib version: 3.7.2
numpy version: 1.24.3
tiktoken version: 0.6.0
torch version: 2.0.1
tensorflow version: 2.13.0
pandas version: 2.0.3


### Dataset prep

In [36]:
import os
import urllib.request
import zipfile
from pathlib import Path

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"
data_file_path

PosixPath('sms_spam_collection/SMSSpamCollection.tsv')

In [37]:
def download_and_unzip(url, zip_path, extracted_path, data_file_path):
    """Download a zip archive, extract it, and rename the dataset to ``.tsv``.

    Args:
        url: Address of the zip archive to fetch.
        zip_path: Local path the downloaded archive is written to.
        extracted_path: Directory the archive is extracted into.
        data_file_path: ``pathlib.Path`` of the final ``.tsv`` file. When it
            already exists nothing is downloaded.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Nothing to do when a previous run already produced the data file.
    if data_file_path.exists():
        print(f"{data_file_path} already exists, no need to download")
        return

    # Fetch the archive and persist it to disk.
    with urllib.request.urlopen(url) as response, open(zip_path, "wb") as out_file:
        out_file.write(response.read())

    # Unpack every member of the archive.
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(extracted_path)

    # The archive ships the data without an extension; give it the .tsv name.
    extracted_file = Path(extracted_path) / "SMSSpamCollection"
    os.rename(extracted_file, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

In [38]:
download_and_unzip(url, zip_path, extracted_path, data_file_path)

sms_spam_collection/SMSSpamCollection.tsv already exists, no need to download


In [39]:
import pandas as pd

df = pd.read_csv(data_file_path, sep="\t", header=None, names=["label", "text"])

In [40]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [41]:
df.shape

(5572, 2)

In [42]:
df["label"].value_counts()
# unbalanced dataset

label
ham     4825
spam     747
Name: count, dtype: int64

In [43]:
df[df.label == "spam"].shape

(747, 2)

In [44]:
def create_balanced_dataset(df):
    """Return a frame with as many "ham" rows as there are "spam" rows.

    All "spam" rows are kept; the "ham" rows are down-sampled (fixed
    ``random_state=123``) to the same count. The sampled "ham" rows come
    first in the result, followed by the "spam" rows, with original indices.
    """
    spam_rows = df[df.label == "spam"]
    ham_rows = df[df.label == "ham"]
    sampled_ham = ham_rows.sample(spam_rows.shape[0], random_state=123)
    return pd.concat([sampled_ham, spam_rows])

In [45]:
balanced_df = create_balanced_dataset(df)
balanced_df["label"].value_counts()

label
ham     747
spam    747
Name: count, dtype: int64

In [46]:
balanced_df.shape

(1494, 2)

In [47]:
balanced_df["label"] = balanced_df["label"].map({"ham": 0, "spam": 1})

In [48]:
balanced_df[balanced_df.label == 1]

Unnamed: 0,label,text
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
5,1,FreeMsg Hey there darling it's been 3 week's n...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...
11,1,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
5537,1,Want explicit SEX in 30 secs? Ring 02073162414...
5540,1,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,1,Had your contract mobile 11 Mnths? Latest Moto...
5566,1,REMINDER FROM O2: To get 2.50 pounds free call...


In [49]:
balanced_df.sample(frac=1).reset_index()

Unnamed: 0,index,label,text
0,1340,0,Every monday..nxt week vl be completing..
1,5164,1,Congrats 2 mobile 3G Videophones R yours. call...
2,708,0,Quite late lar... Ard 12 anyway i wun b drivin...
3,3266,1,"44 7732584351, Do you want a New Nokia 3510i c..."
4,929,0,Oh all have to come ah?
...,...,...,...
1489,754,0,When did you get to the library
1490,3174,1,"Dear Voucher Holder, To claim this weeks offer..."
1491,3921,1,FREE>Ringtone! Reply REAL or POLY eg REAL1 1. ...
1492,3763,1,FREE for 1st week! No1 Nokia tone 4 ur mob eve...


In [50]:
balanced_df.columns

Index(['label', 'text'], dtype='object')

In [51]:
def random_split(df, train_frac, validation_frac):
    """Shuffle ``df`` and cut it into train / validation / test frames.

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows assigned to the training frame.
        validation_frac: Fraction of rows assigned to the validation frame.

    Returns:
        ``(train_df, valid_df, test_df)``; the test frame receives whatever
        rows remain after the first two cuts.
    """
    # Deterministic shuffle; the index is rebuilt so it runs 0..n-1.
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
    total_rows = len(shuffled)

    train_end = int(total_rows * train_frac)
    validation_end = train_end + int(total_rows * validation_frac)

    return (
        shuffled[:train_end],
        shuffled[train_end:validation_end],
        shuffled[validation_end:],
    )

In [52]:
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
train_df.shape, validation_df.shape, test_df.shape

((1045, 2), (149, 2), (300, 2))

In [53]:
train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)

### Creating data loaders

In [54]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})

[50256]

In [55]:
tokenizer.encode("How are you doing")

[2437, 389, 345, 1804]

In [56]:
# tokenizer.encode??

In [57]:
import torch
from torch.utils.data import Dataset


class SpamDataset(Dataset):
    """Dataset of tokenized, fixed-length texts read from a CSV file.

    The CSV must provide a ``text`` column (tokenized with
    ``tokenizer.encode``) and a ``label`` column. Every encoded text is
    right-padded with ``pad_token_id`` to ``max_length``.

    Args:
        csv_file: Path of the CSV file to read.
        tokenizer: Object exposing ``encode(text)`` that returns a list of ids.
        max_length: Target sequence length. ``None`` uses the longest encoded
            text; otherwise longer texts are truncated to this length.
        pad_token_id: Token id used for padding.
    """

    def __init__(
        self, csv_file, tokenizer, max_length=None, pad_token_id=50256
    ) -> None:
        self.data = pd.read_csv(csv_file)

        # Tokenize everything once up front.
        self.encoded_texts = [tokenizer.encode(text) for text in self.data["text"]]

        if max_length is not None:
            self.max_length = max_length
            # Cut texts that exceed the requested length.
            self.encoded_texts = [
                token_ids[: self.max_length] for token_ids in self.encoded_texts
            ]
        else:
            self.max_length = self._longest_encoded_length()

        # Right-pad every sequence up to the common length.
        padded_texts = []
        for token_ids in self.encoded_texts:
            missing = self.max_length - len(token_ids)
            padded_texts.append(token_ids + [pad_token_id] * missing)
        self.encoded_texts = padded_texts

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` as two ``torch.long`` tensors."""
        row = self.data.iloc[index]
        token_tensor = torch.tensor(self.encoded_texts[index], dtype=torch.long)
        label_tensor = torch.tensor(row["label"], dtype=torch.long)
        return (token_tensor, label_tensor)

    def __len__(self):
        """Number of rows read from the CSV file."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Length of the longest encoded text (0 when there are none)."""
        return max((len(token_ids) for token_ids in self.encoded_texts), default=0)

In [58]:
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [25, 30, 35, 40],
    "City": ["New York", "Los Angeles", "Chicago", "Houston"],
}

df = pd.DataFrame(data)
df.iloc[0]["Name"]

'Alice'

In [59]:
train_dataset = SpamDataset(csv_file="train.csv", max_length=None, tokenizer=tokenizer)
print(train_dataset.max_length)

120


In [60]:
validation_dataset = SpamDataset(
    csv_file="validation.csv", max_length=train_dataset.max_length, tokenizer=tokenizer
)
test_dataset = SpamDataset(
    csv_file="test.csv", max_length=train_dataset.max_length, tokenizer=tokenizer
)

In [61]:
validation_dataset.max_length, test_dataset.max_length

(120, 120)

In [62]:
# create dataloaders
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8
torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)

val_loader = DataLoader(
    dataset=validation_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

In [63]:
# DataLoader??

In [64]:
for text, label in test_loader:
    print(text.shape, label.shape)
    break

torch.Size([8, 120]) torch.Size([8])


In [65]:
len(train_loader), len(val_loader), len(test_loader)

(130, 19, 38)

In [17]:
import json
import os

import numpy as np
import requests
import tensorflow as tf
from tqdm import tqdm

In [18]:
base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
filename = "checkpoint"
model_dir = "downloaded_models"
model_size = "124M"

file_url = os.path.join(base_url, model_size, filename)
file_path = os.path.join(model_dir, filename)
file_url, file_path

('https://openaipublic.blob.core.windows.net/gpt-2/models/124M/checkpoint',
 'downloaded_models/checkpoint')

In [19]:
os.makedirs(model_dir, exist_ok=True)

In [20]:
response = requests.get(file_url, stream=True)
response.headers

{'Content-Length': '77', 'Content-Type': 'application/octet-stream', 'Content-MD5': 'ygNo/NPEwamaykJRHQwfEg==', 'Last-Modified': 'Wed, 02 Dec 2020 17:33:04 GMT', 'ETag': '0x8D896E85318642F', 'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0', 'x-ms-request-id': '86baa400-101e-000d-3151-c2c15b000000', 'x-ms-version': '2009-09-19', 'x-ms-meta-Mtime': '2019-09-17T04:52:11.325000000Z', 'x-ms-lease-status': 'unlocked', 'x-ms-blob-type': 'BlockBlob', 'Date': 'Wed, 19 Jun 2024 14:00:14 GMT'}

In [21]:
# Size announced by the server; 0 when the header is missing.
file_size = int(response.headers.get("content-length", 0))
block_size = 1024
# Label the progress bar with the file being downloaded. `file_url` is the
# address `response` was requested from; `url` is the unrelated SMS dataset URL.
progress_bar_description = file_url.split("/")[-1]
with tqdm(
    total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description
) as progress_bar:
    with open(file_path, "wb") as file:
        # Stream the body in `block_size` chunks so it never sits fully in memory.
        for chunk in response.iter_content(block_size):
            progress_bar.update(len(chunk))
            file.write(chunk)

sms+spam+collection.zip: 100%|██████████| 77.0/77.0 [00:00<00:00, 55.9kiB/s]


In [22]:
tk_ckpt_path = tf.train.latest_checkpoint(model_dir)
tk_ckpt_path

ERROR:tensorflow:Couldn't match files for checkpoint downloaded_models/model.ckpt


In [23]:
def download_file(file_path, destination):
    """Stream a remote file to ``destination`` with a progress bar.

    Args:
        file_path: URL of the file to download (passed to ``requests.get``).
        destination: Local path the content is written to.

    Returns:
        None. The download is skipped when ``destination`` already exists and
        its size equals the ``content-length`` reported by the server.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never saved as if it were the requested file.
    """
    # The context manager releases the streamed connection on every exit
    # path, including the early "already downloaded" return below.
    with requests.get(file_path, stream=True) as response:
        response.raise_for_status()
        file_size = int(response.headers.get("content-length", 0))

        if os.path.exists(destination):
            file_size_local = os.path.getsize(destination)
            if file_size == file_size_local:
                print(f"File exists: {destination}")
                return

        block_size = 1024
        # The last URL segment is used as the progress bar label.
        progress_bar_description = file_path.split("/")[-1]
        with tqdm(
            total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description
        ) as progress_bar:
            with open(destination, "wb") as file:
                for chunk in response.iter_content(block_size):
                    progress_bar.update(len(chunk))
                    file.write(chunk)

In [24]:
def download_and_load_gpt2(model_size, models_dir):
    """Download the GPT-2 checkpoint files for ``model_size``.

    Args:
        model_size: One of ``"124M"``, ``"355M"``, ``"774M"``, ``"1558M"``.
        models_dir: Directory under which a ``<model_size>`` folder is created
            to hold the downloaded files.

    Returns:
        None. Only the download step is implemented here; loading the
        TensorFlow checkpoint is still to be added.

    Raises:
        ValueError: If ``model_size`` is not one of the allowed sizes.
    """
    # model sizes available in https://github.com/openai/gpt-2/blob/master/DEVELOPERS.md
    allowed_sizes = ("124M", "355M", "774M", "1558M")
    if model_size not in allowed_sizes:
        raise ValueError(f"Model size {model_size} not in {allowed_sizes}")

    # define paths
    model_dir = os.path.join(models_dir, model_size)
    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
    filenames = [
        "checkpoint",
        "encoder.json",
        "hparams.json",
        "model.ckpt.data-00000-of-00001",
        "model.ckpt.index",
        "model.ckpt.meta",
        "vocab.bpe",
    ]

    # download files
    os.makedirs(model_dir, exist_ok=True)
    for filename in filenames:
        # URLs always use "/" as separator; os.path.join would insert the
        # platform separator (a backslash on Windows) and break the address.
        file_url = "/".join((base_url, model_size, filename))
        file_path = os.path.join(model_dir, filename)
        print(file_url, file_path)
        download_file(file_url, file_path)

    # Add tf loading part

In [25]:
download_and_load_gpt2(model_size="124M", models_dir="gpt2")

https://openaipublic.blob.core.windows.net/gpt-2/models/124M/checkpoint gpt2/124M/checkpoint
File exists: gpt2/124M/checkpoint
https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json gpt2/124M/encoder.json
File exists: gpt2/124M/encoder.json
https://openaipublic.blob.core.windows.net/gpt-2/models/124M/hparams.json gpt2/124M/hparams.json
File exists: gpt2/124M/hparams.json
https://openaipublic.blob.core.windows.net/gpt-2/models/124M/model.ckpt.data-00000-of-00001 gpt2/124M/model.ckpt.data-00000-of-00001
File exists: gpt2/124M/model.ckpt.data-00000-of-00001
https://openaipublic.blob.core.windows.net/gpt-2/models/124M/model.ckpt.index gpt2/124M/model.ckpt.index
File exists: gpt2/124M/model.ckpt.index
https://openaipublic.blob.core.windows.net/gpt-2/models/124M/model.ckpt.meta gpt2/124M/model.ckpt.meta
File exists: gpt2/124M/model.ckpt.meta
https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe gpt2/124M/vocab.bpe
File exists: gpt2/124M/vocab.bpe


In [66]:
model_dir = os.path.join("gpt2", "124M")

In [67]:
tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
tf_ckpt_path

'gpt2/124M/model.ckpt'

In [68]:
# Read the hyperparameters inside a `with` block so the file handle is closed
# as soon as the JSON has been parsed.
with open(os.path.join(model_dir, "hparams.json")) as hparams_file:
    settings = json.load(hparams_file)
settings

{'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}

In [69]:
variables = tf.train.list_variables(tf_ckpt_path)

In [None]:
name, _ = variables[1]

In [None]:
name

In [None]:
tf.train.load_variable(tf_ckpt_path, name).shape

In [70]:
def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
    """Read every variable of a TF checkpoint into a nested dict of arrays.

    Args:
        ckpt_path: Checkpoint path understood by ``tf.train.list_variables``.
        settings: Mapping providing ``"n_layer"``, the number of blocks.

    Returns:
        A dict with a ``"blocks"`` list (one dict per layer). Variables whose
        first name component starts with ``"h"`` are stored in the block whose
        index follows that ``"h"``; all others are stored at the top level.
        Intermediate name components become nested dicts and the last
        component is the key holding the squeezed array.
    """
    params = {"blocks": [{} for _ in range(settings["n_layer"])]}

    for name, _ in tf.train.list_variables(ckpt_path):
        # Load the values and drop singleton dimensions.
        values = np.squeeze(tf.train.load_variable(ckpt_path, name))

        # Everything after the first "/"-separated component (the 'model/' prefix).
        parts = name.split("/")[1:]
        head = parts[0]

        # "h<N>" selects the dict of block N; anything else goes to the top level.
        if head.startswith("h"):
            node = params["blocks"][int(head[1:])]
        else:
            node = params

        # Walk (creating as needed) the nested dicts between head and leaf.
        for key in parts[1:-1]:
            node = node.setdefault(key, {})

        node[parts[-1]] = values

    return params

In [71]:
a = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)
a.keys()

dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])

In [72]:
a["wte"].shape

(50257, 768)

In [73]:
len(a["blocks"])

12

In [75]:
a["g"].shape

(768,)

In [None]:
a["wpe"].shape

In [None]:
a["blocks"][0].keys()

In [None]:
# tf.train.latest_checkpoint??

In [None]:
from utils.gpt_download import download_and_load_gpt2

In [None]:
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")

In [None]:
x = torch.randn(8, 128, 128)
x.shape

In [None]:
x.mean(dim=-1, keepdim=True).shape

In [None]:
# x.var??

In [None]:
import torch
import torch.nn as nn

In [None]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale/shift.

    Args:
        emb_dim: Size of the last dimension of the inputs.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance before the square root
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift."""
        mu = x.mean(dim=-1, keepdim=True)
        # Biased variance estimate (divides by N, not N - 1).
        sigma_sq = x.var(dim=-1, keepdim=True, unbiased=False)
        std = torch.sqrt(sigma_sq + self.eps)
        normalized = (x - mu) / std
        return self.scale * normalized + self.shift

In [None]:
class GELU(nn.Module):
    """GELU activation computed with the tanh approximation."""

    def forward(self, x):
        """Return ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))``."""
        coefficient = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic_term = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(coefficient * cubic_term)
        return 0.5 * x * gate

In [None]:
class FeedForward(nn.Module):
    """Two-layer MLP: expand to ``4 * emb_dim``, apply GELU, project back.

    Args:
        cfg: Mapping providing ``"emb_dim"``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim
        expand = nn.Linear(emb_dim, hidden_dim)
        contract = nn.Linear(hidden_dim, emb_dim)
        # Positions 0 and 2 hold the linear layers; position 1 is the activation.
        self.layers = nn.Sequential(expand, GELU(), contract)

    def forward(self, x):
        """Run ``x`` through the expand → GELU → contract stack."""
        out = self.layers(x)
        return out

In [None]:
tensor1 = torch.randn(3)
tensor1.shape
tensor2 = torch.randn(3)
torch.matmul(tensor1, tensor2).size()

In [None]:
tensor1 = torch.randn(10, 3, 4)
tensor2 = torch.randn(4)
torch.matmul(tensor1, tensor2).size()

In [None]:
tensor1 = torch.randn(10, 3, 4)
tensor2 = torch.randn(10, 4, 5)
torch.matmul(tensor1, tensor2).size()

In [None]:
tensor1 = torch.randn(10, 3, 4)
tensor2 = torch.randn(4)
torch.matmul(tensor1, tensor2).size()

In [None]:
tensor1 = torch.randn(10, 4, 3, 4)
tensor2 = torch.randn(10, 4, 4, 5)
(tensor1 @ tensor2).size()

In [None]:
mask = torch.triu(torch.ones(10, 10), diagonal=1).bool()[:5, :5]
mask

In [None]:
attn_scores = torch.randn(1, 1, 5, 5)
attn_scores

In [None]:
attn_scores.masked_fill_(mask, -torch.inf)

In [None]:
mask_l = torch.tril(torch.ones(10, 10))[:5, :5]
mask_l

In [None]:
attn_scores_l = torch.randn(1, 1, 5, 5)
attn_scores_l.masked_fill_(mask_l == 0, -torch.inf)

In [None]:
torch.softmax??

In [None]:
a = torch.softmax(attn_scores_l, dim=-1)
a
dropout = nn.Dropout(0.3)
print(a, dropout(a))

In [None]:
0.6316 * (1.0 / 0.7)

In [None]:
a.shape[-1]

In [None]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the output; must be divisible by
            ``num_heads``.
        context_length: Side length of the square causal mask buffer.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections carry a bias.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by n_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the positions to hide.
        upper_triangle = torch.triu(
            torch.ones(context_length, context_length), diagonal=1
        )
        self.register_buffer("mask", upper_triangle)

    def _split_heads(self, projected, b, num_tokens):
        """(b, num_tokens, d_out) -> (b, num_heads, num_tokens, head_dim)."""
        per_head = projected.view(b, num_tokens, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Return the context vectors for ``x`` of shape (b, num_tokens, d_in)."""
        b, num_tokens, _ = x.shape

        keys = self._split_heads(self.W_key(x), b, num_tokens)
        queries = self._split_heads(self.W_query(x), b, num_tokens)
        values = self._split_heads(self.W_value(x), b, num_tokens)

        # Per-head dot products: (b, num_heads, num_tokens, num_tokens).
        attn_scores = queries @ keys.transpose(2, 3)

        # Hide the entries above the diagonal so softmax gives them zero weight.
        hidden = self.mask[:num_tokens, :num_tokens].bool()
        attn_scores.masked_fill_(hidden, -torch.inf)

        # Scale by sqrt(head_dim) before normalizing over the key axis.
        attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
        per_head_context = (attn_weights @ values).transpose(1, 2)
        # Merge the heads back into a single d_out-sized vector per token.
        merged = per_head_context.reshape(b, num_tokens, self.d_out)
        return self.out_proj(merged)

In [None]:
class SublayerConnection(nn.Module):
    """
    Residual wrapper: normalize the input, apply a sublayer, apply dropout,
    and add the result back onto the unnormalized input.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Return ``x + dropout(sublayer(norm(x)))``."""
        normalized = self.norm(x)
        update = self.dropout(sublayer(normalized))
        return x + update

In [None]:
class TransformerBlock(nn.Module):
    """Attention sublayer followed by a feed-forward sublayer.

    Each sublayer runs inside a ``SublayerConnection`` (layer norm, sublayer,
    dropout, residual add).

    Args:
        cfg: Mapping providing ``"emb_dim"``, ``"context_length"``,
            ``"n_heads"``, ``"drop_rate"`` and ``"qkv_bias"``.
    """

    def __init__(self, cfg):
        super().__init__()
        attention_kwargs = {
            "d_in": cfg["emb_dim"],
            "d_out": cfg["emb_dim"],
            "context_length": cfg["context_length"],
            "num_heads": cfg["n_heads"],
            "dropout": cfg["drop_rate"],
            "qkv_bias": cfg["qkv_bias"],
        }
        self.att = MultiHeadAttention(**attention_kwargs)
        self.ff = FeedForward(cfg)
        # The layer norms live inside the sublayer wrappers
        # (``sublayerN.norm``), which is where weight loading must look.
        self.sublayer1 = SublayerConnection(cfg["emb_dim"], cfg["drop_rate"])
        self.sublayer2 = SublayerConnection(cfg["emb_dim"], cfg["drop_rate"])

    def forward(self, x):
        """Apply the attention sublayer, then the feed-forward sublayer."""
        attended = self.sublayer1(x, self.att)
        return self.sublayer2(attended, self.ff)

In [None]:
class GPTModel(nn.Module):
    """Token + position embeddings, a stack of transformer blocks, and a head.

    Args:
        cfg: Mapping providing ``"vocab_size"``, ``"emb_dim"``,
            ``"context_length"``, ``"drop_rate"`` and ``"n_layers"`` (plus
            whatever ``TransformerBlock`` reads).
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size, emb_dim = cfg["vocab_size"], cfg["emb_dim"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_ln = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map token ids of shape (batch, seq_len) to per-position logits."""
        _, seq_len = in_idx.shape
        # Positions 0..seq_len-1, created on the same device as the input ids.
        positions = torch.arange(seq_len, device=in_idx.device)

        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)  # [batch, seq_len, emb]
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_ln(hidden)
        return self.out_head(hidden)

In [None]:
l = [1, 2, 3, 4]


def f(a, b, c, d):
    """Print the four arguments on one line (demo target for ``*`` unpacking)."""
    values = (a, b, c, d)
    print(*values)


f(*l)

In [None]:
pos_emb = nn.Embedding(10, 12)
pos_emb
a = torch.arange(10)
a
pos_emb(a).shape, a.shape

In [None]:
a = torch.randn(3, 10)
print(a[:, -1])
print(a)

In [None]:
# Util functions
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping (B, T) token ids to (B, T, vocab_size) logits.
        idx: (B, T) tensor holding the current context.
        max_new_tokens: Number of tokens to append.
        context_size: Only the last ``context_size`` tokens are fed to the model.

    Returns:
        (B, T + max_new_tokens) tensor: the input followed by the new tokens.
    """
    sequence = idx

    for _ in range(max_new_tokens):
        # Keep only the most recent tokens the model is allowed to see.
        window = sequence[:, -context_size:]

        with torch.no_grad():
            all_logits = model(window)  # (B, T, vocab_size)

        # The prediction for the next token sits at the last position.
        last_logits = all_logits[:, -1, :]  # (batch, vocab_size)
        next_token = last_logits.argmax(dim=1, keepdim=True)  # (batch, 1)

        sequence = torch.cat((sequence, next_token), dim=1)  # (batch, n_tokens+1)

    return sequence

In [None]:
a = torch.randn(10, 4)
b = torch.randn(10, 1)
c = torch.cat((a, b), dim=-1)
c.shape

In [None]:
def assign(left, right):
    """Wrap ``right`` in a new ``torch.nn.Parameter`` after a shape check.

    Args:
        left: Existing parameter/tensor; only its shape is used.
        right: Replacement values, either array-like (e.g. a NumPy array) or
            a ``torch.Tensor``.

    Returns:
        A new ``torch.nn.Parameter`` holding a copy of ``right``.

    Raises:
        ValueError: If ``left`` and ``right`` have different shapes.
    """
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch, Left: {left.shape}, right: {right.shape}")
    if isinstance(right, torch.Tensor):
        # torch.tensor(<tensor>) emits a UserWarning; clone().detach() is the
        # recommended way to copy an existing tensor.
        return torch.nn.Parameter(right.clone().detach())
    return torch.nn.Parameter(torch.tensor(right))

In [None]:
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a tensor with a leading batch axis.

    ``<|endoftext|>`` is passed to the tokenizer as an allowed special token.
    """
    token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    # Add a batch dimension in front: (n_tokens,) -> (1, n_tokens).
    return torch.unsqueeze(torch.tensor(token_ids), 0)

In [None]:
def token_ids_to_text(token_ids, tokenizer):
    """Drop the leading batch axis of ``token_ids`` and decode them to text."""
    flat_ids = token_ids.squeeze(0).tolist()
    return tokenizer.decode(flat_ids)

In [None]:
settings

In [None]:
# params

In [None]:
params["wpe"].shape

In [None]:
params["wte"].shape

In [None]:
params.keys()

In [None]:
params["blocks"][0]["attn"]["c_attn"]["w"].shape

In [None]:
a = params["blocks"][0]["attn"]["c_attn"]["w"]
q, k, v = np.split(a, 3, axis=-1)
q.shape, k.shape, v.shape

In [None]:
params["blocks"][0]["attn"]["c_attn"]["b"].shape

In [None]:
a = params["blocks"][0]["attn"]["c_attn"]["b"]
q, k, v = np.split(a, 3, axis=0)
q.shape, k.shape, v.shape

In [None]:
params["blocks"][0]["attn"].keys()

In [None]:
params["blocks"][0]["attn"]["c_proj"]["b"].shape

In [None]:
params["blocks"][0].keys()

In [None]:
params["blocks"][0]["mlp"]["c_fc"]["w"].shape, params["blocks"][0]["mlp"]["c_fc"][
    "b"
].shape

In [None]:
params["blocks"][0]["mlp"]["c_proj"]["w"].shape, params["blocks"][0]["mlp"]["c_proj"][
    "b"
].shape

In [None]:
params["blocks"][0]["ln_1"]["g"].shape, params["blocks"][0]["ln_1"]["b"].shape

In [None]:
params.keys()

In [None]:
def load_weights_into_gpt(gpt: GPTModel, params):
    """Copy GPT-2 checkpoint arrays from ``params`` into ``gpt``.

    Args:
        gpt: Model whose parameters are replaced.
        params: Nested dict with top-level keys ``"wpe"``, ``"wte"``, ``"g"``,
            ``"b"`` and ``"blocks"`` (one dict per transformer block with
            ``"attn"``, ``"mlp"``, ``"ln_1"`` and ``"ln_2"`` entries).

    Raises:
        ValueError: Propagated from ``assign`` when a checkpoint array does
            not have the shape of the parameter it replaces.
    """
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])

    for b in range(len(params["blocks"])):
        block = gpt.trf_blocks[b]
        block_params = params["blocks"][b]
        attn_params = block_params["attn"]
        mlp_params = block_params["mlp"]

        # Multi head Attention layer
        # The checkpoint stores query/key/value fused along the last axis.
        # Weight matrices are transposed to match nn.Linear's
        # (out_features, in_features) layout.
        q_w, k_w, v_w = np.split(attn_params["c_attn"]["w"], 3, axis=-1)
        block.att.W_query.weight = assign(block.att.W_query.weight, q_w.T)
        block.att.W_key.weight = assign(block.att.W_key.weight, k_w.T)
        block.att.W_value.weight = assign(block.att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(attn_params["c_attn"]["b"], 3, axis=-1)
        block.att.W_query.bias = assign(block.att.W_query.bias, q_b)
        block.att.W_key.bias = assign(block.att.W_key.bias, k_b)
        block.att.W_value.bias = assign(block.att.W_value.bias, v_b)

        block.att.out_proj.weight = assign(
            block.att.out_proj.weight, attn_params["c_proj"]["w"].T
        )
        block.att.out_proj.bias = assign(
            block.att.out_proj.bias, attn_params["c_proj"]["b"]
        )

        # FF layer (indices 0 and 2 are the two linear layers of the MLP)
        block.ff.layers[0].weight = assign(
            block.ff.layers[0].weight, mlp_params["c_fc"]["w"].T
        )
        block.ff.layers[0].bias = assign(
            block.ff.layers[0].bias, mlp_params["c_fc"]["b"]
        )
        block.ff.layers[2].weight = assign(
            block.ff.layers[2].weight, mlp_params["c_proj"]["w"].T
        )
        block.ff.layers[2].bias = assign(
            block.ff.layers[2].bias, mlp_params["c_proj"]["b"]
        )

        # Norm layers (owned by the sublayer wrappers)
        block.sublayer1.norm.scale = assign(
            block.sublayer1.norm.scale, block_params["ln_1"]["g"]
        )
        block.sublayer1.norm.shift = assign(
            block.sublayer1.norm.shift, block_params["ln_1"]["b"]
        )
        block.sublayer2.norm.scale = assign(
            block.sublayer2.norm.scale, block_params["ln_2"]["g"]
        )
        block.sublayer2.norm.shift = assign(
            block.sublayer2.norm.shift, block_params["ln_2"]["b"]
        )

    # Final norm and output layer weight. These do not depend on the block
    # index, so they are assigned once after the loop rather than once per
    # block. The output head receives its own copy of the token embeddings.
    gpt.final_ln.scale = assign(gpt.final_ln.scale, params["g"])
    gpt.final_ln.shift = assign(gpt.final_ln.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

In [None]:
params["wte"].shape

In [None]:
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Every effort moves"

BASE_CONFIG = {
    "vocab_size": 50257,  # Vocabulary size
    "context_length": 1024,  # Context length
    "drop_rate": 0.0,  # Dropout rate
    "qkv_bias": True,  # Query-key-value bias
}

model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

In [None]:
from utils.gpt_model import GPTModel, load_weights_into_gpt

In [None]:
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)

In [None]:
# model.eval()

In [None]:
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids("Every effort moves you", tokenizer),
    max_new_tokens=150,
    context_size=1024,
)

In [None]:
# print(token_ids_to_text(token_ids, tokenizer))

### Initialize the model with pretrained weights

In [None]:
# Key of the model variant to use. It must be spelled CHOOSE_MODEL because the
# lookup below reads that exact name; a misspelled name would silently fall
# back to a value left over from an earlier cell.
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Every effort moves"
# Settings shared by every model size.
BASE_CONFIG = {
    "vocab_size": 50257,
    "context_length": 1024,
    "drop_rate": 0.0,
    "qkv_bias": True,
}
# Size-specific settings, merged into BASE_CONFIG for the chosen model.
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
}
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])
BASE_CONFIG

In [None]:
from utils.gpt_download import download_and_load_gpt2
from utils.gpt_model import (
    generate_text_simple,
    GPTModel,
    load_weights_into_gpt,
    text_to_token_ids,
    token_ids_to_text,
)

model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
model_size

In [None]:
settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval()

In [None]:
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids("Every effort moves you", tokenizer),
    max_new_tokens=15,
    context_size=BASE_CONFIG["context_length"],
)

print(token_ids_to_text(token_ids, tokenizer))

In [None]:
text_2 = """
    Is the following text 'spam'? Answer with 'yes' or 'no':
    'You are a winner you have been specially selected to receive $1000 cash or a $2000 award.'
    Answer with 'yes' or 'no'.
"""
print(text_2)

In [None]:
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(text_2, tokenizer),
    max_new_tokens=15,
    context_size=BASE_CONFIG["context_length"],
)
print(token_ids_to_text(token_ids, tokenizer))

In [None]:
print(model)

In [None]:
len(list(model.parameters()))

In [None]:
# freeze the model
for param in model.parameters():
    param.requires_grad = False

In [None]:
torch.manual_seed(123)

num_classes = 2
model.out_head = nn.Linear(in_features=BASE_CONFIG["emb_dim"], out_features=num_classes)

In [None]:
print(model)

In [None]:
for param in model.parameters():
    if param.requires_grad:
        print(param)

In [None]:
for param in model.trf_blocks[-1].parameters():
    param.requires_grad = True

for param in model.final_ln.parameters():
    param.requires_grad = True

In [None]:
# Count the parameter tensors that are still trainable. The counter has its
# own name so the pretrained-weights dict `params` is not overwritten.
num_trainable_tensors = sum(1 for param in model.parameters() if param.requires_grad)

print(num_trainable_tensors)

In [None]:
inputs = tokenizer.encode("How are you doing")
inputs = torch.tensor(inputs).unsqueeze(0)
inputs.shape

In [None]:
with torch.no_grad():
    output = model(inputs)

print(output)
print(output.shape)

In [None]:
output[-1, :, :].shape

In [None]:
# Classification reads the logits of the LAST TOKEN of every sequence
# (axis 1), matching `model(input_batch)[:, -1, :]` used for accuracy and loss.
logits = output[:, -1, :]
label = torch.argmax(logits, dim=-1, keepdim=True)
label.shape

In [None]:
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    """Classification accuracy (in percent) over the first batches of a loader.

    The model is switched to eval mode and is left in that mode. Predictions
    are the argmax of the logits at the last token position.

    Args:
        data_loader: Iterable of ``(input_batch, target_batch)`` supporting ``len``.
        model: Module mapping inputs to logits of shape (batch, tokens, classes).
        device: Device the batches are moved to.
        num_batches: Maximum number of batches to evaluate; ``None`` means all.

    Returns:
        The accuracy formatted as a string with two decimals (e.g. ``"50.00"``),
        or ``"nan"`` when no example was evaluated.
    """
    model.eval()
    correct_predictions, num_examples = 0, 0

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break

        input_batch, target_batch = input_batch.to(device), target_batch.to(device)

        with torch.no_grad():
            logits = model(input_batch)[:, -1, :]  # (batch_size, num_classes)

        predicted_labels = torch.argmax(logits, dim=-1)  # (batch_size)
        num_examples += predicted_labels.shape[0]
        correct_predictions += (predicted_labels == target_batch).sum().item()

    # An empty loader or num_batches=0 would otherwise divide by zero.
    if num_examples == 0:
        return f"{float('nan'):.2f}"

    accuracy = (correct_predictions / num_examples) * 100
    return f"{accuracy:.2f}"

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
torch.manual_seed(123)

train_accuracy = calc_accuracy_loader(train_loader, model, device)
valid_accuracy = calc_accuracy_loader(val_loader, model, device)
test_accuracy = calc_accuracy_loader(test_loader, model, device)

print(train_accuracy, valid_accuracy, test_accuracy)

In [None]:
# Sanity check: cross-entropy loss of the first training batch.
for input_batch, label_batch in train_loader:
    # The model was moved to `device`, so the batch must be on it as well;
    # otherwise the forward pass fails when `device` is a GPU.
    input_batch, label_batch = input_batch.to(device), label_batch.to(device)
    print(label_batch.shape)
    logits = model(input_batch)[:, -1, :]  # logits at the last token position
    print(logits.shape)
    loss = torch.nn.functional.cross_entropy(logits, label_batch)
    print(label_batch)
    print(loss)
    break

In [None]:
target = torch.randint(5, (3,), dtype=torch.int64)
target

In [None]:
import torch.nn.functional as F

In [None]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on a single batch.

    Both tensors are moved to ``device``; only the logits of the last token
    position (``[:, -1, :]``) are compared against ``target_batch``.

    Args:
        input_batch: Tensor of model inputs for one batch.
        target_batch: Tensor of class indices, one per example.
        model: Callable returning logits indexable as ``[:, -1, :]``.
        device: Device both tensors are moved to.

    Returns:
        The loss tensor produced by ``F.cross_entropy`` (still attached to the
        autograd graph unless the caller disabled gradients).
    """
    inputs_on_device = input_batch.to(device)
    targets_on_device = target_batch.to(device)
    last_token_logits = model(inputs_on_device)[:, -1, :]
    return F.cross_entropy(last_token_logits, targets_on_device)

In [None]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Return the mean per-batch loss of ``model`` over ``data_loader``.

    Gradient tracking is not disabled here; callers that only want the number
    should wrap the call in ``torch.no_grad()``.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, target_batch)``.
        model: Model passed through to ``calc_loss_batch``.
        device: Device passed through to ``calc_loss_batch``.
        num_batches: Optional cap on the number of batches averaged. ``None``
            means the whole loader; larger values are clipped to
            ``len(data_loader)``.

    Returns:
        float: Sum of the batch losses divided by the number of batches used,
        or ``float("nan")`` when there is nothing to average (empty loader or
        ``num_batches <= 0``) instead of raising ``ZeroDivisionError``.
    """
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    # Nothing to average: avoid dividing by zero below.
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.0
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        # .item() converts the loss tensor to a Python float before summing.
        total_loss += calc_loss_batch(input_batch, target_batch, model, device).item()

    return total_loss / num_batches

In [None]:
# Mean training loss over at most the first 10 batches.
calc_loss_loader(train_loader, model, device, num_batches=10)

In [None]:
# Loss on at most the first 5 batches of each split, with gradients disabled.
with torch.no_grad():
    train_loss, val_loss, test_loss = (
        calc_loss_loader(loader, model, device, 5)
        for loader in (train_loader, val_loader, test_loader)
    )

print(f"Train loss: {train_loss:.3f}")
print(f"valid loss: {val_loss:.3f}")
print(f"Test loss: {test_loss:.3f}")

### Finetune the model to improve the loss value

In [None]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return ``(train_loss, val_loss)`` estimated on at most ``eval_iter`` batches.

    The model is put into eval mode for the measurement and switched back to
    train mode before returning. Gradient tracking is disabled while the
    losses are computed.
    """
    model.eval()
    with torch.no_grad():
        # Training loader first, then validation loader.
        train_and_val_loss = tuple(
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        )
    model.train()
    return train_and_val_loss

In [None]:
def train_classifier_simple(
    model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter
):
    """Fine-tune ``model`` and record losses and accuracies along the way.

    Args:
        model: Model to train; updated in place.
        train_loader: Loader yielding ``(input_batch, target_batch)`` for training.
        val_loader: Loader used for validation loss and accuracy.
        optimizer: Optimizer whose ``zero_grad``/``step`` drive the updates.
        device: Device passed to the loss and accuracy helpers.
        num_epochs: Number of full passes over ``train_loader``.
        eval_freq: Losses are evaluated whenever ``global_step % eval_freq == 0``.
        eval_iter: Maximum number of batches used for each loss/accuracy estimate.

    Returns:
        tuple: ``(train_losses, val_losses, train_accs, val_accs, examples_seen)``.
        The loss lists get one entry per evaluation step, the accuracy lists one
        entry per epoch (values exactly as returned by ``calc_accuracy_loader``),
        and ``examples_seen`` is the total number of training examples processed.
    """
    train_losses, val_losses = [], []
    train_accs, val_accs = [], []
    examples_seen = 0
    # Starts at -1 so the first optimizer step is step 0 and gets evaluated.
    global_step = -1

    for epoch in range(num_epochs):
        # The per-epoch accuracy pass below leaves the model in eval mode.
        model.train()

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
            batch_loss.backward()
            optimizer.step()

            examples_seen += input_batch.shape[0]
            global_step += 1

            # Periodic loss evaluation; skip every other step.
            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter
            )
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            print(
                f"Epoch {epoch + 1} step {global_step:06d}, train_loss: {train_loss:.3f}, val_loss: {val_loss:.3f}"
            )

        # Calculate accuracy after every epoch.
        train_accuracy = calc_accuracy_loader(
            train_loader, model, device, num_batches=eval_iter
        )
        val_accuracy = calc_accuracy_loader(
            val_loader, model, device, num_batches=eval_iter
        )
        train_accs.append(train_accuracy)
        val_accs.append(val_accuracy)
        print(f"train accuracy: {train_accuracy}, val_accuracy: {val_accuracy}")

    return train_losses, val_losses, train_accs, val_accs, examples_seen

In [None]:
# Number of parameter tensors that will receive gradients.
sum(1 for p in model.parameters() if p.requires_grad)

In [None]:
import time

# Wall-clock start of the training run; the elapsed time is derived from it later.
start_time = time.time()
# Seed PyTorch's RNG right before training (presumably for reproducible runs).
torch.manual_seed(123)

# AdamW over all of the model's parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)
num_epochs = 5
train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
    model,
    train_loader,
    val_loader,
    optimizer,
    device,
    num_epochs=num_epochs,
    eval_freq=50,  # evaluate the losses whenever global_step % 50 == 0
    eval_iter=5,  # at most 5 batches per loss/accuracy estimate
)

In [None]:
# NOTE(review): start_time is set in the training cell but end_time is taken
# here, so any delay between the two cells is counted as training time.
end_time = time.time()
# Elapsed wall-clock time converted from seconds to minutes.
train_duration = (end_time - start_time) / 60
print(f"Training completed in {train_duration:.2f} mins")

### Plot loss functions

In [None]:
# Number of recorded training / validation loss values (one per evaluation step).
len(train_losses), len(val_losses)

In [None]:
# Number of recorded training / validation accuracy values (one per epoch).
len(train_accs), len(val_accs)

In [None]:
# Batches per pass over train_loader, its batch size, and the training-set size.
len(train_loader), train_loader.batch_size, len(train_dataset)

In [None]:
# Expected length of the loss arrays: total optimizer steps
# (epochs * batches per epoch) divided by the evaluation frequency
# (eval_freq=50 in the training call). Derived from the live objects so the
# check does not go stale when the data or the epoch count changes.
(num_epochs * len(train_loader)) / 50

In [None]:
# Examples actually processed vs. batches per epoch * batch size * epochs,
# computed from the loader and num_epochs rather than hard-coded numbers.
examples_seen, (len(train_loader) * train_loader.batch_size * num_epochs)

In [None]:
# One x-position per recorded loss value, evenly spaced from 1 to num_epochs.
epochs_tensor = torch.linspace(start=1, end=num_epochs, steps=len(train_losses))
epochs_tensor

In [None]:
# Matching axis in units of training examples, evenly spaced from 1 to examples_seen.
examples_seen_tensor = torch.linspace(
    start=1, end=examples_seen, steps=len(train_losses)
)
examples_seen_tensor

In [None]:
# Recorded training losses, how many there are, and the shape of the
# examples-seen axis built for them.
train_losses, len(train_losses), examples_seen_tensor.shape

In [None]:
import matplotlib.pyplot as plt


def plot_values(epochs, examples_seen, train_values, val_values, label="loss"):
    """Plot training and validation curves against epochs and examples seen.

    Args:
        epochs: x-positions (in epochs) for the bottom axis.
        examples_seen: x-positions (in examples) for the secondary top axis.
        train_values: y-values of the training curve.
        val_values: y-values of the validation curve.
        label: Metric name used in the legend, the y-axis label (capitalized)
            and the output file name.

    Side effects:
        Saves the figure as ``"<label>-plot.pdf"`` in the working directory and
        displays it.
    """
    figure, epoch_axis = plt.subplots(figsize=(5, 3))

    # Solid line for training, dash-dot line for validation.
    epoch_axis.plot(epochs, train_values, label=f"Training {label}")
    epoch_axis.plot(epochs, val_values, linestyle="-.", label=f"Validation {label}")
    epoch_axis.set(xlabel="Epochs", ylabel=label.capitalize())
    epoch_axis.legend()

    # Second x-axis sharing the same y-axis; the fully transparent curve only
    # exists to give that axis its range.
    examples_axis = epoch_axis.twiny()
    examples_axis.plot(examples_seen, train_values, alpha=0)
    examples_axis.set_xlabel("Examples seen")

    figure.tight_layout()
    figure.savefig(f"{label}-plot.pdf")
    plt.show()

In [None]:
# Loss curves; `label` keeps its default, so the file is saved as "loss-plot.pdf".
plot_values(
    epochs=epochs_tensor,
    examples_seen=examples_seen_tensor,
    train_values=train_losses,
    val_values=val_losses,
)

In [None]:
# Accuracies are recorded once per epoch, after the epoch has finished, so the
# k-th value belongs to epoch k ...
epochs_tensor = torch.linspace(1, num_epochs, len(train_accs))
# ... and to k * (examples per epoch) examples seen. Starting this axis at 1
# would pair "epoch 1" with "1 example seen" and misalign the top axis.
examples_seen_tensor = torch.linspace(
    examples_seen / num_epochs, examples_seen, len(train_accs)
)

In [None]:
# Inspect the epoch axis used for the accuracy plot.
epochs_tensor

In [None]:
# Inspect the examples-seen axis used for the accuracy plot.
examples_seen_tensor

In [None]:
# calc_accuracy_loader returns formatted strings, so convert them to floats
# before plotting.
plot_values(
    epochs_tensor,
    examples_seen_tensor,
    list(map(float, train_accs)),
    list(map(float, val_accs)),
    label="Accuracy",
)

In [None]:
# Validation accuracies as floats (they are stored as formatted strings).
list(map(float, val_accs))

In [None]:
# Accuracy over every batch of each split (no num_batches cap), evaluated in
# the order train, validation, test.
train_accuracy, val_accuracy, test_accuracy = (
    calc_accuracy_loader(loader, model, device)
    for loader in (train_loader, val_loader, test_loader)
)

train_accuracy, val_accuracy, test_accuracy

### Use LLM as spam classifier

In [None]:
# Print the module structure of the fine-tuned model.
print(model)

In [None]:
# Number of rows in the positional-embedding table; classify_review uses this
# value as the model's supported context length.
model.pos_emb.weight.size(0)

In [None]:
def classify_review(
    text, model, tokenizer, device, max_length=None, pad_token_id=50256
):
    """Classify ``text`` as ``"spam"`` or ``"not spam"``.

    The text is tokenized, truncated to at most ``max_length`` tokens and
    right-padded with ``pad_token_id`` to exactly that length before a single
    forward pass without gradient tracking.

    Args:
        text: Raw string to classify.
        model: Classifier exposing ``pos_emb.weight``; it is put into eval mode.
        tokenizer: Object whose ``encode(text)`` returns a list of token ids.
        device: Device on which the input tensor is created.
        max_length: Sequence length fed to the model. Capped at the number of
            rows of ``model.pos_emb.weight``; ``None`` uses that number.
        pad_token_id: Token id used for right-padding.

    Returns:
        tuple: ``(label, logits)`` where ``label`` is ``"not spam"`` when the
        argmax over the last-position logits is 0 and ``"spam"`` otherwise,
        and ``logits`` are those last-position logits.
    """
    model.eval()

    token_ids = tokenizer.encode(text)
    # Rows of the positional-embedding table bound the usable sequence length.
    # (Original author's note: an earlier implementation had a mistake here.)
    context_limit = model.pos_emb.weight.shape[0]
    max_length = context_limit if max_length is None else min(max_length, context_limit)

    # Truncate, then right-pad up to exactly max_length tokens.
    token_ids = token_ids[:max_length]
    padding = [pad_token_id] * (max_length - len(token_ids))
    # unsqueeze(0) adds the batch dimension.
    batch = torch.tensor(token_ids + padding, device=device).unsqueeze(0)

    with torch.no_grad():
        # Logits of the last position only.
        logits = model(batch)[:, -1, :]

    if torch.argmax(logits, dim=-1).item() == 0:
        return "not spam", logits
    return "spam", logits

In [None]:
text_1 = "How are you doing"

# Pad/truncate to the same length the training dataset used.
classify_review(
    text=text_1,
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_length=train_dataset.max_length,
)

In [None]:
# A message that is expected to be ordinary (non-spam) text.
text_2 = "Hey, just wanted to check if we're still on for dinner tonight? Let me know!"

print(
    classify_review(
        text=text_2,
        model=model,
        tokenizer=tokenizer,
        device=device,
        max_length=train_dataset.max_length,
    )
)

In [None]:
# Sequence length of the training dataset; passed as max_length to classify_review.
train_dataset.max_length

In [None]:
# A typical prize-scam message. Note that this rebinds `text_1`.
text_1 = (
    "You are a winner you have been specially"
    " selected to receive $1000 cash or a $2000 award."
)

print(
    classify_review(
        text=text_1,
        model=model,
        tokenizer=tokenizer,
        device=device,
        max_length=train_dataset.max_length,
    )
)

In [None]:
# Save only the model's state dict (not the whole module object) to a file in
# the current working directory.
torch.save(model.state_dict(), "spam_review_classifier.pth")

In [None]:
# Reload the fine-tuned weights.
# - map_location=device lets a checkpoint saved on a GPU load on a CPU-only
#   machine (and vice versa) instead of failing on the stored device tags.
# - weights_only=True restricts unpickling to tensors and plain containers,
#   which is all a state dict needs and avoids running arbitrary pickled code.
model_state_dict = torch.load(
    "spam_review_classifier.pth", map_location=device, weights_only=True
)
model.load_state_dict(model_state_dict)

In [None]:
# Note: max_length matters a lot. Padding an input with too many pad tokens leads the model to classify every input as spam.