# Import

In [34]:
import pandas as pd
import os
import torch
import tiktoken
from llm.previous_chapters import *

if torch.cuda.is_available():
    print("gpu available")
else:
    print("no gpu")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

gpu available


In [5]:
root_dataset_path = os.path.join(os.getcwd(), "datasets", "spam_dataset")
dataset_path = os.path.join(root_dataset_path, "SMSSpamCollection")
assert os.path.exists(dataset_path), f"path to dataset not exists {dataset_path}"

In [6]:
df = pd.read_csv(dataset_path, sep="\t", header=None, names =['label', 'text'])
print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
#undersample for testing
def create_balanced_dataset(df: pd.DataFrame):
    num_spam = df[df["label"] == "spam"].shape[0]
    ham_subset = df[df["label"] == "ham"].sample(num_spam, random_state=123)
    #make # of ham == # spam
    balanced_df = pd.concat([ham_subset, df[df["label"] == 'spam']])
    return balanced_df

balanced_df = create_balanced_dataset(df)
print(balanced_df['label'].value_counts())

label
ham     747
spam    747
Name: count, dtype: int64


In [8]:
def random_split(df, train_frac, valid_frac):
    #shuffle
    df = df.sample(frac = 1, random_state = 123).reset_index(drop=True)

    train_idx = int(train_frac * len(df))
    val_idx = train_idx + int(valid_frac * len(df))
    #the rest is test

    train_df = df[:train_idx]
    val_df = df[train_idx:val_idx]
    test_df = df[val_idx:]

    return train_df, val_df, test_df

train_df, val_df, test_df = random_split(df, 0.7, 0.1)

print("train df shape = ", train_df.shape)
print("valid df shape = ", val_df.shape)
print("test df shape = ", test_df.shape)

train_df.to_csv(os.path.join(root_dataset_path, "sample_train.csv"), index = False)
val_df.to_csv(os.path.join(root_dataset_path, "sample_val.csv"), index = False)
test_df.to_csv(os.path.join(root_dataset_path, "sample_test.csv"), index = False)

train df shape =  (3900, 2)
valid df shape =  (557, 2)
test df shape =  (1115, 2)


In [9]:
#Map label
balanced_df['label'] = balanced_df['label'].map({'ham': 0, 'spam': 1})
balanced_df.head()

Unnamed: 0,label,text
4307,0,Awww dat is sweet! We can think of something t...
4138,0,Just got to &lt;#&gt;
4831,0,"The word ""Checkmate"" in chess comes from the P..."
4461,0,This is wishing you a great day. Moji told me ...
5440,0,Thank you. do you generally date the brothas?


# Pad tokens with longest token len

In [10]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
pad_id = tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})[0]
print(pad_id)

50256


# Build dataset and dataloader

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader

In [56]:
a = [1,2]
a = a + [0] * 3
print(a)

[1, 2, 0, 0, 0]


In [57]:
class SpamDataset(Dataset):

    def __init__(self, csv, tokenizer, pad_token_id, max_len = 200):
        self.df = pd.read_csv(csv)
        self.df.drop(0, inplace=True)
        self.df['label'] = self.df['label'].map({'ham': 0, 'spam': 1})
        self.pad_token_id = pad_token_id
        self.max_len = max_len


        self.encoded_text = [self._pad_tokens(tokenizer.encode(text)) for text in self.df['text']]

    def _pad_tokens(self, tokens):
        tokens = tokens + [self.pad_token_id] * (self.max_len - len(tokens))
        return tokens[:self.max_len]
    
    def __getitem__(self, index):
        X = self.encoded_text[index]
        y = self.df.iloc[index]['label']
        
        return (torch.tensor(X).long(), torch.tensor(y).long())
    
    def __len__(self):
        return len(self.encoded_text)
    
    def get_dataloader(self, batch_size, num_workers, drop_last = False):
        return DataLoader(dataset = self, batch_size=batch_size, num_workers=num_workers, drop_last = drop_last)

In [58]:
train_dataset = SpamDataset(os.path.join(root_dataset_path, "sample_train.csv"), tokenizer, pad_id, max_len = 100)
train_dataset.df.head()
print(train_dataset[0])
print(len(train_dataset))


(tensor([ 5122,  1736,  1077,   415, 17442,   272,   338,  2802,  3804,  1497,
          938,  1755,    13, 12472,   329,   607,   290,  1641,    13, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]), tensor(0))
3899


In [59]:
val_dataset = SpamDataset(os.path.join(root_dataset_path, "sample_val.csv"), tokenizer, pad_id, max_len = 100)
print(len(val_dataset))
test_dataset = SpamDataset(os.path.join(root_dataset_path, "sample_test.csv"), tokenizer, pad_id, max_len = 100)
print(len(test_dataset))

556
1114


In [64]:
#Get dataloader
batch_size = 8
num_workers = 0

train_loader = train_dataset.get_dataloader(batch_size=batch_size, num_workers=num_workers, drop_last=True) #drop last unmatched element for training
val_loader = val_dataset.get_dataloader(batch_size=batch_size, num_workers=num_workers, drop_last=False)
test_loader = test_dataset.get_dataloader(batch_size=batch_size, num_workers=num_workers, drop_last=False)

print("train loader len = ", len(train_loader))
sample = next(iter(train_loader))
print(f"X shape = {sample[0].shape}, y shape = {sample[1].shape}")
print("val loader len = ", len(val_loader))
print("test loader len = ", len(test_loader))

train loader len =  487
X shape = torch.Size([8, 100]), y shape = torch.Size([8])
val loader len =  70
test loader len =  140


In [66]:
# Just checking the correctness of loader 
xshape = None
yshape = None
for i, (X,y) in enumerate(train_loader):
    if xshape == None or yshape == None: 
        xshape = X.shape
        yshape = y.shape
    else:
        if xshape != X.shape: raise Exception(f"shape not consistent, new shape = {X.shape}, old shape = {xshape}")



In [63]:
print("train loader len = ", len(train_loader))
print("val loader len = ", len(val_loader))
print("test loader len = ", len(test_loader))

train loader len =  487
val loader len =  70
test loader len =  140


# Loading pretrained model. 
Code in Chapt 5, load from book's code, my GPT code is messed up
because my GPTCode is slightly different than theirs

In [17]:
BASE_CONFIG = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "drop_rate": 0.0,        # Dropout rate
    "qkv_bias": True         # Query-key-value bias
}
CHOOSE_MODEL = "gpt2-small (124M)"

model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

In [18]:
gpt = torch.load(os.path.join("output", 'gpt.torch'))

In [19]:
text = "we are the world" 
tokens = text_to_token_ids(text, tokenizer)

output_tokens = generate(gpt, tokens, max_new_tokens=10, context_size=BASE_CONFIG["context_length"], temperature=2.0, top_k=10)

decoded = token_ids_to_text(output_tokens, tokenizer)

print(decoded)

we are the world's only two states that can afford the kind of


In [20]:
# Sample instruction
text2 = ("Is the following spam, answer with yes or no", 
"'You are a winner you have been specially selected to receive $1000 cash or a $2000 award.'")

tokens = text_to_token_ids(text, tokenizer)

output_tokens = generate(gpt, tokens, max_new_tokens=10, context_size=BASE_CONFIG["context_length"], temperature=2.0, top_k=10)

decoded = token_ids_to_text(output_tokens, tokenizer)

print(decoded)

we are the world's largest producer and supplier. The company employs nearly


In [74]:

def load_model(path):
    gpt = torch.load(path)

    num_classes = 2
    #Freeze model params for finetuning
    for param in gpt.parameters():
        param.requires_grad = False

    gpt.out_head = nn.Linear(
        BASE_CONFIG["emb_dim"],
        out_features=num_classes
    )

    #NOTE from the book: 
    # training the output layer we just added is sufficient. 
    # However, as I found in experiments, finetuning additional layers can 
    # noticeably improve the predictive performance of the finetuned model.

    for p in gpt.trf_blocks[-1].parameters():
        p.requires_grad = True
    for p in gpt.final_norm.parameters():
        p.requires_grad = True

    gpt.to(device)
    return gpt
    

In [37]:
# test new model architecture
text = [
    "This definitely a spam email", 
    "this is not spam email"
    ]
ids = [text_to_token_ids(t, tokenizer).squeeze(0) for t in text]
ids = torch.vstack(ids).to(device)

print('input shape = ', ids.shape)

with torch.no_grad():
    preds = gpt(ids)

print('output shape = ', preds.shape)
# we want to use last output token [-1] to get optimize
# probas = torch.softmax(preds[:, -1, :], dim = -1)
# print(f"probas ({probas.shape})= ", probas)
logits = preds[:, -1, :]
label = torch.argmax(logits, dim = -1)

print('labels = ', label)

input shape =  torch.Size([2, 5])
output shape =  torch.Size([2, 5, 2])
labels =  tensor([1, 1], device='cuda:0')


# Evaluation function

In [43]:
def calc_accuracy_loader(dataloader:DataLoader, model:GPTModel, device, num_batches = None):
    model.eval()

    correct_pred, num_examples = 0,0

    if num_batches != None: 
        num_batches = min(num_batches, len(dataloader))
    else:
        num_batches = len(dataloader)
    
    for i, (X, y) in enumerate(dataloader):
        if i >= num_batches: break

        X, y = X.to(device), y.to(device)

        with torch.no_grad():
            logits = model(X)[:,-1,:]
        
        preds = torch.argmax(logits, dim = -1)

        num_examples += preds.shape[0]

        correct_pred += (preds == y).sum().item()
    
    return correct_pred / num_examples

def calc_loss_loader(dataloader:DataLoader, model:GPTModel, device, num_batches = None):
    model.eval()

    total_loss, num_examples = 0,0

    if num_batches != None: 
        num_batches = min(num_batches, len(dataloader))
    else:
        num_batches = len(dataloader)
    
    for i, (X, y) in enumerate(dataloader):
        if i >= num_batches: break

        X, y = X.to(device), y.to(device)

        with torch.no_grad():
            logits = model(X)[:,-1,:]

        num_examples += preds.shape[0]
        total_loss  += calc_loss_batch(X, y, model, device)
    
    return total_loss / num_examples

def calc_loss_batch(X: torch.tensor, y, model: torch.tensor, device: GPTModel):
    X, y = X.to(device), y.to(device)
    logits = model(X)[:, -1, :]
    loss = torch.nn.functional.cross_entropy(logits, y)
    return loss


def eval_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():

        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    
    model.train()
    return train_loss, val_loss

In [39]:
train_accuracy = calc_accuracy_loader(train_loader, gpt, device = device, num_batches= 10)
print("train acc = ", train_accuracy)

train acc =  0.1375


In [40]:
X, y = next(iter(train_loader))
train_loss = calc_loss_batch(X, y, gpt, device = device)
print("train loss = ", train_loss)

train loss =  tensor(4.8201, device='cuda:0', grad_fn=<NllLossBackward0>)


In [44]:
train_loss = calc_loss_loader(train_loader, gpt, device = device, num_batches= 10)
print("train loss = ", train_loss)

train loss =  tensor(2.3591, device='cuda:0', grad_fn=<DivBackward0>)


In [45]:
train_loss, val_loss = eval_model(gpt, train_loader,  val_loader, device, eval_iter=5)
print('train loss = ', train_loss)
print('val loss = ', val_loss)

train loss =  tensor(2.4640, device='cuda:0')
val loss =  tensor(2.1279, device='cuda:0')


# Training loop

In [67]:
from tqdm import tqdm

def train(
    model: GPTModel, train_loader: DataLoader, val_loader: DataLoader, 
    optimizer, device, tokenizer, num_epochs = 10, eval_freq = 1, eval_iter = 1):

    train_losses, val_losses, train_accs, val_accs = [],[],[],[]
    examples_seen, global_steps = 0,-1
    loop = tqdm(range(num_epochs))

    try:
        for e in loop:

            model.train()

            for X,y in train_loader:
                optimizer.zero_grad()
                loss = calc_loss_batch(X, y, model, device)
                loss.backward()
                optimizer.step()

                examples_seen += X.shape[0]

                global_steps += 1
            
                if global_steps % eval_freq == 0: 
                    train_loss, val_loss = eval_model(model, train_loader, val_loader, device, eval_iter)

                    train_losses.append(train_loss)
                    val_losses.append(val_loss)
                
                    print(f"epochs = {e}, global step ={global_steps}, train_loss={train_loss:.3f}, val_loss={val_loss:.3f}")
                
            train_acc = calc_accuracy_loader(train_loader, model, device,num_batches=eval_iter)
            val_acc = calc_accuracy_loader(val_loader, model, device,num_batches=eval_iter)

            loop.set_description(f"train acc = {train_acc:.3f}, val_acc = {val_acc:.3f}")

            train_accs.append(train_acc)
            val_accs.append(val_acc)
    except Exception as e:
        print('x shape = ', X.shape)
        print('y shape = ', y.shape)
        raise e 
    
    return train_losses, val_losses, train_accs, val_accs, examples_seen


In [77]:
import time

gpt = load_model(os.path.join("output", 'gpt.torch'))
start_time = time.time()
optimizer = torch.optim.AdamW(gpt.parameters(), lr = 5e-5, weight_decay=0.1)
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
print('device = ', device)

num_epochs = 5

train_losses, val_losses, train_accs, val_accs, examples_seen = \
train(gpt, train_loader, val_loader, optimizer = optimizer, 
tokenizer=tokenizer, device = device, eval_freq = 100, eval_iter = 10, num_epochs=3)


device =  cuda


  0%|          | 0/3 [00:00<?, ?it/s]

epochs = 0, global step =0, train_loss=0.413, val_loss=0.412
epochs = 0, global step =100, train_loss=0.187, val_loss=0.184
epochs = 0, global step =200, train_loss=0.120, val_loss=0.115
epochs = 0, global step =300, train_loss=0.058, val_loss=0.023
epochs = 0, global step =400, train_loss=0.029, val_loss=0.029


train acc = 0.988, val_acc = 0.988:  33%|███▎      | 1/3 [00:26<00:52, 26.48s/it]

epochs = 1, global step =500, train_loss=0.028, val_loss=0.028
epochs = 1, global step =600, train_loss=0.027, val_loss=0.039
epochs = 1, global step =700, train_loss=0.026, val_loss=0.030
epochs = 1, global step =800, train_loss=0.026, val_loss=0.032
epochs = 1, global step =900, train_loss=0.026, val_loss=0.037


train acc = 0.975, val_acc = 0.975:  67%|██████▋   | 2/3 [00:52<00:26, 26.50s/it]

epochs = 2, global step =1000, train_loss=0.022, val_loss=0.024
epochs = 2, global step =1100, train_loss=0.020, val_loss=0.028
epochs = 2, global step =1200, train_loss=0.029, val_loss=0.007
epochs = 2, global step =1300, train_loss=0.020, val_loss=0.023
epochs = 2, global step =1400, train_loss=0.019, val_loss=0.018


train acc = 0.975, val_acc = 0.963: 100%|██████████| 3/3 [01:19<00:00, 26.51s/it]


In [None]:
import matplotlib.pyplot as plt

def plot_values(epochs_seen, examples_seen, train_values, val_values, label="loss"):
    fig, ax1 = plt.subplots(figsize=(5, 3))
                                                                  #A
    ax1.plot(epochs_seen, train_values, label=f"Training {label}")
    ax1.plot(epochs_seen, val_values, linestyle="-.", label=f"Validation {label}")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel(label.capitalize())
    ax1.legend()
#B
    ax2 = ax1.twiny()
    ax2.plot(examples_seen, train_values, alpha=0)  # Invisible plot for aligning ticks
    ax2.set_xlabel("Examples seen")
    fig.tight_layout()                                            #C
    plt.savefig(f"{label}-plot.pdf")
    plt.show()

In [None]:
plot_values(len(train_accs), examples_seen, train_accs, val_accs, label = "accuracy")
plot_values(len(train_accs), examples_seen, train_losses, val_losses, label = "loss")