## Stage 1: Dataset preparation

<img src="https://camo.githubusercontent.com/9059a83fc1e6b03d55f9bfbb2a8ce7252f3ca5c0c88b1e5d4ddd859e1a13c771/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830365f636f6d707265737365642f6f766572766965772d312e77656270" width=700px>

In [1]:
import urllib.request
import zipfile
import os
import pandas as pd
import tiktoken
from pathlib import Path

### 1. Download the dataset

In [2]:
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Downloading the file
    with urllib.request.urlopen(url) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())

    # Unzipping the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # Add .tsv file extension
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

try:
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
    print(f"Primary URL failed: {e}. Trying backup URL...")
    url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) 

sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.


In [3]:
df = pd.read_csv(data_file_path,
                 sep='\t',
                 header=None,
                 names=['Label', 'Text'])
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
print(df['Label'].value_counts())

Label
ham     4825
spam     747
Name: count, dtype: int64


### 2. Preprocess the dataset

This is an unbalanced dataset, we choose to undersample the dataset to include 747 instances from each class.

In [5]:
def create_balanced_dataset(df):
    num_spam = df[df['Label'] == 'spam'].shape[0]
    
    # Randomly sample "ham" instances to match the number of "spam" instances
    ham_subset = df[df['Label'] == 'ham'].sample(num_spam, random_state=123)

    balanced_df = pd.concat([ham_subset, df[df["Label"] == "spam"]])
    return balanced_df

In [6]:
balanced_df = create_balanced_dataset(df)
balanced_df['Label'].value_counts()

Label
ham     747
spam    747
Name: count, dtype: int64

In [7]:
# change 'ham' and 'spam' to 0 and 1 respectively
# > run this cell only once!!
balanced_df['Label'] = balanced_df['Label'].map({'ham': 0, 'spam': 1})
balanced_df

Unnamed: 0,Label,Text
4307,0,Awww dat is sweet! We can think of something t...
4138,0,Just got to &lt;#&gt;
4831,0,"The word ""Checkmate"" in chess comes from the P..."
4461,0,This is wishing you a great day. Moji told me ...
5440,0,Thank you. do you generally date the brothas?
...,...,...
5537,1,Want explicit SEX in 30 secs? Ring 02073162414...
5540,1,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,1,Had your contract mobile 11 Mnths? Latest Moto...
5566,1,REMINDER FROM O2: To get 2.50 pounds free call...


In [8]:
def random_split(df, train_frac, validation_frac):
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)

    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)

    train_df = df[:train_end]
    validation_df = df[train_end:validation_end]
    test_df = df[validation_end:]

    return train_df, validation_df, test_df

In [9]:
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)

In [10]:
# save these dataframes as CSV
train_df.to_csv('train.csv', index=None)
validation_df.to_csv('validation.csv', index=None)
test_df.to_csv('test.csv', index=None)

### 3. Creating data loaders

<img src="https://camo.githubusercontent.com/ece05402ac93677bf7e055dff15138dc28198db2ca2dcd5908a706b9056c7249/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830365f636f6d707265737365642f7061642d696e7075742d73657175656e6365732e776562703f313233" width=700>

In [11]:
tokenizer = tiktoken.get_encoding('gpt2')
print(tokenizer.encode('<|endoftext|>', allowed_special={"<|endoftext|>"}))

[50256]


---

We first create the dataset class.

In [12]:
import torch
from torch.utils.data import Dataset

In [13]:
class SpamDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)
        self.encoded_texts = [tokenizer.encode(text) for text in self.data['Text']]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length
            self.encoded_texts = [encoded_text[:self.max_length] for encoded_text in self.encoded_texts]

        self.encoded_texts = [encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
                              for encoded_text in self.encoded_texts]

    def __getitem__(self, index):
        encoded = self.encoded_texts[index]
        label = self.data.iloc[index]['Label']

        return (torch.tensor(encoded, dtype=torch.long), torch.tensor(label, dtype=torch.long))

    def __len__(self):
        return len(self.data)

    def _longest_encoded_length(self):
        max_length = 0
        for encoded_text in self.encoded_texts:
            encoded_length = len(encoded_text)
            if encoded_length > max_length:
                max_length = encoded_length

        return max_length

In [14]:
train_dataset = SpamDataset(csv_file='train.csv', tokenizer=tokenizer, max_length=None)
train_dataset.max_length

120

In [15]:
val_dataset = SpamDataset(csv_file='validation.csv', tokenizer=tokenizer, max_length=train_dataset.max_length)

test_dataset = SpamDataset(csv_file='test.csv', tokenizer=tokenizer, max_length=train_dataset.max_length)

---

<img src="https://camo.githubusercontent.com/48d9b0183d7f0b78c3179eb8edbdc1a84ac287d577c538a83e93d987bf1bdc51/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830365f636f6d707265737365642f62617463682e77656270" width=600px>

Then we create PyTorch data loaders.

In [16]:
from torch.utils.data import DataLoader

In [17]:
num_workers = 0
batch_size = 8
torch.manual_seed(123)

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=num_workers,
                          drop_last=True)
val_loader = DataLoader(dataset=val_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=num_workers,
                          drop_last=True)
test_loader = DataLoader(dataset=test_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=num_workers,
                          drop_last=True)

In [18]:
for input_batch, target_batch in train_loader:
    pass

print(input_batch.shape, target_batch.shape)

torch.Size([8, 120]) torch.Size([8])


In [19]:
print(f'{len(train_loader)=}, {len(val_loader)=}, {len(test_loader)=} ')

len(train_loader)=130, len(val_loader)=18, len(test_loader)=37 


---

## Stage 2 : Model setup

<img src="https://camo.githubusercontent.com/80786c4d1d84976de2d804fd42bbae50157e4474f583d17a6f41aa0a2de8cf12/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830365f636f6d707265737365642f6f766572766965772d322e77656270" width=700>

### 4. Initializing a model with pretrained weights

In [20]:
CHOOSE_MODEL = 'gpt2-small (124M)'
INPUT_PROMPT = "Every effort moves you"
BASE_CONFIG  = {
    "vocab_size"     : 50257,
    "context_length" : 1024,
    "drop_rate"      : 0.0,
    "qkv_bias"       : True
}

model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

assert train_dataset.max_length <= BASE_CONFIG["context_length"], (
    f"Dataset length {train_dataset.max_length} exceeds model's context "
    f"length {BASE_CONFIG['context_length']}. Reinitialize data sets with "
    f"`max_length={BASE_CONFIG['context_length']}`"
)

In [21]:
from gpt_download import download_and_load_gpt2
from llms_from_scratch.ch04 import GPTModel
from llms_from_scratch.ch05 import download_and_load_gpt2, load_weights_into_gpt

In [25]:
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval();

File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe


In [26]:
from llms_from_scratch.ch04 import generate_text_simple
from llms_from_scratch.ch05 import text_to_token_ids, token_ids_to_text

In [28]:
text_1 = "Every effort moves you"

token_ids = generate_text_simple(model,
                                 text_to_token_ids(text_1, tokenizer),
                                 max_new_tokens=15,
                                 context_size=BASE_CONFIG['context_length'])

print(token_ids)
print(token_ids_to_text(token_ids, tokenizer))

tensor([[6109, 3626, 6100,  345, 2651,   13,  198,  198,  464,  717, 2239,  318,
          284, 1833,  262, 6817,  286,  534,  670]])
Every effort moves you forward.

The first step is to understand the importance of your work


In [30]:
text_2 = (
    "Is the following text 'spam'? Answer with 'yes' or 'no':"
    " 'You are a winner you have been specially"
    " selected to receive $1000 cash or a $2000 award.'"
)

token_ids = generate_text_simple(model=model,
                                 idx=text_to_token_ids(text_2, tokenizer),
                                 max_new_tokens=23,
                                 context_size=BASE_CONFIG["context_length"])

print(token_ids)
print(token_ids_to_text(token_ids, tokenizer))

tensor([[ 3792,   262,  1708,  2420,   705,  2777,   321, 30960, 23998,   351,
           705,  8505,     6,   393,   705,  3919, 10354,   705,  1639,   389,
           257,  8464,   345,   423,   587, 20905,  6163,   284,  3328,   720,
         12825,  5003,   393,   257,   720, 11024,  5764,  2637,   198,   198,
           464,  1708,  2420,   705,  2777,   321, 30960, 23998,   351,   705,
          8505,     6,   393,   705,  3919, 10354,   705,  1639,   389,   257,
          8464]])
Is the following text 'spam'? Answer with 'yes' or 'no': 'You are a winner you have been specially selected to receive $1000 cash or a $2000 award.'

The following text 'spam'? Answer with 'yes' or 'no': 'You are a winner


The model is **not** following our instructions because it lacks instructions fine-tuning. 💀💀

### 5. Adding a classification head

<img src="https://camo.githubusercontent.com/424818dc46ac91f5b7e1d37a59f9e2374deacfa04348569e4706e75f769de6ee/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830365f636f6d707265737365642f6c6d2d686561642e77656270" width=700>

In [31]:
model

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768,