In [23]:
import pandas as pd
import tiktoken

# Data Preprocessing

In [41]:
# Source: the "SMS Spam Collection" dataset (UC Irvine).
# The file is read without a header row, so the column names are supplied here.
# 'a', 'b' and 'c' name the three extra columns; they are empty in the rows displayed below.
column_names = ['Label', 'Text', 'a', 'b', 'c']
spam_df = pd.read_csv(
    'spam.csv',
    sep=',',
    header=None,
    names=column_names,
    encoding_errors='ignore',
)
spam_df

Unnamed: 0,Label,Text,a,b,c
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will _ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [42]:
# Class balance: how many messages carry each label.
label_counts = spam_df['Label'].value_counts()
print(label_counts)

Label
ham     4825
spam     747
Name: count, dtype: int64


In [43]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
# The guard makes this cell safe to re-run: calling .map() a second time on the
# already-encoded integer column would silently turn every label into NaN.
label_to_id = {'ham': 0, 'spam': 1}
if not pd.api.types.is_integer_dtype(spam_df['Label']):
  spam_df['Label'] = spam_df['Label'].map(label_to_id)

# training: 70%
# validation: 10%
# testing: 20%
def random_split(df, train_frac, validation_frac, random_state=123):
  """Shuffle ``df`` and cut it into train / validation / test partitions.

  Args:
    df: DataFrame to split. It is not modified; a shuffled copy is sliced.
    train_frac: Fraction of the rows assigned to the training partition.
    validation_frac: Fraction of the rows assigned to the validation
      partition. All remaining rows form the test partition.
    random_state: Seed forwarded to ``DataFrame.sample`` so the shuffle is
      reproducible. Defaults to 123, the value that was previously hard-coded.

  Returns:
    A ``(train_df, validation_df, test_df)`` tuple of DataFrames. They are
    consecutive slices of the shuffled frame, whose index is renumbered from 0.

  Raises:
    ValueError: If a fraction is negative or the two fractions sum to more
      than 1.
  """
  if train_frac < 0 or validation_frac < 0:
    raise ValueError('train_frac and validation_frac must be non-negative')
  # Small tolerance so that pairs such as 0.7 + 0.3 are not rejected because
  # of floating-point rounding.
  if train_frac + validation_frac > 1 + 1e-9:
    raise ValueError('train_frac + validation_frac must not exceed 1')

  # Shuffle every row reproducibly and renumber the index 0..n-1
  shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

  # Partition sizes are truncated towards zero; leftover rows go to the test split
  train_end = int(len(shuffled) * train_frac)
  validation_end = train_end + int(len(shuffled) * validation_frac)

  return (
      shuffled.iloc[:train_end],
      shuffled.iloc[train_end:validation_end],
      shuffled.iloc[validation_end:],
  )

# 70% train / 10% validation; whatever is left over becomes the test split.
train_df, validation_df, test_df = random_split(
    spam_df, train_frac=0.7, validation_frac=0.1
)


In [45]:
# Report the number of rows in each split, one per line.
print(
    f'Train dataset: len={len(train_df)}',
    f'Validation dataset: len={len(validation_df)}',
    f'Test dataset: len={len(test_df)}',
    sep='\n',
)

Train dataset: len=3900
Validation dataset: len=557
Test dataset: len=1115


In [46]:
# Save each split to its own CSV file.
# index=False (the documented boolean) keeps the row index out of the file, so
# reading the CSV back yields only the original columns.
for split_df, csv_path in (
    (train_df, 'train.csv'),
    (validation_df, 'validation.csv'),
    (test_df, 'test.csv'),
):
  split_df.to_csv(csv_path, index=False)

# Dataset and Dataloader

In [47]:
import torch
from torch.utils.data import Dataset

class SpamDataset(Dataset):
  """Dataset of tokenized, fixed-length texts paired with integer labels.

  Reads a CSV that has 'Text' and 'Label' columns, encodes every text with
  ``tokenizer``, truncates the encodings to ``max_length`` when one is given,
  and right-pads each of them with ``pad_token_id`` so that all sequences
  have exactly ``max_length`` tokens.

  Args:
    csv_file: CSV to load (passed straight to ``pd.read_csv``).
    tokenizer: Object whose ``encode(text)`` returns a list of token ids.
    max_length: Target sequence length. When ``None``, the length of the
      longest encoded text in this file is used and nothing is truncated.
    pad_token_id: Token id appended as padding (default 50256).

  Attributes set by ``__init__``:
    data: the DataFrame read from ``csv_file``.
    labels: the 'Label' column as a plain list, cached for fast item access.
    encoded_text: list of token-id lists, each of length ``max_length``.
    max_length: the sequence length actually used.
  """

  def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
    self.data = pd.read_csv(csv_file)
    # Cache the labels once; looking them up through DataFrame.iloc would
    # build a full row Series on every __getitem__ call.
    self.labels = self.data['Label'].tolist()
    self.encoded_text = [tokenizer.encode(text) for text in self.data['Text']]

    if max_length is None:
      self.max_length = self._longest_encoded_length()
    else:
      self.max_length = max_length
      # Cut sequences that are longer than the requested length
      self.encoded_text = [
          text[:self.max_length] for text in self.encoded_text
      ]

    # Right-pad every sequence up to max_length
    self.encoded_text = [
        text + [pad_token_id] * (self.max_length - len(text))
        for text in self.encoded_text
    ]

  def _longest_encoded_length(self):
    """Return the length of the longest encoded text (0 if there are none)."""
    return max((len(text) for text in self.encoded_text), default=0)

  def __getitem__(self, index):
    """Return ``(token_ids, label)`` for row ``index`` as two long tensors."""
    return (
        torch.tensor(self.encoded_text[index], dtype=torch.long),
        torch.tensor(self.labels[index], dtype=torch.long)
    )

  def __len__(self):
    """Return the number of rows read from the CSV file."""
    return len(self.data)


In [48]:
# Load tiktoken's 'gpt2' encoding; this tokenizer is passed to every SpamDataset below.
tokenizer = tiktoken.get_encoding('gpt2')

In [49]:
# Build the training set first: with max_length=None the dataset pads every
# sequence to the longest encoded text found in train.csv.
train_dataset = SpamDataset(
    csv_file="train.csv",
    tokenizer=tokenizer,
    max_length=None,
)
print(train_dataset.max_length)
print(len(train_dataset))

257
3900


In [50]:
# Reuse the training set's max_length so validation sequences are truncated
# or padded to the same width as the training sequences.
validation_dataset = SpamDataset(
    csv_file='validation.csv',
    tokenizer=tokenizer,
    max_length=train_dataset.max_length,
)
print(validation_dataset.max_length)
print(len(validation_dataset))

257
557


In [51]:
# Reuse the training set's max_length so test sequences are truncated or
# padded to the same width as the training sequences.
test_dataset = SpamDataset(
    csv_file='test.csv',
    tokenizer=tokenizer,
    max_length=train_dataset.max_length,
)
print(test_dataset.max_length)
print(len(test_dataset))

257
1115


In [52]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

torch.manual_seed(123)

# Settings shared by all three loaders.
# NOTE(review): shuffle=True is also applied to the validation and test
# loaders; shuffling is normally only needed for training — confirm this is
# intentional.
loader_kwargs = dict(
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
)

# Only the training loader drops the final incomplete batch.
train_loader = DataLoader(dataset=train_dataset, drop_last=True, **loader_kwargs)
validation_loader = DataLoader(dataset=validation_dataset, drop_last=False, **loader_kwargs)
test_loader = DataLoader(dataset=test_dataset, drop_last=False, **loader_kwargs)


In [54]:
print('Train dataloader')
# Walk through the whole loader once; after the loop `input` and `target`
# hold the LAST batch, which is what gets printed below.
# NOTE(review): `input` shadows the Python builtin of the same name for the
# rest of the session — consider renaming once it is confirmed that no later
# cell relies on this variable.
for input, target in train_loader:
  pass
print(input)
print(input.shape)
print(target)

Train dataloader
tensor([[25374, 41649, 34509,  ..., 50256, 50256, 50256],
        [10814,   986,  9576,  ..., 50256, 50256, 50256],
        [39274,   337,  1546,  ..., 50256, 50256, 50256],
        ...,
        [ 8642,    23,    13,  ..., 50256, 50256, 50256],
        [   44,  6996, 33826,  ..., 50256, 50256, 50256],
        [ 2061,  1645,   284,  ..., 50256, 50256, 50256]])
torch.Size([8, 257])
tensor([0, 0, 1, 0, 0, 0, 0, 0])


In [55]:
# Number of batches in each loader, in the order train, validation, test.
for loader in (train_loader, validation_loader, test_loader):
  print(len(loader))

487
70
140


# Load Pre-trained GPT Model

In [None]:
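# TODO: load the pre-trained GPT model here. The cells below expect `model` and `BASE_CONFIG` (with an 'emb_dim' entry) to be defined.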

# Update the model architecture

In [None]:
# Swap in a classification head.
# A single output node would also work, but it would require a different loss
# function. Using one output node per class is the more general approach.

# Step 1: freeze every existing weight of the model
for param in model.parameters():
  param.requires_grad = False

torch.manual_seed(123)

# Step 2: attach the new head. A freshly created Linear layer has
# requires_grad = True by default, so it stays trainable.
num_classes = 2
model.out_head = torch.nn.Linear(
    in_features=BASE_CONFIG['emb_dim'],
    out_features=num_classes,
)

# Step 3: unfreeze the last transformer block and the final layer norm
for module in (model.trf_blocks[-1], model.final_norm):
  for param in module.parameters():
    param.requires_grad = True


In [None]:
# Test: run one short prompt through the modified model
token_ids = tokenizer.encode("how are you?")
# tokenizer.encode returns a plain Python list of ids. Convert it to a long
# tensor and add a leading batch dimension, matching the 2-D (batch, tokens)
# long tensors that the data loaders above produce.
# NOTE(review): assumes `model` takes (batch, tokens) input — confirm once the
# model-loading cell is filled in.
inputs = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)
with torch.no_grad():
  outputs = model(inputs)

print(f"Output: {outputs}\ndimensions: {outputs.shape}")

# Loss Function