# GPT for Sequence Classification

In [None]:
# Imports
!pip install torchmetrics
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, GPT2Config
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tabulate import tabulate
from tqdm import trange
import random
from torchmetrics.classification import Recall, Accuracy, AUROC, Precision
from torch.utils.data.dataset import random_split

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Collecting torchmetrics
  Downloading torchmetrics-1.2.1-py3-none-any.whl (806 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.10.0 torchmetrics-1.2.1


In [None]:
# Pulling Data
# Download the SMS Spam Collection archive from the UCI repository.
!wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
# Extract the archive; -o overwrites existing files without prompting.
!unzip -o smsspamcollection.zip
# Preview the first 10 lines of the extracted file.
!head -10 SMSSpamCollection

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham	Ok lar... Joking wif u oni...
spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
ham	U dun say so early hor... U c already then say...
ham	Nah I don't think he goes to usf, he lives around here though
spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
ham	Even my brother is not like to speak with me. They treat me like aids patent.
ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
spam	WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
spam	H

In [None]:
# Initialize some variables
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 ships without a padding token and '[PAD]' is not in its vocabulary,
# so reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left" # Remember we want to use the last token's embedding to represent the entire sentence
val_ratio = 0.2
batch_size = 16 # Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
# Read the EOS id from the tokenizer instead of hard-coding it, so it always
# matches the loaded vocabulary.
EOS_token = tokenizer.eos_token_id
epochs = 2 # Recommended number of epochs: See: https://arxiv.org/pdf/1810.04805.pdf

# Use torchmetrics to set up accuracy, recall, precision, and auroc
accuracy = Accuracy(task = "binary")
recall =  Recall(task = "binary")
precision =  Precision(task = "binary")
auroc =  AUROC(task = "binary")

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

## Prepare the Vocabulary/Tokenize Text

In [None]:
file_path = 'SMSSpamCollection'

# Each line of the file is "<label>\t<message>". Collect plain records first and
# build the DataFrame once: growing it with pd.concat per line is quadratic.
records = []
with open(file_path, encoding='utf-8') as f:
    for line in f:
        # Drop the line terminator so it does not become the last token of the message.
        label, sep, message = line.rstrip('\r\n').partition('\t')
        if not sep:
            continue  # blank or malformed line without a tab separator
        records.append({'label': 1 if label == 'spam' else 0, 'text': message})

df = pd.DataFrame(records, columns=['label', 'text'])

text = df.text.values
labels = df.label.values

# Last expression of the cell so the preview is actually rendered.
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...\n
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
def print_rand_sentence():
    '''Print one randomly chosen text sample, then a table of its tokens and token IDs.'''
    sample = text[random.randint(0, len(text) - 1)]
    print(sample)

    # Tokenize once and reuse the tokens for the ID lookup.
    tokens = tokenizer.tokenize(sample)
    ids = tokenizer.convert_tokens_to_ids(tokens)

    # Transpose so each table row is one (token, id) pair.
    rows = np.array([tokens, ids]).T
    print(tabulate(rows, headers = ['Tokens', 'Token IDs'], tablefmt = 'fancy_grid'))

print_rand_sentence()

Awww dat is sweet! We can think of something to do he he! Have a nice time tonight ill probably txt u later cos im lonely :( xxx.

╒════════════╤═════════════╕
│ Tokens     │   Token IDs │
╞════════════╪═════════════╡
│ A          │          32 │
├────────────┼─────────────┤
│ www        │        2503 │
├────────────┼─────────────┤
│ Ġdat       │        4818 │
├────────────┼─────────────┤
│ Ġis        │         318 │
├────────────┼─────────────┤
│ Ġsweet     │        6029 │
├────────────┼─────────────┤
│ !          │           0 │
├────────────┼─────────────┤
│ ĠWe        │         775 │
├────────────┼─────────────┤
│ Ġcan       │         460 │
├────────────┼─────────────┤
│ Ġthink     │         892 │
├────────────┼─────────────┤
│ Ġof        │         286 │
├────────────┼─────────────┤
│ Ġsomething │        1223 │
├────────────┼─────────────┤
│ Ġto        │         284 │
├────────────┼─────────────┤
│ Ġdo        │         466 │
├────────────┼─────────────┤
│ Ġhe        │         339 │

In [None]:
def preprocessing(input_text, tokenizer, max_length = 32):
  '''
  Tokenize one text sample into fixed-length model inputs.

  Args:
    - input_text: the raw string to encode.
    - tokenizer: a callable Hugging Face tokenizer with a pad token configured.
    - max_length: number of tokens every sample is padded or truncated to.

  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: tensor of token ids
    - attention_mask: tensor of 0/1 flags specifying which tokens should be considered by the model
  '''
  # Every sample must come out with the same length so samples can be stacked
  # into batches: pad short texts and truncate long ones to max_length.
  # The padding side is taken from the tokenizer's own padding_side setting.
  return tokenizer(
      input_text,
      max_length = max_length,
      padding = 'max_length',
      truncation = True,
      return_attention_mask = True,
      return_tensors = 'pt',
  )

In [None]:
# Tokenize every message once and keep the full encodings.
encodings = [preprocessing(sample, tokenizer) for sample in text]

# Gather the token ids and attention masks as one tensor per message.
# squeeze(0) removes only the leading dimension; a bare squeeze() would also
# collapse a one-token message into a 0-d tensor.
# NOTE(review): assumes each encoding tensor has a leading batch dimension of 1
# (requested via return_tensors='pt' in preprocessing) - confirm.
token_id = [encoding['input_ids'].squeeze(0) for encoding in encodings]
attention_masks = [encoding['attention_mask'].squeeze(0) for encoding in encodings]

In [None]:
def print_rand_sentence_encoding():
    '''Displays tokens, token IDs and attention mask of a random text sample'''
    index = random.randint(0, len(text) - 1)

    # reshape(-1) keeps this working even if a sample was stored as a 0-d tensor.
    token_ids = token_id[index].reshape(-1).tolist()
    attention = attention_masks[index].reshape(-1).tolist()

    # Map each stored id straight back to its token. Decoding to text and
    # re-tokenizing is not guaranteed to yield one token per stored id, which
    # would misalign the table columns.
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    print(tokens)

    table = np.array([tokens, token_ids, attention]).T
    print(
        tabulate(
            table,
            headers = ['Tokens', 'Token IDs', 'Attention Mask'],
            tablefmt = 'fancy_grid')
    )

print_rand_sentence_encoding()

['Hey', 'Ġleave', 'Ġit', '.', 'Ġnot', 'Ġa', 'Ġbig', 'Ġdeal', ':-', ')', 'Ġtake', 'Ġcare', '.', 'Ċ']
╒══════════╤═════════════╤══════════════════╕
│ Tokens   │   Token IDs │   Attention Mask │
╞══════════╪═════════════╪══════════════════╡
│ Hey      │       10814 │                1 │
├──────────┼─────────────┼──────────────────┤
│ Ġleave   │        2666 │                1 │
├──────────┼─────────────┼──────────────────┤
│ Ġit      │         340 │                1 │
├──────────┼─────────────┼──────────────────┤
│ .        │          13 │                1 │
├──────────┼─────────────┼──────────────────┤
│ Ġnot     │         407 │                1 │
├──────────┼─────────────┼──────────────────┤
│ Ġa       │         257 │                1 │
├──────────┼─────────────┼──────────────────┤
│ Ġbig     │        1263 │                1 │
├──────────┼─────────────┼──────────────────┤
│ Ġdeal    │        1730 │                1 │
├──────────┼─────────────┼──────────────────┤
│ :-       │       21912 │

In [None]:
# Split the data into training and validation sets.
# Each example bundles everything the model needs. The dict keys are passed to
# the model as keyword arguments by the training loop, which also reads
# batch['labels'] for the metrics.
# The default collate function stacks the tensors key by key, so every example
# must have the same sequence length (i.e. be padded/truncated to one length).
examples = [
    {
        'input_ids': ids,
        'attention_mask': mask,
        'labels': torch.tensor(int(label), dtype=torch.long),
    }
    for ids, mask, label in zip(token_id, attention_masks, labels)
]

# Derive the split sizes from val_ratio instead of a second hard-coded constant.
num_train = int(len(examples) * (1 - val_ratio))
train, valid = random_split(examples, [num_train, len(examples) - num_train])
train_dataloader = DataLoader(train, batch_size=batch_size, shuffle=True)
# No shuffling for validation: order does not affect the metrics.
valid_dataloader = DataLoader(valid, batch_size=batch_size, shuffle=False)

## Define the model: GPT Transformer


In [None]:
# Load the GPT2ForSequenceClassification model
# Set to 'gpt2' (the smallest GPT-2, roughly 124M parameters)
# Do not output the attentions and all hidden states
config = GPT2Config.from_pretrained(
    'gpt2',
    num_labels = 2,               # binary task: ham (0) vs spam (1)
    output_attentions = False,
    output_hidden_states = False,
)

# Load the pretrained GPT-2 weights. Building the model from a bare GPT2Config()
# would give a randomly initialized network, not a model to fine-tune.
model = GPT2ForSequenceClassification.from_pretrained('gpt2', config = config)

# Set the pad token id to the eos token id. The classification head uses the
# pad id to locate the token that represents each sequence.
model.config.pad_token_id = model.config.eos_token_id

# Set the model to the right device
model = model.to(device)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5
# See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 5e-5,
    eps = 1e-08
)

In [None]:
# Print every trainable parameter tensor of this GPT2 model and its number of elements
fine_tune = True

# named_parameters() yields each parameter exactly once under its full name.
# Walking named_modules() and calling .parameters() on each module would list
# the same tensor again for every ancestor module.
for name, param in model.named_parameters():
  if param.requires_grad:
    print(name, param.numel())

total_parameters = sum(param.numel() for param in model.parameters())
assert total_parameters == 124441344, f'unexpected parameter count: {total_parameters}'

## Training the Model / Model Execution

In [None]:
# Main training / validation loop
# Expects every batch to be a dict of tensors whose keys are accepted by the
# model as keyword arguments, including a 'labels' entry.
for _ in trange(epochs, desc = 'Epoch'):

    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Tracking variables
    tr_loss = 0.0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        # Put each element of batch onto the device
        batch = {k: v.type(torch.long).to(device) for k, v in batch.items()}

        # Set gradients to zero
        optimizer.zero_grad()

        # Forward pass; because 'labels' is part of the batch, the model
        # returns the classification loss alongside the logits.
        train_output = model(**batch)
        loss = train_output.loss

        # Do back propagation and get the gradients
        loss.backward()

        # Apply the gradients; without this step the weights never change
        optimizer.step()

        # Update tracking variables (.item() avoids keeping the graph alive)
        tr_loss += loss.item()
        nb_tr_examples += batch['labels'].size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Start each epoch with empty metric state; the metrics accumulate over
    # the whole validation set instead of averaging per-batch values.
    for metric in (accuracy, recall, precision, auroc):
        metric.reset()

    for batch in valid_dataloader:
        # Put each element of batch onto the device
        batch = {k: v.type(torch.long).to(device) for k, v in batch.items()}

        with torch.no_grad():
            # Forward pass
            eval_output = model(**batch)

        # The metric objects are never moved to `device`, so bring the
        # tensors back to the CPU before updating them.
        logits = eval_output.logits.detach().cpu()
        batch_labels = batch['labels'].cpu().flatten()
        predicted_labels = logits.argmax(dim=-1).flatten()
        # AUROC needs a score for the positive class, not a hard 0/1 prediction.
        spam_probability = torch.softmax(logits, dim=-1)[:, 1]

        accuracy.update(predicted_labels, batch_labels)
        recall.update(predicted_labels, batch_labels)
        precision.update(predicted_labels, batch_labels)
        auroc.update(spam_probability, batch_labels)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / max(nb_tr_steps, 1)))
    print('\t - Validation Accuracy: {:.4f}'.format(accuracy.compute().item()))
    print('\t - Validation Precision: {:.4f}'.format(precision.compute().item()))
    print('\t - Validation Recall: {:.4f}'.format(recall.compute().item()))
    print('\t - Validation AUROC: {:.4f}\n'.format(auroc.compute().item()))

## Model Validation:

In [None]:
new_sentence = 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'

# Tokenize the sentence; the encoding carries both tensors needed for inference
encoding = preprocessing(new_sentence, tokenizer)

# Token IDs and attention mask for the single new sentence
test_ids = encoding['input_ids']
test_attention_mask = encoding['attention_mask']

# Forward pass without gradient tracking to obtain the logits
with torch.no_grad():
    output = model(
        test_ids.to(device),
        token_type_ids = None,
        attention_mask = test_attention_mask.to(device),
    )

# Index of the largest logit: 1 means spam, anything else is ham
predicted_class = int(output.logits.cpu().numpy().argmax())
prediction = 'Spam' if predicted_class == 1 else 'Ham'

print('Input Sentence: ', new_sentence)
print('Predicted Class: ', prediction)