# GPT2 for Text Classification

In [1]:
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from minigpt.utils import set_seed
from transformers import GPT2Tokenizer, GPT2Config, GPT2ForSequenceClassification

set_seed(3407)

from minigpt import bpe

In [2]:
# Load dataset
import pandas as pd
data = pd.read_csv('data/train.csv')
text = data.full_text
print(text[0][:200] + '...')

I think that students would benefit from learning at home,because they wont have to change and get up early in the morning to shower and do there hair. taking only classes helps them because at there ...


In [3]:
# get vocab size
raw_text = ''.join(data['full_text'].values.tolist())
print('Num of unique chars:\n', len(set(raw_text)))
del raw_text # to save memory

Num of unique chars:
 94


In [4]:
# Byte Pair Encoder
e = bpe.get_encoder()

In [5]:
# Let's organize our labels for training
data['labels'] = list(zip(data.cohesion.tolist(), data.syntax.tolist(),
                          data.vocabulary.tolist(), data.phraseology.tolist(),
                          data.grammar.tolist(), data.conventions.tolist()))

data.labels = data.labels.map(lambda x: list(x))
print(f'üìê Labels:\n {data["labels"].head()}\n')

# Let's clean the text a bit
data['full_text'] = data['full_text'].apply(lambda x: x.replace('\n', ' '))

# Now, let's encode the text using BPE class
data.text_encoded = data.full_text.map(lambda x: e.encode(x))
print(f'üî≠ Encoded text:\n {data.text_encoded.head()}')

üìê Labels:
 0    [3.5, 3.5, 3.0, 3.0, 4.0, 3.0]
1    [2.5, 2.5, 3.0, 2.0, 2.0, 2.5]
2    [3.0, 3.5, 3.0, 3.0, 3.0, 2.5]
3    [4.5, 4.5, 4.5, 4.5, 4.0, 5.0]
4    [2.5, 3.0, 3.0, 3.0, 2.5, 2.5]
Name: labels, dtype: object

üî≠ Encoded text:
 0    [40, 892, 326, 2444, 561, 4414, 422, 4673, 379...
1    [2215, 257, 1917, 318, 257, 1487, 345, 423, 28...
2    [20266, 11, 32641, 220, 1002, 334, 1487, 262, ...
3    [464, 1266, 640, 287, 1204, 318, 618, 345, 171...
4    [18712, 719, 286, 23887, 460, 2928, 287, 584, ...
Name: full_text, dtype: object


  data.text_encoded = data.full_text.map(lambda x: e.encode(x))


In [6]:
print(f'largest sequence length: {len(max(data.text_encoded))}')
print(f'smallest sequence length: {len(min(data.text_encoded))}')

largest sequence length: 590
smallest sequence length: 295


In [8]:
# GPT2 takes a maximum of 1028 tokens. Anything above that will cause index errors.
# So, Let's remove all samples above a given threshold
idxs = [i for i,j in enumerate(data.text_encoded) if len(j) < 800]

text = data.text_encoded[idxs].reset_index(drop=True)
labels = data.cohesion[idxs].reset_index(drop=True)

In [9]:
# Create training, validation, and test sets
import random
random.seed(42)
random.shuffle(text)
n1 = int(0.8*len(text))
n2 = int(0.9*len(text))

Xtr = text[:n1]
Ytr = labels[:n1]
Xdev = text[n1:n2]
Ydev = labels[n1:n2]
Xte = text[n2:]
Yte = labels[n2:]

print(Xtr.shape, Ytr.shape)
print(Xdev.shape, Ydev.shape)
print(Xte.shape, Yte.shape)

(2799,) (2799,)
(350,) (350,)
(350,) (350,)


In [11]:
class DataLoads(Dataset):
    
    def __init__(self, X, Y):
        self.x = X
        self.y = Y
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        # inputs to the transformer
        X = torch.tensor(self.x[idx])
        Y = torch.tensor(self.y[idx])
        mask = torch.ones(len(self.x[0])).float()
        
        return X, Y, mask


In [12]:
train_dataset = DataLoads(Xtr, Ytr)
dev_dataset = DataLoads(Xdev, Ydev)
test_dataset = DataLoads(Xtr, Ytr)

In [14]:
# create a GPT instance
from minigpt.model import GPT

model_config = GPT.get_default_config()
model_config.model_type = 'gpt-nano'
model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = 500
model = GPT(model_config)

In [None]:
# create a Trainer object
from minigpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 2000
train_config.num_workers = 0
trainer = Trainer(train_config, model, train_dataset)

In [None]:
def batch_end_callback(trainer):
    if trainer.iter_num % 100 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()