# GPT2 for Text Classification

In [1]:
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from minigpt.utils import set_seed
set_seed(3407)

from minigpt import bpe

In [2]:
# Load dataset
import pandas as pd
data = pd.read_csv('data/train.csv')
text = data.full_text
print(text[0][:200] + '...')

I think that students would benefit from learning at home,because they wont have to change and get up early in the morning to shower and do there hair. taking only classes helps them because at there ...


In [3]:
# Instantiate encoder
e = bpe.get_encoder()

In [4]:
# Let's organize our labels for training
data['labels'] = list(zip(data.cohesion.tolist(), data.syntax.tolist(),
                          data.vocabulary.tolist(), data.phraseology.tolist(),
                          data.grammar.tolist(), data.conventions.tolist()))
data.labels = data.labels.map(lambda x: list(x))
print(f'📐 Labels:\n {data["labels"].head()}\n')

# Let's clean the text a bit
data['text_encoded'] = data['full_text'].apply(lambda x: x.replace('\n', ' '))

# Now, let's encode the text using BPE class
data.text_encoded = data.full_text.map(lambda x: e.encode(x))
print(f'🔭 Encoded text:\n {data.text_encoded.head()}')

📐 Labels:
 0    [3.5, 3.5, 3.0, 3.0, 4.0, 3.0]
1    [2.5, 2.5, 3.0, 2.0, 2.0, 2.5]
2    [3.0, 3.5, 3.0, 3.0, 3.0, 2.5]
3    [4.5, 4.5, 4.5, 4.5, 4.0, 5.0]
4    [2.5, 3.0, 3.0, 3.0, 2.5, 2.5]
Name: labels, dtype: object

🔭 Encoded text:
 0    [40, 892, 326, 2444, 561, 4414, 422, 4673, 379...
1    [2215, 257, 1917, 318, 257, 1487, 345, 423, 28...
2    [20266, 11, 32641, 198, 198, 1532, 334, 1487, ...
3    [464, 1266, 640, 287, 1204, 318, 618, 345, 171...
4    [18712, 719, 286, 23887, 460, 2928, 287, 584, ...
Name: text_encoded, dtype: object


In [5]:
print(f'largest sequence length: {len(max(data.full_text))}')
print(f'smallest sequence length: {len(min(data.full_text))}')

largest sequence length: 938
smallest sequence length: 2160


In [6]:
# Create training, validation, and test sets
import random
random.seed(42)
random.shuffle(data.text_encoded)
n1 = int(0.8*len(data.text_encoded))
n2 = int(0.9*len(data.text_encoded))

Xtr = data.text_encoded[:n1]
Ytr = data.labels[:n1]
Xdev = data.text_encoded[n1:n2]
Ydev = data.labels[n1:n2]
Xte = data.text_encoded[n2:]
Yte = data.labels[n2:]

print(Xtr.shape, Ytr.shape)
print(Xdev.shape, Ydev.shape)
print(Xte.shape, Yte.shape)

(3128,) (3128,)
(391,) (391,)
(392,) (392,)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x[i], x[j] = x[j], x[i]


In [7]:
class Loader(Dataset):
    
    def __init__(self, split, xtrain, ytrain):
        assert split in {'train', 'test'}
        self.split = split
        self.xtrain = xtrain
        self.ytrain = ytrain
        
    def __len__(self):
        return len(self.xtrain)
    
    def get_vocab_size(self):
        return 500
    
    def block_size(self):
        return 500
    
    def __getitem__(self, idx):
        return self.xtrain[idx], self.ytrain[idx]

In [8]:
train_dataset = Loader(split='train', xtrain=Xtr, ytrain=Ytr)
dev_dataset = Loader(split='test', xtrain=Xdev, ytrain=Ydev)
test_dataset = Loader(split='test', xtrain=Xte, ytrain=Yte)

# create a GPT instance
from minigpt.model import GPT

model_config = GPT.get_default_config()
model_config.model_type = 'gpt-nano'
model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = 500
model = GPT(model_config)

number of parameters: 0.13M


In [9]:
# create a Trainer object
from minigpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 2000
train_config.num_workers = 0
trainer = Trainer(train_config, model, train_dataset)

running on device cuda


In [11]:
def batch_end_callback(trainer):
    if trainer.iter_num % 100 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()