# GPT2 for Text Classification

In [5]:
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from minigpt.utils import set_seed
from transformers import GPT2Tokenizer, GPT2Config, GPT2ForSequenceClassification

set_seed(3407)

# from minigpt import bpe

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
# Load dataset
import pandas as pd
data = pd.read_csv('data/train.csv')
text = data.full_text
print(text[0][:200] + '...')

I think that students would benefit from learning at home,because they wont have to change and get up early in the morning to shower and do there hair. taking only classes helps them because at there ...


In [4]:
# get vocab size
raw_text = ''.join(data['full_text'].values.tolist())
print('Num of unique chars:\n', len(set(raw_text)))
del raw_text # to save memory

Num of unique chars:
 94


In [5]:
# # Byte Pair Encoder
# e = bpe.get_encoder()

In [7]:
# Tokenization

# Instantiate tokenizer and pass `gpt2` to the `from_pretrained` method 
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Select token to uses as `pad_token`
tokenizer.pad_token = tokenizer.eos_token

# Default to left padding
tokenizer.padding_side = "left"

# Process text
inputs = tokenizer(list(data.full_text), padding='longest', truncation=True,
                  return_tensors="pt", max_length=tokenizer.model_max_length)

# Update the inputs with the associated encoded labels 
inputs.update({'labels':torch.tensor(data.cohesion)})

print("input_ids\n", inputs['input_ids'])
print("attention_mask\n", inputs['attention_mask'])
print("labels\n", inputs['labels'])

input_ids
 tensor([[50256, 50256, 50256,  ...,   220,   220,   220],
        [50256, 50256, 50256,  ...,   661,   892,    13],
        [50256, 50256, 50256,  ...,   393,  4568,    13],
        ...,
        [50256, 50256, 50256,  ...,  2431,    13,   220],
        [50256, 50256, 50256,  ...,   345,     0,   220],
        [50256, 50256, 50256,  ...,   262,  6027,    13]])
attention_mask
 tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]])
labels
 tensor([3.5000, 2.5000, 3.0000,  ..., 2.5000, 4.0000, 3.5000],
       dtype=torch.float64)


In [151]:
inputs

{'input_ids': tensor([[   40,   892,   326,  ..., 50256, 50256, 50256],
        [ 2215,   257,  1917,  ..., 50256, 50256, 50256],
        [20266,    11, 32641,  ..., 50256, 50256, 50256],
        ...,
        [    1,    32,  1917,  ..., 50256, 50256, 50256],
        [ 7085,   661, 12546,  ..., 50256, 50256, 50256],
        [ 5211,   345,   892,  ..., 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([3.5000, 2.5000, 3.0000,  ..., 2.5000, 4.0000, 3.5000],
       dtype=torch.float64)}

In [8]:
del data

In [14]:
# Create training, validation, and test sets

n1 = int(0.8*len(inputs['input_ids']))
n2 = int(0.9*len(inputs['input_ids']))
         
Xtr = inputs['input_ids'][:n1]
Ytr = inputs['labels'][:n1]
tr_mask = inputs['attention_mask'][:n1]

Xdev = inputs['input_ids'][n1:n2]
Ydev = inputs['labels'][n1:n2]
dev_mask = inputs['attention_mask'][n1:n2]

Xte = inputs['input_ids'][n2:]
Yte = inputs['labels'][n2:]
te_mask = inputs['attention_mask'][n2:]

print(Xtr.shape, Ytr.shape)
print(Xdev.shape, Ydev.shape)
print(Xte.shape, Yte.shape)

torch.Size([3128, 1024]) torch.Size([3128])
torch.Size([391, 1024]) torch.Size([391])
torch.Size([392, 1024]) torch.Size([392])


In [15]:
class DataLoads(Dataset):
    
    def __init__(self, X, Y, Mask):
        self.x = X
        self.y = Y
        self.mask = Mask
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        return {
            'input_ids':self.x[idx],
            'attention_mask':self.mask[idx],
            'labels': self.y[idx]
        }

In [27]:
del tr_mask
del Xdev
del dev_mask
del Xte
del te_mask

In [16]:
# Prepare data for input to transformer
tr_loader = DataLoads(Xtr, Ytr, tr_mask)
dev_loader = DataLoads(Xdev, Ydev, dev_mask)
te_loader = DataLoads(Xte, Yte, te_mask)

trainset = DataLoader(tr_loader, shuffle=True, batch_size=32)
devset = DataLoader(dev_loader, shuffle=False, batch_size=32)
teset = DataLoader(te_loader, shuffle=False)

In [17]:
# Instantiate configuration class to store config params for GTP2Model
model_config = GPT2Config.from_pretrained("gpt2", num_labels=9)

# Get Huggingface model
model = GPT2ForSequenceClassification.from_pretrained("gpt2", config=model_config)

# Fix model with padding token id
model.config.pad_token_id = model.config.eos_token_id

# Load model to defined device
model.to(device)

print("Model loaded to", device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to cuda


In [25]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Model parameters
optimizer = AdamW(model.parameters(),
                 lr = 2e-5,
                 eps = 1e-8)


# Training steps (num_batches * num_epochs)
epochs = 10
train_steps = len(trainset) * epochs

# Learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps = 0,
                                           num_training_steps = train_steps)

# Store the average loss after each epoch so we can plot them.
all_loss = {'train_loss':[], 'val_loss':[]}
all_acc = {'train_acc':[], 'val_acc':[]}



In [26]:
# Training loop functions
def train(dataloader, optimizer_, scheduler_, device_):

    # Use global variable for model.
    global model

    # Tracking variables.
    predictions_labels = []
    true_labels = []
    
    # Total loss for this epoch.
    total_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    for batch in tqdm(dataloader, total=len(dataloader)):

        # Add original labels - use later for evaluation.
        true_labels += batch['labels'].numpy().flatten().tolist()

        # move batch to device
        batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}

        # Always clear any previously calculated gradients before performing a
        # backward pass.
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        # The documentation for this a bert model function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(**batch)

        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple along with the logits. We will use logits
        # later to calculate training accuracy.
        loss, logits = outputs[:2]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer_.step()

        # Update the learning rate.
        scheduler_.step()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()

        # Convert these logits to list of predicted labels values.
        predictions_labels += logits.argmax(axis=-1).flatten().tolist()

    # Calculate the average loss over the training data.
    avg_epoch_loss = total_loss / len(dataloader)

    # Return all true labels and prediction for future evaluations.
    return true_labels, predictions_labels, avg_epoch_loss

In [27]:
def validation(dataloader, device_):
    
    # Use global variable for model.
    global model

    # Tracking variables
    predictions_labels = []
    true_labels = []
    #total loss for this epoch.
    total_loss = 0

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Evaluate data for one epoch
    for batch in tqdm(dataloader, total=len(dataloader)):

        # add original labels
        true_labels += batch['labels'].numpy().flatten().tolist()

        # move batch to device
        batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(**batch)

            # The call to `model` always returns a tuple, so we need to pull the 
            # loss value out of the tuple along with the logits. We will use logits
            # later to to calculate training accuracy.
            loss, logits = outputs[:2]

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()

            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value 
            # from the tensor.
            total_loss += loss.item()

            # get predicitons to list
            predict_content = logits.argmax(axis=-1).flatten().tolist()

            # update list
            predictions_labels += predict_content

    # Calculate the average loss over the training data.
    avg_epoch_loss = total_loss / len(dataloader)

    # Return all true labels and prediciton for future evaluations.
    return true_labels, predictions_labels, avg_epoch_loss

In [28]:
# Training loop
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score

for epoch in tqdm(range(epochs)):
    print("Training on batches...")
    train_labels, train_predict, train_loss = train(trainset, optimizer, scheduler, device)
    train_acc = accuracy_score(train_labels, train_predict)
    
    dev_labels, dev_predict, dev_loss = validation(devset, device)
    dev_acc = accuracy_score(dev_labels, dev_predict)
    
    print("  train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f"%\
          (train_loss, dev_loss, train_acc, dev_acc))
    
    # Store the loss value for plotting the learning curve.
    all_loss['train_loss'].append(train_loss)
    all_loss['val_loss'].append(dev_loss)
    all_acc['train_acc'].append(train_acc)
    all_acc['val_acc'].append(dev_acc)
    
    print(f"train loss {all_loss['train_loss']}")
    print(f"val loss {all_loss['val_loss']}")
    print(f"train acc {all_loss['train_acc']}")
    print(f"val acc {all_loss['val_acc']}")

  0%|          | 0/10 [00:00<?, ?it/s]

Training on batches...


  0%|          | 0/98 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 1.50 GiB (GPU 0; 12.00 GiB total capacity; 9.46 GiB already allocated; 0 bytes free; 10.68 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [18]:
torch.cuda.empty_cache()

In [29]:
# Check GPU memory
!nvidia-smi

Tue Sep 20 10:36:49 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 512.15       Driver Version: 512.15       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:02:00.0  On |                  N/A |
| 34%   53C    P8    23W / 350W |  11603MiB / 12288MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [20]:
# View tensors still in use by the notebook
# Source: https://forums.fast.ai/t/gpu-memory-not-being-freed-after-training-is-over/10265?replies_to_post_number=8

def pretty_size(size):
	"""Pretty prints a torch.Size object"""
	assert(isinstance(size, torch.Size))
	return " × ".join(map(str, size))

def dump_tensors(gpu_only=True):
	"""Prints a list of the Tensors being tracked by the garbage collector."""
	import gc
	total_size = 0
	for obj in gc.get_objects():
		try:
			if torch.is_tensor(obj):
				if not gpu_only or obj.is_cuda:
					print("%s:%s%s %s" % (type(obj).__name__, 
										  " GPU" if obj.is_cuda else "",
										  " pinned" if obj.is_pinned else "",
										  pretty_size(obj.size())))
					total_size += obj.numel()
			elif hasattr(obj, "data") and torch.is_tensor(obj.data):
				if not gpu_only or obj.is_cuda:
					print("%s → %s:%s%s%s%s %s" % (type(obj).__name__, 
												   type(obj.data).__name__, 
												   " GPU" if obj.is_cuda else "",
												   " pinned" if obj.data.is_pinned else "",
												   " grad" if obj.requires_grad else "", 
												   " volatile" if obj.volatile else "",
												   pretty_size(obj.data.size())))
					total_size += obj.data.numel()
		except Exception as e:
			pass        
	print("Total size:", total_size)
    
dump_tensors()

Tensor: GPU pinned 1 × 1 × 1024 × 1024
Tensor: GPU pinned 
Tensor: GPU pinned 1 × 1 × 1024 × 1024
Tensor: GPU pinned 
Tensor: GPU pinned 1 × 1 × 1024 × 1024
Tensor: GPU pinned 
Tensor: GPU pinned 1 × 1 × 1024 × 1024
Tensor: GPU pinned 
Tensor: GPU pinned 1 × 1 × 1024 × 1024
Tensor: GPU pinned 
Tensor: GPU pinned 1 × 1 × 1024 × 1024
Tensor: GPU pinned 
Tensor: GPU pinned 1 × 1 × 1024 × 1024
Tensor: GPU pinned 
Tensor: GPU pinned 1 × 1 × 1024 × 1024
Tensor: GPU pinned 
Tensor: GPU pinned 1 × 1 × 1024 × 1024
Tensor: GPU pinned 
Tensor: GPU pinned 1 × 1 × 1024 × 1024
Tensor: GPU pinned 
Tensor: GPU pinned 1 × 1 × 1024 × 1024
Tensor: GPU pinned 
Tensor: GPU pinned 1 × 1 × 1024 × 1024
Tensor: GPU pinned 
Parameter: GPU pinned 50257 × 768
Parameter: GPU pinned 1024 × 768
Parameter: GPU pinned 768
Parameter: GPU pinned 768
Parameter: GPU pinned 768 × 2304
Parameter: GPU pinned 2304
Parameter: GPU pinned 768 × 768
Parameter: GPU pinned 768
Parameter: GPU pinned 768
Parameter: GPU pinned 768
Par



In [13]:
import gc
gc.collect()
gc.get_stats()

[{'collections': 702, 'collected': 2643, 'uncollectable': 0},
 {'collections': 63, 'collected': 972, 'uncollectable': 0},
 {'collections': 6, 'collected': 366, 'uncollectable': 0}]

In [None]:
# import itertools

# class DataLoads(Dataset):
    
#     def __init__(self, X, Y):
#         self.x = X
#         self.y = Y
        
#     def __len__(self):
#         return len(self.x)
    
#     def get_vocab_size(self):
#         chars = list(itertools.chain.from_iterable(self.x[0:]))
#         return len(set(chars))
    
#     def get_block_size(self):
#         """
#         The length of the sequence that will feed into transformer,
#         containing concatenated input and the output, but -1 because the transformer
#         starts making predictions at the last input element.
#         """
#         return self.length    
    
#     def __getitem__(self, idx):
#         # inputs to the transformer
#         X = torch.tensor(self.x[idx])
#         Y = torch.tensor(self.y[idx])
#         mask = torch.ones(len(self.x[0])).float()
        
#         return X, Y, mask

In [None]:
# # Let's organize our labels for training
# data['labels'] = list(zip(data.cohesion.tolist(), data.syntax.tolist(),
#                           data.vocabulary.tolist(), data.phraseology.tolist(),
#                           data.grammar.tolist(), data.conventions.tolist()))

# data.labels = data.labels.map(lambda x: list(x))
# print(f'📐 Labels:\n {data["labels"].head()}\n')

# # Let's clean the text a bit
# data['full_text'] = data['full_text'].apply(lambda x: x.replace('\n', ' '))

# # Now, let's encode the text using BPE class
# data.text_encoded = data.full_text.map(lambda x: e.encode(x))
# print(f'🔭 Encoded text:\n {data.text_encoded.head()}')

In [None]:
# print(f'largest sequence length: {len(max(data.text_encoded))}')
# print(f'smallest sequence length: {len(min(data.text_encoded))}')

In [None]:
# # GPT2 takes a maximum of 1028 tokens. Anything above that will cause index errors.
# # So, Let's remove all samples above a given threshold
# idxs = [i for i,j in enumerate(data.text_encoded) if len(j) < 800]

# text = data.text_encoded[idxs].reset_index(drop=True)
# labels = data.cohesion[idxs].reset_index(drop=True)

In [None]:
# # ToDo: Write function that adds padding to right of sequence
# def add_pad(text):
#     N = 800 # max sequence length
#     a = text
#     b = (N - len(a))
#     a += [0] * b
#     return a, b

# text.map(lambda x: add_pad(x))[1]

In [None]:
# train_dataset = DataLoads(Xtr, Ytr)
# dev_dataset = DataLoads(Xdev, Ydev)
# test_dataset = DataLoads(Xtr, Ytr)

In [None]:
# # create a GPT instance
# from minigpt.model import GPT

# model_config = GPT.get_default_config()
# model_config.model_type = 'gpt-nano'
# model_config.vocab_size = train_dataset.get_vocab_size()
# model_config.block_size = 500
# model = GPT(model_config)

In [None]:
# # create a Trainer object
# from minigpt.trainer import Trainer

# train_config = Trainer.get_default_config()
# train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
# train_config.max_iters = 2000
# train_config.num_workers = 0
# trainer = Trainer(train_config, model, train_dataset)

In [None]:
# def batch_end_callback(trainer):
#     if trainer.iter_num % 100 == 0:
#         print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
# trainer.set_callback('on_batch_end', batch_end_callback)

# trainer.run()