# GPT2 for Text Classification

In [11]:
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from transformers import GPT2Tokenizer, GPT2Config, GPT2ForSequenceClassification

set_seed(3407)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [30]:
# Let's first check our GPU's memory
!nvidia-smi

Tue Sep 20 18:55:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 512.15       Driver Version: 512.15       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:02:00.0  On |                  N/A |
|100%   75C    P2   334W / 350W |   4216MiB / 12288MiB |     55%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [17]:
# Load dataset
import pandas as pd
data = pd.read_csv('data/train.csv')[:1955]
text = data.full_text
print(text[0][:200] + '...')

I think that students would benefit from learning at home,because they wont have to change and get up early in the morning to shower and do there hair. taking only classes helps them because at there ...


In [18]:
# Get vocab size
raw_text = ''.join(data['full_text'].values.tolist())
print('Num of unique chars:\n', len(set(raw_text)))
del raw_text # to save memory
del text

Num of unique chars:
 89


In [20]:
# Tokenization

# Instantiate tokenizer and pass `gpt2` to the `from_pretrained` method 
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Select token to uses as `pad_token`
tokenizer.pad_token = tokenizer.eos_token

# Default to left padding
tokenizer.padding_side = "left"

# Process text
inputs = tokenizer(list(data.full_text), padding='longest', truncation=True,
                  return_tensors="pt", max_length=tokenizer.model_max_length)

# Update the inputs with the associated encoded labels 
inputs.update({'labels':torch.tensor(data.cohesion)})

print("input_ids\n", inputs['input_ids'][:5])
print("attention_mask\n", inputs['attention_mask'][:5])
print("labels\n", inputs['labels'][:5])

input_ids
 tensor([[50256, 50256, 50256,  ...,   220,   220,   220],
        [50256, 50256, 50256,  ...,   661,   892,    13],
        [50256, 50256, 50256,  ...,   393,  4568,    13],
        [50256, 50256, 50256,  ..., 29340,    13,   220],
        [50256, 50256, 50256,  ...,   220,   220,   220]])
attention_mask
 tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]])
labels
 tensor([3.5000, 2.5000, 3.0000, 4.5000, 2.5000], dtype=torch.float64)


In [21]:
# Create training, validation, and test sets

n1 = int(0.8*len(inputs['input_ids']))
n2 = int(0.9*len(inputs['input_ids']))
         
Xtr = inputs['input_ids'][:n1]
Ytr = inputs['labels'][:n1]
tr_mask = inputs['attention_mask'][:n1]

Xdev = inputs['input_ids'][n1:n2]
Ydev = inputs['labels'][n1:n2]
dev_mask = inputs['attention_mask'][n1:n2]

Xte = inputs['input_ids'][n2:]
Yte = inputs['labels'][n2:]
te_mask = inputs['attention_mask'][n2:]

print(Xtr.shape, Ytr.shape)
print(Xdev.shape, Ydev.shape)
print(Xte.shape, Yte.shape)

torch.Size([1564, 1024]) torch.Size([1564])
torch.Size([195, 1024]) torch.Size([195])
torch.Size([196, 1024]) torch.Size([196])


In [22]:
class DataLoads(Dataset):
    
    def __init__(self, X, Y, Mask):
        self.x = X
        self.y = Y
        self.mask = Mask
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        return {
            'input_ids':self.x[idx],
            'attention_mask':self.mask[idx],
            'labels': self.y[idx]
        }

In [23]:
# Prepare data for input to transformer
tr_loader = DataLoads(Xtr, Ytr, tr_mask)
dev_loader = DataLoads(Xdev, Ydev, dev_mask)
te_loader = DataLoads(Xte, Yte, te_mask)

trainset = DataLoader(tr_loader, shuffle=True, batch_size=3)
devset = DataLoader(dev_loader, shuffle=False, batch_size=3)
teset = DataLoader(te_loader, shuffle=False, batch_size=3)

In [24]:
# Instantiate configuration class to store config params for GTP2Model
model_config = GPT2Config.from_pretrained("gpt2", num_labels=9)

# Get Huggingface model
model = GPT2ForSequenceClassification.from_pretrained("gpt2", config=model_config)

# Fix model with padding token id
model.config.pad_token_id = model.config.eos_token_id

# Load model to defined device
model.to(device)

n_params = sum(p.numel() for p in model.parameters())
print("number of parameters: %.2fM" % (n_params/1e6,))

print("Model loaded to", device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


number of parameters: 124.45M
Model loaded to cuda


In [25]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Model parameters
optimizer = torch.optim.AdamW(model.parameters(),
                 lr = 2e-5,
                 eps = 1e-8)


# Training steps (num_batches * num_epochs)
epochs = 4
train_steps = len(trainset) * epochs

# Learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps = 0,
                                           num_training_steps = train_steps)

# Store the average loss after each epoch so we can plot them.
all_loss = {'train_loss':[], 'val_loss':[]}
all_acc = {'train_acc':[], 'val_acc':[]}

In [26]:
# Training loop functions
def train(dataloader, optimizer_, scheduler_, device_):

    # Use global variable for model.
    global model

    # Tracking variables.
    predictions_labels = []
    true_labels = []
    
    # Total loss for this epoch.
    total_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    for batch in tqdm(dataloader, total=len(dataloader)):

        # Add original labels - use later for evaluation.
        true_labels += batch['labels'].numpy().flatten().tolist()

        # move batch to device
        batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}

        # Always clear any previously calculated gradients before performing a
        # backward pass.
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        # The documentation for this a bert model function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(**batch)

        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple along with the logits. We will use logits
        # later to calculate training accuracy.
        loss, logits = outputs[:2]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer_.step()

        # Update the learning rate.
        scheduler_.step()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()

        # Convert these logits to list of predicted labels values.
        predictions_labels += logits.argmax(axis=-1).flatten().tolist()

    # Calculate the average loss over the training data.
    avg_epoch_loss = total_loss / len(dataloader)

    # Return all true labels and prediction for future evaluations.
    return true_labels, predictions_labels, avg_epoch_loss

In [27]:
def validation(dataloader, device_):
    
    # Use global variable for model.
    global model

    # Tracking variables
    predictions_labels = []
    true_labels = []
    #total loss for this epoch.
    total_loss = 0

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Evaluate data for one epoch
    for batch in tqdm(dataloader, total=len(dataloader)):

        # add original labels
        true_labels += batch['labels'].numpy().flatten().tolist()

        # move batch to device
        batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(**batch)

            # The call to `model` always returns a tuple, so we need to pull the 
            # loss value out of the tuple along with the logits. We will use logits
            # later to to calculate training accuracy.
            loss, logits = outputs[:2]

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()

            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value 
            # from the tensor.
            total_loss += loss.item()

            # get predicitons to list
            predict_content = logits.argmax(axis=-1).flatten().tolist()

            # update list
            predictions_labels += predict_content

    # Calculate the average loss over the training data.
    avg_epoch_loss = total_loss / len(dataloader)

    # Return all true labels and prediciton for future evaluations.
    return true_labels, predictions_labels, avg_epoch_loss

In [28]:
# Training loop
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score

for epoch in tqdm(range(epochs)):
    print("Training on batches...")
    train_labels, train_predict, train_loss = train(trainset, optimizer, scheduler, device)
#     train_acc = accuracy_score(train_labels, train_predict)
    
    dev_labels, dev_predict, dev_loss = validation(devset, device)
#     dev_acc = accuracy_score(dev_labels, dev_predict)

    print("train_loss: %.5f - val_loss: %.5f", (train_loss, dev_loss))
#     print("  train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f"%\
#           (train_loss, dev_loss, train_acc, dev_acc))
    
    # Store the loss value for plotting the learning curve.
    all_loss['train_loss'].append(train_loss)
    all_loss['val_loss'].append(dev_loss)
#     all_acc['train_acc'].append(train_acc)
#     all_acc['val_acc'].append(dev_acc)
    
    print(f"train loss {all_loss['train_loss']}")
    print(f"val loss {all_loss['val_loss']}")
#     print(f"train acc {all_loss['train_acc']}")
#     print(f"val acc {all_loss['val_acc']}")

  0%|          | 0/4 [00:00<?, ?it/s]

Training on batches...


  0%|          | 0/522 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

train_loss: %.5f - val_loss: %.5f (1.2362767130536139, 1.0320717790952096)
train loss [1.2362767130536139]
val loss [1.0320717790952096]
Training on batches...


  0%|          | 0/522 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

train_loss: %.5f - val_loss: %.5f (1.079670857481116, 0.9747170241979453)
train loss [1.2362767130536139, 1.079670857481116]
val loss [1.0320717790952096, 0.9747170241979453]
Training on batches...


  0%|          | 0/522 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

train_loss: %.5f - val_loss: %.5f (1.0269048695820044, 0.9537950552426852)
train loss [1.2362767130536139, 1.079670857481116, 1.0269048695820044]
val loss [1.0320717790952096, 0.9747170241979453, 0.9537950552426852]
Training on batches...


  0%|          | 0/522 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

train_loss: %.5f - val_loss: %.5f (0.9464822969617058, 0.9867872591202076)
train loss [1.2362767130536139, 1.079670857481116, 1.0269048695820044, 0.9464822969617058]
val loss [1.0320717790952096, 0.9747170241979453, 0.9537950552426852, 0.9867872591202076]


In [29]:
torch.cuda.empty_cache()

In [31]:
# Save the model's state dictionary
PATH =r"C:\Users\Hedronstone\Desktop\miniGPT2\state_dicts\cohesion.pt"
torch.save(model.state_dict(), PATH)