Imports

Main resource: https://www.topbots.com/gpt2-text-classification-using-hugging-face-transformers/

In [None]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import pyarrow
if int(pyarrow.__version__.split('.')[1]) < 16 and int(pyarrow.__version__.split('.')[0]) == 0:
    import os
    os.kill(os.getpid(), 9)
from sklearn.metrics import accuracy_score
from datasets import list_datasets, list_metrics, load_dataset_builder, load_dataset, load_metric, Dataset
from transformers import AutoTokenizer, set_seed

In [None]:
set_seed(1111)
torch.backends.cudnn.deterministic = True

In [None]:
import tensorflow as tf

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


There are 1 GPU(s) available.
We will use the GPU: Tesla T4


# Load and Preprocess Dataset

In [None]:
mnli = load_dataset('multi_nli', split='train')
mnli = pd.DataFrame(mnli)



In [None]:
test = mnli[mnli['genre'] == 'telephone']
train = mnli[mnli['genre'] != 'telephone']

In [None]:

test = test.loc[:, ['premise', 'hypothesis', 'label']]
test = test.iloc[0:1600,:]
test.head(3)

Unnamed: 0,premise,hypothesis,label
1,you know during the season and i guess at at y...,You lose the things to the following level if ...,0
4,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.,1
5,my walkman broke so i'm upset now i just have ...,I'm upset that my walkman broke and now I have...,0


In [None]:
train_df = train.loc[:, ['premise', 'hypothesis', 'label']]
train_df = train_df.iloc[0:8000,:]
train_df.head(3)

Unnamed: 0,premise,hypothesis,label
0,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...,1
2,One of our number will carry out your instruct...,A member of my team will execute your orders w...,0
3,How do you know? All this is their information...,This information belongs to them.,0


In [None]:
train = train_df.sample(frac=.8)
val = train_df.drop(train.index)
print(train.shape)
print(val.shape)

(6400, 3)
(1600, 3)


In [None]:
# add special tokens
train['premise'] = train['premise'] + ' [SEP] '
val['premise'] =  val['premise'] + ' [SEP] '
test['premise'] =  test['premise'] + ' [SEP] '
train['hypothesis'] = train['hypothesis'] 
val['hypothesis'] = val['hypothesis'] 
test['hypothesis'] = test['hypothesis'] 

# combine premise & hypothesis
train['sequence'] = train['premise'] + train['hypothesis']
test['sequence'] = test['premise'] + test['hypothesis']
val['sequence'] = val['premise'] + val['hypothesis']


In [None]:
train = train.loc[:,['label', 'sequence']]
test = test.loc[:,['label', 'sequence']]
val = val.loc[:,['label', 'sequence']]

In [None]:
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)
val = Dataset.from_pandas(val)

In [None]:
train = train.remove_columns('__index_level_0__')
train[0:5]

{'label': [2, 0, 0, 0, 0],
 'sequence': ["But don't think this lets those bastards at Delta off the hook. [SEP] Delta are not bastards.",
  "He's just a boy. Rennie did not utter that as an excuse; rather he said it as if to reassure himself. [SEP] Rennie told himself that the person was just a boy.",
  "America's children are particularly affected by poverty. [SEP] American's children are very affected by poverty ",
  "In addition, contrary to recent assertions, we are not seeking the minutes of these meetings or related notes of the Vice President's staff. [SEP] To add to that, contrary to recent assertions, we are not seeking the minutes of these meetings or related notes of the Vice President's staff.",
  'Second, several special rate arrangements would be discontinued. [SEP] Many special rate arrangements would be stopped.']}

In [None]:
test = test.remove_columns('__index_level_0__')
test[0:5]

{'label': [0, 1, 0, 0, 0],
 'sequence': ['you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him [SEP] You lose the things to the following level if the people recall.',
  "yeah i tell you what though if you go price some of those tennis shoes i can see why now you know they're getting up in the hundred dollar range [SEP] The tennis shoes have a range of prices.",
  "my walkman broke so i'm upset now i just have to turn the stereo up real loud [SEP] I'm upset that my walkman broke and now I have to turn the stereo up really loud.",
  'well you see that on television also [SEP] You can see that on television, as well.',
  "well it's been very interesting [SEP] It has been very intriguing."]}

In [None]:
val = val.remove_columns('__index_level_0__')
val[0:5]

{'label': [0, 0, 0, 2, 2],
 'sequence': ['How do you know? All this is their information again. [SEP] This information belongs to them.',
  'I burst through a set of cabin doors, and fell to the ground- [SEP] I burst through the doors and fell down.',
  'Issues in Data Synthesis. [SEP] Problems in data synthesis.',
  'Vrenna and I both fought him and he nearly took us. [SEP] Neither Vrenna nor myself have ever fought him.',
  'This analysis pooled estimates from these two studies to develop a C-R function linking PM to chronic bronchitis. [SEP] The analysis proves that there is no link between PM and bronchitis.']}

Tokenize Dataset

In [None]:
from transformers import GPT2Tokenizer, GPT2Config, set_seed, TrainingArguments,Trainer, AdamW, get_linear_schedule_with_warmup, GPT2ForSequenceClassification
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token

In [None]:
class MNLIDataset(Dataset):
  r"""PyTorch Dataset class for loading data.
 
  This is where the data parsing happens.
 
  This class is built with reusability in mind: it can be used as is as.
 
  Arguments:
 
    path (:obj:`str`):
        Path to the data partition.
 
  """
 
  def __init__(self, df, use_tokenizer):
 
    self.dataframe = df
    self.text = self.dataframe['sequence']
    self.label = self.dataframe['label']
 
    # Number of exmaples.
    self.n_examples = len(self.dataframe)
     
 
    return
 
  def __len__(self):
    r"""When used `len` return the number of examples.
 
    """
     
    return self.n_examples
 
  def __getitem__(self, idx):
    r"""Given an index return an example from the position.
     
    Arguments:
 
      idx (:obj:`int`):
          Index position to pick an example to return.
 
    Returns:
      :obj:`Dict[str, str]`: Dictionary of inputs that contain text and 
      asociated labels.
 
    """
    if torch.is_tensor(idx):
            idx = idx.tolist()

 
    return {'text': self.text[idx],
            'label': self.label[idx]}

In [None]:
class Gpt2ClassificationCollator(object):
    r"""
    Data Collator used for GPT2 in a classificaiton rask. 
     
    It uses a given tokenizer and label encoder to convert any text and labels 
    to numbers that can go straight into a GPT2 model.
 
    This class is built with reusability in mind: it can be used as is as long
    as the `dataloader` outputs a batch in dictionary format that can be passed 
    straight into the model - `model(**batch)`.
 
    Arguments:
 
      use_tokenizer (:obj:`transformers.tokenization_?`):
          Transformer type tokenizer used to process raw text into numbers.
 
      labels_ids (:obj:`dict`):
          Dictionary to encode any labels names into numbers. Keys map to 
          labels names and Values map to number associated to those labels.
 
      max_sequence_len (:obj:`int`, `optional`)
          Value to indicate the maximum desired sequence to truncate or pad text
          sequences. If no value is passed it will used maximum sequence size
          supported by the tokenizer and model.
 
    """
 
    def __init__(self, use_tokenizer, labels_encoder, max_sequence_len=None):
 
        # Tokenizer to be used inside the class.
        self.use_tokenizer = use_tokenizer
        # Check max sequence length.
        self.max_sequence_len = use_tokenizer.model_max_length if max_sequence_len is None else max_sequence_len
        # Label encoder used inside the class.
        self.labels_encoder = labels_encoder
 
        return
 
    def __call__(self, sequences):
        r"""
        This function allowes the class objesct to be used as a function call.
        Sine the PyTorch DataLoader needs a collator function, I can use this 
        class as a function.
 
        Arguments:
 
          item (:obj:`list`):
              List of texts and labels.
 
        Returns:
          :obj:`Dict[str, object]`: Dictionary of inputs that feed into the model.
          It holddes the statement `model(**Returned Dictionary)`.
        """
        # Get all texts from sequences list.
        texts = [sequence['text'] for sequence in sequences]
        # Get all labels from sequences list.
        labels = [sequence['label'] for sequence in sequences]
        # Encode all labels using label encoder.
        # labels = [self.labels_encoder[label] for label in labels]
        # Call tokenizer on all texts to convert into tensors of numbers with 
        # appropriate padding.
        inputs = self.use_tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True,  max_length=self.max_sequence_len)
        # Update the inputs with the associated encoded labels as tensor.
        inputs.update({'labels':torch.tensor(labels)})
 
        return inputs

In [None]:
def train_model(dataloader, optimizer_, scheduler_, device_):
  r"""
  Train pytorch model on a single pass through the data loader.
 
  It will use the global variable `model` which is the transformer model 
  loaded on `_device` that we want to train on.
 
  This function is built with reusability in mind: it can be used as is as long
    as the `dataloader` outputs a batch in dictionary format that can be passed 
    straight into the model - `model(**batch)`.
 
  Arguments:
 
      dataloader (:obj:`torch.utils.data.dataloader.DataLoader`):
          Parsed data into batches of tensors.
 
      optimizer_ (:obj:`transformers.optimization.AdamW`):
          Optimizer used for training.
 
      scheduler_ (:obj:`torch.optim.lr_scheduler.LambdaLR`):
          PyTorch scheduler.
 
      device_ (:obj:`torch.device`):
          Device used to load tensors before feeding to model.
 
  Returns:
 
      :obj:`List[List[int], List[int], float]`: List of [True Labels, Predicted
        Labels, Train Average Loss].
  """
 
  # Use global variable for model.
  global model
 
  # Tracking variables.
  predictions_labels = []
  true_labels = []
  # Total loss for this epoch.
  total_loss = 0
  total_acc = 0
 
  # Put the model into training mode.
  model.train()
 
  # For each batch of training data...
  for batch in tqdm(dataloader, total=len(dataloader)):
 
    # Add original labels - use later for evaluation.
    true_labels += batch['labels'].numpy().flatten().tolist()
     
    # move batch to device
    batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}
     
    # Always clear any previously calculated gradients before performing a
    # backward pass.
    model.zero_grad()
 
    # Perform a forward pass (evaluate the model on this training batch).
    # This will return the loss (rather than the model output) because we
    # have provided the `labels`.
    # The documentation for this a bert model function is here: 
    # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
    outputs = model(**batch)
 
    # The call to `model` always returns a tuple, so we need to pull the 
    # loss value out of the tuple along with the logits. We will use logits
    # later to calculate training accuracy.
    loss, logits = outputs[:2]
 
    # Accumulate the training loss over all of the batches so that we can
    # calculate the average loss at the end. `loss` is a Tensor containing a
    # single value; the `.item()` function just returns the Python value 
    # from the tensor.
    total_loss += loss.item()
 
    # Perform a backward pass to calculate the gradients.
    loss.backward()
 
    # Clip the norm of the gradients to 1.0.
    # This is to help prevent the "exploding gradients" problem.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
 
    # Update parameters and take a step using the computed gradient.
    # The optimizer dictates the "update rule"--how the parameters are
    # modified based on their gradients, the learning rate, etc.
    optimizer_.step()
 
    # Update the learning rate.
    scheduler_.step()
 
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
 
    # Convert these logits to list of predicted labels values.
    predictions_labels += logits.argmax(axis=-1).flatten().tolist()

    # Calculate accuracy
    acc = accuracy_score(predictions_labels, true_labels)

    # Accumulate the accuracy over the batches
    total_acc += acc.item()
 
  # Calculate the average loss over the training data.
  avg_epoch_loss = total_loss / len(dataloader)
  avg_epoch_acc = total_acc /len(dataloader)
   
  # Return all true labels and prediction for future evaluations.
  return true_labels, predictions_labels, avg_epoch_loss, avg_epoch_acc

In [None]:
!pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from torchmetrics.classification import MulticlassCalibrationError
def validation(dataloader, device_):
  r"""Validation function to evaluate model performance on a 
  separate set of data.
 
  This function will return the true and predicted labels so we can use later
  to evaluate the model's performance.
 
  This function is built with reusability in mind: it can be used as is as long
    as the `dataloader` outputs a batch in dictionary format that can be passed 
    straight into the model - `model(**batch)`.
 
  Arguments:
 
    dataloader (:obj:`torch.utils.data.dataloader.DataLoader`):
          Parsed data into batches of tensors.
 
    device_ (:obj:`torch.device`):
          Device used to load tensors before feeding to model.
 
  Returns:
     
    :obj:`List[List[int], List[int], float]`: List of [True Labels, Predicted
        Labels, Train Average Loss]
  """
 
  # Use global variable for model.
  global model
 
  # Tracking variables
  predictions_labels = []
  predictions_logits = []
  true_labels = []
  #total loss for this epoch.
  total_loss = 0
  total_acc = 0
  total_ece = 0
 
  # Put the model in evaluation mode--the dropout layers behave differently
  # during evaluation.
  model.eval()
 
  # Evaluate data for one epoch
  for batch in tqdm(dataloader, total=len(dataloader)):
 
    # add original labels
    true_labels += batch['labels'].numpy().flatten().tolist()
 
    # move batch to device
    batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}
 
    # Telling the model not to compute or store gradients, saving memory and
    # speeding up validation
    with torch.no_grad():        
 
        # Forward pass, calculate logit predictions.
        # This will return the logits rather than the loss because we have
        # not provided labels.
        # token_type_ids is the same as the "segment ids", which 
        # differentiates sentence 1 and 2 in 2-sentence tasks.
        # The documentation for this `model` function is here: 
        # sformers/v2.https://huggingface.co/tran2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(**batch)
 
        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple along with the logits. We will use logits
        # later to to calculate training accuracy.
        loss, logits = outputs[:2]
         
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
 
        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()
         
        # get predictions to list
        predict_content = logits.argmax(axis=-1).flatten().tolist()
 
        # update list
        predictions_labels += predict_content

        # update logits
        predictions_logits.append(logits)

        # Accuracy
        acc = accuracy_score(predictions_labels, true_labels)
        total_acc += acc.item()

        # ECE
        metric = MulticlassCalibrationError(num_classes=3, n_bins=10, norm='l1')
        pred_logits = np.array(predictions_logits)
        pred_logits = np.reshape(pred_logits, (pred_logits.shape[0]*pred_logits.shape[1], 3))
        preds = torch.Tensor(pred_logits)
        true = torch.Tensor(true_labels)
        ece_score = metric(preds, true)
        total_ece += ece_score.item()
 
  # Calculate the average loss over the training data.
  avg_epoch_loss = total_loss / len(dataloader)
  avg_epoch_acc = total_acc / len(dataloader)
  avg_epoch_ece = total_ece / len(dataloader)
  
 
  # Return all true labels and prediciton for future evaluations.
  return true_labels, predictions_labels, avg_epoch_loss, avg_epoch_acc, avg_epoch_ece

# Load GPT2 Model

In [None]:
model_name_or_path = 'gpt2'
max_length = None
batch_size = 10
epochs = 3
labels_ids = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
n_labels = len(labels_ids)

In [None]:
# Get model configuration.
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_name_or_path, num_labels=n_labels)

# Get the actual model.
print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config)
 
# resize model embedding to match new tokenizer
model.resize_token_embeddings(len(tokenizer))
 
# fix model padding token id
model.config.pad_token_id = model.config.eos_token_id
 
# Load model to defined device.
model.to(device)

Loading model...


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid

In [None]:
# Create data collator to encode text and labels into numbers.
gpt2_classification_collator = Gpt2ClassificationCollator(use_tokenizer=tokenizer, 
                                                          labels_encoder=labels_ids, 
                                                          max_sequence_len=max_length)

train_df = MNLIDataset(train, use_tokenizer=tokenizer)
test_df = MNLIDataset(test, use_tokenizer=tokenizer)
val_df = MNLIDataset(val, use_tokenizer=tokenizer)

train_dataloader = DataLoader(train_df, batch_size=batch_size, shuffle=True, 
                              collate_fn=gpt2_classification_collator)
validation_dataloader = DataLoader(val_df, batch_size=batch_size, shuffle=True, 
                                   collate_fn=gpt2_classification_collator)
test_dataloader = DataLoader(test_df, batch_size=batch_size, shuffle=True, 
                             collate_fn=gpt2_classification_collator)

In [None]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # default is 1e-8.
                  )
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)
 
# Store the average loss after each epoch so we can plot them.
all_loss = {'train_loss':[], 'val_loss':[]}
all_acc = {'train_acc':[], 'val_acc':[]}

best_valid_loss = float('inf')

# Loop through each epoch.
print('Epoch')
for epoch in tqdm(range(epochs)):
  print()
  print('Training on batches...')
  # Perform one full pass over the training set.
  train_labels, train_predict, train_loss, train_acc = train_model(train_dataloader, optimizer, scheduler, device)
 
  # Get prediction form model on validation data. 
  print('Validation on batches...')
  valid_labels, valid_predict, val_loss, val_acc, val_ece = validation(validation_dataloader, device)
 
  if val_loss < best_valid_loss:
        best_valid_loss = val_loss
        torch.save(model.state_dict(), 'gpt2-nli.pt')
  # Print loss and accuracy values to see how training evolves.
  print("  train_loss: %.5f - train_acc: %.5f - val_loss: %.5f - val_acc: %.5f - val_ece: %.5f"%(train_loss, train_acc, val_loss, val_acc, val_ece))
  print()
 
  # Store the loss value for plotting the learning curve.
  all_loss['train_loss'].append(train_loss)
  all_loss['val_loss'].append(val_loss)
  all_acc['train_acc'].append(train_acc)
  all_acc['val_acc'].append(val_acc)


Epoch




  0%|          | 0/3 [00:00<?, ?it/s]


Training on batches...


  0%|          | 0/640 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/160 [00:00<?, ?it/s]

  train_loss: 1.16086 - train_acc: 0.33810 - val_loss: 1.09574 - val_acc: 0.38130 - val_ece: 0.04163


Training on batches...


  0%|          | 0/640 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/160 [00:00<?, ?it/s]

  train_loss: 1.10335 - train_acc: 0.36992 - val_loss: 1.08477 - val_acc: 0.38117 - val_ece: 0.02236


Training on batches...


  0%|          | 0/640 [00:00<?, ?it/s]

Validation on batches...


  0%|          | 0/160 [00:00<?, ?it/s]

  train_loss: 1.09259 - train_acc: 0.36458 - val_loss: 1.08172 - val_acc: 0.38759 - val_ece: 0.02889



In [None]:
# Get prediction form model on validation data. This is where you should use
# your test data.
true_labels, predictions_labels, avg_epoch_loss, avg_epoch_acc, avg_epoch_ece = validation(test_dataloader, device)
print("test_loss: %.5f - test_acc: %.5f - test_ece: %.5f"%(avg_epoch_loss, avg_epoch_acc, avg_epoch_ece))

  0%|          | 0/160 [00:00<?, ?it/s]

test_loss: 1.08970 - test_acc: 0.34721 - test_ece: 0.03103
