In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# This notebook follows the Hugging Face course tutorial on question answering:
# https://huggingface.co/course/chapter7/7?fw=tf

In [None]:
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import collections
from transformers import DefaultDataCollator, GPT2TokenizerFast, AutoConfig, GPT2Model,Trainer, TrainingArguments
from transformers import create_optimizer
import tensorflow as tf
from transformers.keras_callbacks import PushToHubCallback
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.nn.parameter import Parameter

In [None]:
# Name of the checkpoint the tokenizer is loaded from.
model_checkpoint = "gpt2"
tokenizer = GPT2TokenizerFast.from_pretrained(model_checkpoint)
# The preprocessing functions below pad with padding="max_length", which needs a
# pad token; the end-of-sequence token is reused for that purpose.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
!pip install ipdb
import ipdb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Passed to the tokenizer as `max_length`: every feature is padded/truncated to this many tokens.
max_length = 384
# Passed to the tokenizer as `stride` when a context overflows into several features.
stride = 128

def preprocess_training_examples(examples):
    """Tokenise a batch of examples and attach token-level answer labels.

    Each question/context pair is encoded with the module-level ``tokenizer``
    (``max_length``/``stride`` windows, overflow enabled), so one example may
    produce several features.

    Args:
        examples: batch mapping with ``"question"``, ``"context"`` and
            ``"answers"`` columns; only the first answer of each example
            (``answer_start[0]`` / ``text[0]``) is used.

    Returns:
        The tokenizer output without ``offset_mapping`` and
        ``overflow_to_sample_mapping``, extended with ``start_positions`` and
        ``end_positions`` (one entry per feature). A feature whose context
        window does not fully contain the answer is labelled ``(0, 0)``.
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions, end_positions = [], []

    for feature_idx, offsets in enumerate(offset_mapping):
        # Character span of the first reference answer of the source example.
        answer = answers[sample_map[feature_idx]]
        answer_start = answer["answer_start"][0]
        answer_end = answer["answer_start"][0] + len(answer["text"][0])
        seq_ids = inputs.sequence_ids(feature_idx)

        # Locate the first and last context token (sequence id 1). The context
        # may run up to the very last position, hence the bounds check.
        cursor = 0
        while seq_ids[cursor] != 1:
            cursor += 1
        ctx_first = cursor
        while cursor < len(seq_ids) and seq_ids[cursor] == 1:
            cursor += 1
        ctx_last = cursor - 1

        answer_in_window = (
            offsets[ctx_first][0] <= answer_start
            and offsets[ctx_last][1] >= answer_end
        )
        if not answer_in_window:
            # Answer not fully inside this window: label is (0, 0).
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Last context token that starts at or before the answer start.
        token = ctx_first
        while token <= ctx_last and offsets[token][0] <= answer_start:
            token += 1
        start_positions.append(token - 1)

        # First context token (scanning backwards) that ends at or after the answer end.
        token = ctx_last
        while token >= ctx_first and offsets[token][1] >= answer_end:
            token -= 1
        end_positions.append(token + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

def preprocess_validation_examples(examples):
    """Tokenise a batch of examples for answer decoding (no labels).

    Uses the same tokenizer settings as the training preprocessing, but keeps
    what is needed to map predicted token spans back to text.

    Args:
        examples: batch mapping with ``"question"``, ``"context"`` and ``"id"``
            columns.

    Returns:
        The tokenizer output without ``overflow_to_sample_mapping``, where
        ``offset_mapping`` entries of non-context tokens are replaced by
        ``None`` and ``example_id`` records the source example of each feature.
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for feature_idx in range(len(inputs["input_ids"])):
        # Remember which original example this feature was cut from.
        example_ids.append(examples["id"][sample_map[feature_idx]])

        # Blank out offsets of every token that is not part of the context
        # (sequence id 1) so decoding can skip those positions.
        seq_ids = inputs.sequence_ids(feature_idx)
        feature_offsets = inputs["offset_mapping"][feature_idx]
        inputs["offset_mapping"][feature_idx] = [
            span if seq_ids[position] == 1 else None
            for position, span in enumerate(feature_offsets)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [None]:
# Load the "squad" dataset; the bare expression below displays its splits.
squad = load_dataset("squad")
squad

Downloading and preparing dataset squad/plain_text to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [None]:
# Tokenise the training split in batches and attach start/end labels.
# The original columns are dropped so only the tokenizer output and labels remain.
train_dataset = squad["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=squad["train"].column_names,
)


  0%|          | 0/88 [00:00<?, ?ba/s]

In [None]:
# Display the raw validation split (features and row count).
squad["validation"]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10570
})

In [None]:
# Using validation as test set
test_dataset = squad["validation"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=squad["validation"].column_names,
)


  0%|          | 0/11 [00:00<?, ?ba/s]

In [None]:
# Using validation as test set
test_dataset2 = squad["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=squad["validation"].column_names,
)

  0%|          | 0/11 [00:00<?, ?ba/s]

In [None]:
# Spot-check: start-token label of the first test feature.
test_dataset[0]['start_positions']

45

In [None]:
import numpy as np
class QAData(Dataset):
    """Torch ``Dataset`` adapter over a sequence of tokenised QA features.

    Every item is a dict of ``torch.long`` tensors holding the model inputs
    (``input_ids``, ``attention_mask``) and the answer labels
    (``start_positions``, ``end_positions``).
    """

    # Fields copied from each record, in the order they appear in the item dict.
    _FIELDS = ("input_ids", "attention_mask", "start_positions", "end_positions")

    def __init__(self, dataframe):
        # Any indexable, sized collection of records with the fields above.
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        record = self.dataframe[index]
        return {
            field: torch.tensor(record[field], dtype=torch.long)
            for field in self._FIELDS
        }

In [None]:
# Wrap the tokenised splits so a DataLoader receives dicts of tensors.
training_set = QAData(train_dataset)
testing_set = QAData(test_dataset)

In [None]:
class GPT2ForQuestionAnswering(GPT2Model):
    """GPT-2 with a span-extraction head for extractive question answering.

    The class subclasses ``GPT2Model``, so it already *is* the transformer
    backbone; the only extra parameters are those of ``qa_outputs``. (An
    earlier revision additionally built a second ``GPT2Model`` in
    ``self.gpt2``, which left the inherited layers unused while still
    occupying memory and checkpoint space.)

    ``forward`` returns a tuple:
    ``(loss), start_logits, end_logits, (hidden_states), (attentions)``
    where ``loss`` is present only when both label tensors are given and the
    trailing entries are whatever the backbone returns after its first two
    outputs.
    """

    # Key prefix under which the earlier revision stored the backbone weights.
    _LEGACY_BACKBONE_PREFIX = "gpt2."

    def __init__(self, config):
        super(GPT2ForQuestionAnswering, self).__init__(config)
        self.num_labels = config.num_labels

        # One linear layer applied to every token's hidden state. `forward`
        # splits its output into a start score and an end score, so
        # `config.num_labels` must be 2.
        self.qa_outputs = torch.nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def load_state_dict(self, state_dict, *args, **kwargs):
        """Load weights, accepting checkpoints saved by the earlier layout.

        If any key starts with ``gpt2.``, those entries are treated as the
        backbone (prefix stripped), ``qa_outputs.*`` is kept, and the remaining
        top-level entries (the copy the old ``forward`` never used) are
        dropped. All other arguments are forwarded unchanged.
        """
        prefix = self._LEGACY_BACKBONE_PREFIX
        if any(key.startswith(prefix) for key in state_dict):
            state_dict = {
                (key[len(prefix):] if key.startswith(prefix) else key): value
                for key, value in state_dict.items()
                if key.startswith(prefix) or key.startswith("qa_outputs.")
            }
        return super().load_state_dict(state_dict, *args, **kwargs)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
    ):
        """Score every token as a possible answer start and answer end.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids,
            head_mask, inputs_embeds: forwarded unchanged to the GPT-2 backbone.
            start_positions, end_positions: optional label tensors with the
                token index of the answer start/end for each item.

        Returns:
            ``(start_logits, end_logits) + extras``, preceded by the mean of
            the start and end cross-entropy losses when both labels are given.
        """
        # Run the inherited GPT-2 forward pass (this class is the backbone).
        outputs = super().forward(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        # First backbone output: one hidden vector per token.
        sequence_output = outputs[0]

        # Project each token's hidden vector to `num_labels` scores, then split
        # the last dimension into one "start" and one "end" score per token.
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        outputs = (start_logits, end_logits,) + outputs[2:]
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Positions outside the model inputs are clamped to `ignored_index`
            # and then skipped by the loss. Out-of-place clamp: the caller's
            # label tensors must not be modified.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            outputs = (total_loss,) + outputs

        return outputs
# (loss), start_logits, end_logits, (hidden_states), (attentions)

In [None]:
# Configuration of the "gpt2" checkpoint (architecture settings only, no weights).
config = AutoConfig.from_pretrained("gpt2")
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Displayed because it sets the output width of the QA head (`qa_outputs`).
config.num_labels

2

In [None]:
# Building the model from `config` alone gives randomly initialised weights, so
# the pretrained GPT-2 backbone is copied in afterwards; fine-tuning then starts
# from the checkpoint rather than from scratch.
model = GPT2ForQuestionAnswering(config)

pretrained_backbone = GPT2Model.from_pretrained(model_checkpoint)
# The backbone lives in `model.gpt2` when the wrapper holds it as a sub-module;
# otherwise the wrapper itself carries the GPT-2 layers.
backbone = getattr(model, "gpt2", model)
load_report = backbone.load_state_dict(pretrained_backbone.state_dict(), strict=False)
# Only the freshly initialised QA head may be absent from the checkpoint.
assert not load_report.unexpected_keys, load_report.unexpected_keys
assert all(key.startswith("qa_outputs.") for key in load_report.missing_keys), load_report.missing_keys
del pretrained_backbone

model.to(device)

GPT2ForQuestionAnswering(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0): GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (1): GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e

In [None]:
TRAIN_BATCH_SIZE = 16
# Keep at 1: eval_model stores one logits array per batch and compute_metrics
# reads `start_logits[feature_index][0]`, i.e. it expects one feature per batch.
VALID_BATCH_SIZE = 1

# Training batches are reshuffled by the loader.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation keeps dataset order so predictions line up with `test_dataset2` features.
val_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

train_loader = DataLoader(training_set, **train_params)
test_loader = DataLoader(testing_set, **val_params)

In [None]:
# Number of training epochs shown in the training cell's progress header.
EPOCHS = 1

# Adam over every model parameter with a fixed learning rate (no scheduler).
optimizer = torch.optim.Adam(params =  model.parameters(), lr=5e-5)

In [None]:
# Peek at the first training batch to confirm which tensors the loader yields.
# (The former `if(i==0)` guard was always true because the loop breaks
# unconditionally after its first iteration.)
for b in train_loader:
    print(b.keys())
    break

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])


In [None]:
# Function for a single training iteration
def train_epoch(model, training_loader, optimizer, device):
    """Run one pass over ``training_loader`` and return the mean batch loss.

    Args:
        model: module whose forward accepts ``input_ids``, ``attention_mask``,
            ``start_positions`` and ``end_positions`` and returns a tuple whose
            first element is the loss.
        training_loader: iterable of batches (dicts holding those four tensors);
            must support ``len()``.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model = model.train()
    total_train_loss = 0

    for batch in tqdm(training_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Clear any gradients left on the model before this batch.
        model.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions,
        )
        # Index instead of unpacking exactly three values: the model may append
        # further outputs (hidden states, attentions) after the logits.
        loss = outputs[0]

        total_train_loss += loss.item()

        # Backward prop
        loss.backward()

        # Gradient descent with the gradient norm clipped to 1.0.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # No learning-rate scheduler is used.
        optimizer.zero_grad()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(training_loader)

    return avg_train_loss

In [None]:
%%time
import warnings
warnings.filterwarnings("ignore")

from collections import defaultdict
    
# Show details 
print(f"Epoch {1}/{EPOCHS}")
print("-" * 10)

train_loss = train_epoch(
  model,
  train_loader,
  optimizer,
  device,
)

print(f"Train loss {train_loss}")

torch.save(model.state_dict(), 'GPT2_QA_Model')

Epoch 1/1
----------


  0%|          | 0/5532 [00:00<?, ?it/s]

Train loss 4.093972316877789
CPU times: user 28min 38s, sys: 5.43 s, total: 28min 43s
Wall time: 28min 35s


In [None]:
# runtime 28min 35s

In [None]:
# save model
# Colab-only: mounts Google Drive and writes the model's state dict there so it
# outlives the runtime.

from google.colab import drive
drive.mount('/content/gdrive')

# Location of the checkpoint inside the mounted Drive.
PATH = F"/content/gdrive/My Drive/Models/GPT2_QA_Model"
torch.save(model.state_dict(), PATH)

In [None]:
# Restore the saved weights (lets a fresh session skip training).
from google.colab import drive
drive.mount('/content/gdrive')

PATH = F"/content/gdrive/My Drive/Models/GPT2_QA_Model"

# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime also loads on a CPU-only one.
model.load_state_dict(torch.load(PATH, map_location=device))

Mounted at /content/gdrive


<All keys matched successfully>

In [None]:
def eval_model(model, test_loader, device):
    """Collect start/end logits for every batch of ``test_loader``.

    The model is called without labels, so batches only need ``input_ids`` and
    ``attention_mask`` (label entries, if present, are ignored) and no loss is
    computed. Without labels the model is expected to return a tuple that
    starts with ``(start_logits, end_logits)``; anything after that is ignored.

    Args:
        model: QA module following the contract above.
        test_loader: iterable of batches (dicts of tensors).
        device: device the input tensors are moved to.

    Returns:
        ``{'start_logits': [...], 'end_logits': [...]}`` with one NumPy array
        per batch, in loader order.
    """
    model = model.eval()

    start_logits = []
    end_logits = []

    # Inference only: no gradients are tracked.
    with torch.no_grad():
        for batch in test_loader:
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            start_logits.append(outputs[0].cpu().numpy())
            end_logits.append(outputs[1].cpu().numpy())

    return {
            'start_logits': start_logits,
            'end_logits': end_logits
        }

In [None]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 5.1 MB/s 
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0


In [None]:
import evaluate
from tqdm.auto import tqdm
import numpy as np 

# Number of top-scoring start and end positions compute_metrics tries per feature.
n_best =20
# Longest answer, in tokens, that compute_metrics will accept.
max_answer_length=30


# Metric object whose `compute` is called with predictions and references.
metric = evaluate.load("squad")

def compute_metrics(start_logits, end_logits, features, examples):
    """Decode the best answer span per example and score it with ``metric``.

    Args:
        start_logits, end_logits: per-feature logits; entry ``k`` is indexed as
            ``[k][0]``, i.e. each entry holds the scores of a single feature
            behind a leading axis of length one.
        features: tokenised features providing ``example_id`` and
            ``offset_mapping`` (``None`` for tokens outside the context).
        examples: original examples providing ``id``, ``context`` and
            ``answers``.

    Returns:
        Whatever ``metric.compute`` returns for the predicted and reference
        answers.
    """
    # Group feature indices by the example they were cut from.
    features_of_example = collections.defaultdict(list)
    for feature_idx, feature in enumerate(features):
        features_of_example[feature["example_id"]].append(feature_idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        candidates = []

        # Gather candidate spans from every feature of this example.
        for feature_idx in features_of_example[example_id]:
            feature_start = start_logits[feature_idx][0]
            feature_end = end_logits[feature_idx][0]
            offsets = features[feature_idx]["offset_mapping"]

            # Indices of the n_best highest logits, best first.
            top_starts = np.argsort(feature_start)[-1 : -n_best - 1 : -1].tolist()
            top_ends = np.argsort(feature_end)[-1 : -n_best - 1 : -1].tolist()

            for start_index in top_starts:
                # A start outside the context can never yield a valid span.
                if offsets[start_index] is None:
                    continue
                for end_index in top_ends:
                    if offsets[end_index] is None:
                        continue
                    # Skip spans that run backwards or exceed max_answer_length.
                    span_length = end_index - start_index + 1
                    if end_index < start_index or span_length > max_answer_length:
                        continue

                    candidates.append(
                        {
                            "text": context[offsets[start_index][0] : offsets[end_index][1]],
                            "logit_score": feature_start[start_index] + feature_end[end_index],
                        }
                    )

        # Keep the highest-scoring candidate, or an empty answer if none survived.
        if candidates:
            best_text = max(candidates, key=lambda cand: cand["logit_score"])["text"]
        else:
            best_text = ""
        predicted_answers.append({"id": example_id, "prediction_text": best_text})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [None]:
# Run the model over the test loader and keep the per-batch start/end logits.
predictions = eval_model(
  model,
  test_loader,
  device,
)

In [None]:
# Sanity check: number of start logits for the first feature (one per token position).
print(len(predictions['start_logits'][0][0]))

384


In [None]:
# Score the predictions: logits come from `test_loader`, while the offsets and
# example ids needed for decoding come from `test_dataset2` (same split, same order).
compute_metrics(
    predictions["start_logits"],
    predictions["end_logits"],
    test_dataset2,
    squad["validation"],
)
# Result recorded for this run:
# {'exact_match': 9.602649006622517, 'f1': 17.45337067684064}

  0%|          | 0/10570 [00:00<?, ?it/s]

{'exact_match': 9.602649006622517, 'f1': 17.45337067684064}

In [None]:
#{'exact_match': 78.30652790917692, 'f1': 86.34045898023999}

In [None]:
from transformers import pipeline  # import kept; the pipeline itself is not used below


# `pipeline("question-answering", ...)` reports the custom
# GPT2ForQuestionAnswering class as unsupported and the cell used to end in a
# RuntimeError, so the answer span is decoded directly from the model's logits.
def question_answerer(question, context):
    """Answer ``question`` from ``context`` with the notebook's ``model``.

    The pair is tokenised as a single window (the context is truncated to
    ``max_length`` tokens), and the best-scoring span is chosen among the
    ``n_best`` start/end candidates that lie inside the context and are at
    most ``max_answer_length`` tokens long.

    Returns:
        dict with ``answer`` (text), ``start``/``end`` (character offsets into
        ``context``) and ``score`` (sum of the start and end logits, not a
        probability). An empty answer with score 0.0 is returned when no
        candidate span is valid.
    """
    encoded = tokenizer(
        question.strip(),
        context,
        max_length=max_length,
        truncation="only_second",
        return_offsets_mapping=True,
        return_tensors="pt",
    )
    offsets = encoded["offset_mapping"][0].tolist()
    sequence_ids = encoded.sequence_ids(0)

    model.eval()
    with torch.no_grad():
        # Without labels the model returns (start_logits, end_logits, ...).
        outputs = model(
            input_ids=encoded["input_ids"].to(device),
            attention_mask=encoded["attention_mask"].to(device),
        )
    start_logit = outputs[0][0].cpu().numpy()
    end_logit = outputs[1][0].cpu().numpy()

    top_starts = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
    top_ends = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()

    best = {"score": 0.0, "start": 0, "end": 0, "answer": ""}
    found = False
    for start_index in top_starts:
        # Only tokens of the context (sequence id 1) can be part of an answer.
        if sequence_ids[start_index] != 1:
            continue
        for end_index in top_ends:
            if sequence_ids[end_index] != 1:
                continue
            if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                continue
            score = float(start_logit[start_index] + end_logit[end_index])
            if not found or score > best["score"]:
                char_start = offsets[start_index][0]
                char_end = offsets[end_index][1]
                best = {
                    "score": score,
                    "start": char_start,
                    "end": char_end,
                    "answer": context[char_start:char_end],
                }
                found = True
    return best


context = """
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question = "Which deep learning libraries back 🤗 Transformers?"
question_answerer(question=question, context=context)

The model 'GPT2ForQuestionAnswering' is not supported for question-answering. Supported models are ['AlbertForQuestionAnswering', 'BartForQuestionAnswering', 'BertForQuestionAnswering', 'BigBirdForQuestionAnswering', 'BigBirdPegasusForQuestionAnswering', 'BloomForQuestionAnswering', 'CamembertForQuestionAnswering', 'CanineForQuestionAnswering', 'ConvBertForQuestionAnswering', 'Data2VecTextForQuestionAnswering', 'DebertaForQuestionAnswering', 'DebertaV2ForQuestionAnswering', 'DistilBertForQuestionAnswering', 'ElectraForQuestionAnswering', 'ErnieForQuestionAnswering', 'FlaubertForQuestionAnsweringSimple', 'FNetForQuestionAnswering', 'FunnelForQuestionAnswering', 'GPTJForQuestionAnswering', 'IBertForQuestionAnswering', 'LayoutLMv2ForQuestionAnswering', 'LayoutLMv3ForQuestionAnswering', 'LEDForQuestionAnswering', 'LiltForQuestionAnswering', 'LongformerForQuestionAnswering', 'LukeForQuestionAnswering', 'LxmertForQuestionAnswering', 'MarkupLMForQuestionAnswering', 'MBartForQuestionAnswering'

RuntimeError: ignored