# Multiple Choice Question Answering - CommonsenseQA

> Fine-tuning Pre-trained Language Models on Multiple Choice Question Answering for Commonsense Reasoning 

> Based on HuggingFace Transformers

> Chaehyeong Kim, CONVEI Lab 

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
#pip install transformers command

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m68.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting command
  Downloading Command-0.1.0.tar.gz (5.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: command
  Building wheel for command (setup.py) ..

In [None]:
import transformers
transformers.logging.set_verbosity_error()

### Load Library

In [None]:
import os
import json
import shutil
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, PreTrainedModel
from transformers import AdamW

from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

### Set Hyperparameters

In [None]:
DEVICE = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cpu')

In [None]:
#DATA_DIR = '/home/chaehyeong/nas2/TA-Materials/data/commonsenseqa'
#OUTPUT_DIR = '/home/chaehyeong/nas2/TA-Materials/checkpoint/commonsenseqa'
#MODEL_NAME = 'roberta-base'

In [None]:
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
N_EPOCH = 10
BATCH_SIZE = 16
LEARNING_RATE = 1e-6

### Load Tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

### Load Data

In [19]:
#with open(os.path.join(DATA_DIR, 'train.json'), 'r') as f:
#    json_train = json.load(f)
#json_train
with open(os.path.join('train.json'), 'r') as f:
    json_train = json.load(f)
json_train

[{'id': '075e483d21c29a511267ef62bedc0461',
  'context': None,
  'question': 'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?',
  'question_concept': 'punishing',
  'answer': 'A',
  'choices': [{'text': 'ignore', 'label': 'A'},
   {'text': 'enforce', 'label': 'B'},
   {'text': 'authoritarian', 'label': 'C'},
   {'text': 'yell at', 'label': 'D'},
   {'text': 'avoid', 'label': 'E'}]},
 {'id': '61fe6e879ff18686d7552425a36344c8',
  'context': None,
  'question': 'Sammy wanted to go to where the people were.  Where might he go?',
  'question_concept': 'people',
  'answer': 'B',
  'choices': [{'text': 'race track', 'label': 'A'},
   {'text': 'populated areas', 'label': 'B'},
   {'text': 'the desert', 'label': 'C'},
   {'text': 'apartment', 'label': 'D'},
   {'text': 'roadblock', 'label': 'E'}]},
 {'id': '4c1cb0e95b99f72d55c068ba0255c54d',
  'context': None,
  'question': 'To locate a choker not located in a jewelry bo

In [20]:
TARGET_NAMES = ['A', 'B', 'C', 'D', 'E']
LABEL_TO_LETTER = {i:v for i, v in enumerate(TARGET_NAMES)}
print(LABEL_TO_LETTER)
LETTER_TO_LABEL = {v:k for k, v in LABEL_TO_LETTER.items()}
print(LETTER_TO_LABEL)

{0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E'}
{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}


### Define Dataset

In [21]:
class CommonsenseqaDataset(Dataset):
    def __init__(self, tokenizer, data):
        self.tokenizer = tokenizer
        self.x = []; self.y = []
        for sample in data:
            input_answers = [c['text'] for c in sample['choices']]
            num_choices = len(input_answers)
            input_context_question = [sample['question']] * num_choices
            encoded_text = self.tokenizer(input_context_question, input_answers, padding=True, return_tensors='pt')
            self.x.append(encoded_text)
            input_label = LETTER_TO_LABEL[sample['answer']]
            self.y.append(torch.tensor(input_label))

    def __getitem__(self, idx):
        item = {
            'input_ids':self.x[idx]['input_ids'],
            'attention_mask':self.x[idx]['attention_mask'],
            'labels':self.y[idx]
        }
        return item
             
    def __len__(self):
        return len(self.y)

In [22]:
train_data, valid_data = train_test_split(json_train, train_size=0.7, random_state=42)
train_dataset = CommonsenseqaDataset(tokenizer, train_data)
valid_dataset = CommonsenseqaDataset(tokenizer, valid_data)

In [23]:
def prepare_batch(batch):
    """
    This collate function will pad the batch to be the same length. This requires
    flattening, then unflattening for the multiple choice format.
    One example will be a list of length 'num choices', each element being a list
    of (encoded) tokens representing qustion/answer [sep] choice
    """
    # batch: [batch_size, (text, label)]
    batch_size = len(batch)
    num_choices = len(batch[0]['input_ids'])
    
    # flatten
    labels_features = []
    input_ids_features = []
    attention_mask_features = []
    max_len = 0
    for b in batch:
        labels_features.append(b['labels'])
        for i in range(num_choices):
            input_ids_features.append(b['input_ids'][i])
            attention_mask_features.append(b['attention_mask'][i])
            if b['input_ids'][i].shape[0] > max_len:
                max_len = b['input_ids'][i].shape[0]
    # flattened_features list length num_choices*batch_size

    # padding
    padded_input_ids_features = []
    padded_attention_mask_features = []
    for input_ids, attention_mask in zip(input_ids_features, attention_mask_features):
        pad_len = max_len - input_ids.shape[0]
        if pad_len > 0:
            padded_input_ids = torch.cat([input_ids, torch.LongTensor([0] * pad_len)])
            padded_attention_mask = torch.cat([attention_mask, torch.LongTensor([0] * pad_len)])
            padded_input_ids_features.append(padded_input_ids)
            padded_attention_mask_features.append(padded_attention_mask)
        else:
            padded_input_ids_features.append(input_ids)
            padded_attention_mask_features.append(attention_mask)

    # un-flatten
    batch = {}
    batch['input_ids'] = torch.stack(padded_input_ids_features).view(batch_size, num_choices, -1)
    batch['attention_mask'] = torch.stack(padded_attention_mask_features).view(batch_size, num_choices, -1)
    batch['labels'] = torch.stack(labels_features).view(batch_size) 
    return batch

In [24]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=lambda batch: prepare_batch(batch))
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=lambda batch: prepare_batch(batch))

In [25]:
b = next(iter(train_loader))
print(b['input_ids'].size())
print(b['attention_mask'].size())
print(b['labels'].size())

torch.Size([16, 5, 31])
torch.Size([16, 5, 31])
torch.Size([16])


### Load Model

In [26]:
class Multiple_Choice_Model(nn.Module):
    def __init__(self, model: PreTrainedModel, dropout: float = None):
          super(Multiple_Choice_Model, self).__init__()
          self.model = model
          self.dropout = nn.Dropout(self.model.config.hidden_dropout_prob)
          self.classifier = nn.Linear(self.model.config.hidden_size, 1)
   
    def forward(self, input_ids: torch.tensor, attention_mask: torch.tensor, labels=None):
        num_choices = input_ids.shape[1]
          
        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) # size : [batch_size*num_choices, seq_len]
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) 

        outputs = self.model(
            input_ids = flat_input_ids, 
            attention_mask = flat_attention_mask,
            )

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(reshaped_logits, labels)

        return loss, reshaped_logits

In [27]:
pretrained_model = AutoModel.from_pretrained(MODEL_NAME)
model = Multiple_Choice_Model(pretrained_model)

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

In [28]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

### Train Model

In [29]:
class EarlyStopping(object):
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=10, verbose=True, delta=0, trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved. Default: 10
            verbose (bool): If True, prints a message for each validation loss improvement. Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement. Default: 0
            trace_func (function): trace print function. Default: print            
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.trace_func = trace_func
    def __call__(self, val_loss, model, path):
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model, path)
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model, path)
            self.counter = 0
    def save_checkpoint(self, val_loss, model, path):
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        # Save model checkpoint
        torch.save(model.state_dict(), path)
        self.val_loss_min = val_loss

In [30]:
class Trainer(object):
    """
    Trainer for training a multiple choice classification model.
    """
    def __init__(self, model, optimizer, device='cpu'):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.device = device

    def _print_summary(self):
        print(self.model)
        print(self.optimizer)

    def train(self, loader):
        """
        Run a single epoch of training
        """
        self.model.train() # Run model in training mode

        epoch_true_labels = []
        epoch_preds = []
        epoch_loss = 0
        for batch in tqdm(loader):
            # clear gradient
            self.optimizer.zero_grad()
            # input_ids shape: (batch_size, num_choices, sequence_length)
            input_ids = batch['input_ids'].to(self.device)
            # attention_mask shape: (batch_size, num_choices, sequence_length)
            attention_mask = batch['attention_mask'].to(self.device)
            # labels shape: (batch_size, )
            labels = batch['labels'].to(self.device)
            
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss, logits = outputs[0], outputs[1]
            preds = torch.argmax(nn.Softmax(dim=1)(logits), dim=1)
            
            epoch_true_labels.extend(labels.tolist())
            epoch_preds.extend(preds.tolist())
            epoch_loss += loss.item()
            
            # back propagation
            loss.backward()
            # do gradient descent
            self.optimizer.step()

            torch.cuda.empty_cache()
        return epoch_loss / len(loader), epoch_true_labels, epoch_preds

    def evaluate(self, loader):
        """
        Evaluate the model on a validation set.
        """
        self.model.eval() # Run model in eval mode (disables dropout layer)

        epoch_true_labels = []
        epoch_preds = []
        epoch_loss = 0
        with torch.no_grad(): # Disable gradient computation - required only during training
            for batch in tqdm(loader):
                # input_ids shape: (batch_size, num_choices, sequence_length)
                input_ids = batch['input_ids'].to(self.device)
                # attention_mask shape: (batch_size, num_choices, sequence_length)
                attention_mask = batch['attention_mask'].to(self.device)
                # labels shape: (batch_size, )
                labels = batch['labels'].to(self.device)

                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss, logits = outputs[0], outputs[1]
                preds = torch.argmax(nn.Softmax(dim=1)(logits), dim=1)
                
                epoch_true_labels.extend(labels.tolist())
                epoch_preds.extend(preds.tolist())
                epoch_loss += loss.item()

                torch.cuda.empty_cache()
        return epoch_loss/len(loader), epoch_true_labels, epoch_preds

    def get_model_dict(self):
        return self.model.state_dict()

    def run_training(self, train_loader, valid_loader, target_names):
        early_stopping = EarlyStopping(patience=5, verbose=True)

        for i in range(N_EPOCH):
            train_epoch_loss, train_labels, train_preds = self.train(train_loader)
            valid_epoch_loss, valid_labels, valid_preds = self.evaluate(valid_loader)
            
            print(f"Epoch {i}")
            print(f"Train loss: {train_epoch_loss}")
            print(f"Valid loss: {valid_epoch_loss}")
            print("Train eval")
            print(classification_report(train_labels, train_preds, target_names=target_names))
            print("Valid eval")
            print(classification_report(valid_labels, valid_preds, target_names=target_names))
            
            valid_f1 = f1_score(valid_labels, valid_preds, average='macro')
            model_name = 'bs{}-lr{}-epoch{}-f1{:.04f}.pt'.format(BATCH_SIZE, LEARNING_RATE, i+1, valid_f1)
            model_path = os.path.join(OUTPUT_DIR, model_name)

            early_stopping(valid_epoch_loss, self.model, model_path)
            if early_stopping.early_stop:
                print("Early stopping")              
                break

            torch.cuda.empty_cache()

In [None]:
trainer = Trainer(model, optimizer, DEVICE)
trainer.run_training(train_loader, valid_loader, TARGET_NAMES)

  0%|          | 0/427 [00:00<?, ?it/s]

### Evaluate Model

In [None]:
with open(os.path.join(DATA_DIR, 'dev.json'), 'r') as f:
    json_dev = json.load(f)
test_dataset = CommonsenseqaDataset(tokenizer, json_dev)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
# load trained model
best_path = '/home/chaehyeong/nas2/TA-Materials/checkpoint/commonsenseqa/bs16-lr1e-06-epoch10-f10.5477.pt'
restore_dict = torch.load(best_path, map_location=DEVICE)
model.load_state_dict(restore_dict)

<All keys matched successfully>

In [None]:
model.eval()
test_gths = []; test_preds = []
with torch.no_grad(): 
    for idx, batch in tqdm(enumerate(test_loader)):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs[0], outputs[1]
        preds = torch.argmax(nn.Softmax(dim=1)(logits), dim=1)
        test_gths.extend(labels.tolist())
        test_preds.extend(preds.tolist())
        torch.cuda.empty_cache()

1221it [00:22, 54.30it/s]


In [None]:
results = []
for i, sample in enumerate(json_dev):
    sample['prediction'] = LABEL_TO_LETTER[test_preds[i]]
    results.append(sample)
with open(os.path.join(OUTPUT_DIR, 'results.json'), 'w') as f:
    json.dump(results, f, indent=4)

In [None]:
metrics = {
    'accuracy':accuracy_score(test_gths, test_preds),
    'f1':f1_score(test_gths, test_preds, average='macro'),
    'precision':precision_score(test_gths, test_preds, average='macro'),
    'recall':recall_score(test_gths, test_preds, average='macro'),  
}
with open(os.path.join(OUTPUT_DIR, 'metrics.json'), 'w') as f:
    json.dump(metrics, f, indent=4)

: 