In [1]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)
import os,sys
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'SemEval2021-Reading-Comprehension-of-Abstract-Meaning-master'
GOOGLE_DRIVE_PATH = os.path.join('drive','My Drive',GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
#print(os.listdir(GOOGLE_DRIVE_PATH))
sys.path.append(GOOGLE_DRIVE_PATH)

Mounted at /content/drive


In [2]:
%cd $GOOGLE_DRIVE_PATH

/content/drive/My Drive/SemEval2021-Reading-Comprehension-of-Abstract-Meaning-master


In [3]:
!pip install pytorch-lightning==1.2.3 transformers datasets

Collecting pytorch-lightning==1.2.3
  Downloading pytorch_lightning-1.2.3-py3-none-any.whl (821 kB)
[K     |████████████████████████████████| 821 kB 5.2 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 54.0 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 58.7 MB/s 
Collecting PyYAML!=5.4.*,>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 57.8 MB/s 
[?25hCollecting fsspec[http]>=0.8.1
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 59.9 MB/s 
Collecting future>=0.17.1
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 55.8 MB/s 
Collecting aiohttp
  Downloading aioh

# **SemvalDataModule**

This class load the data from semval Task1 and convert it to tokens. It defines the train_dataloader and val_dataloader function.

In [4]:
import pytorch_lightning as pl
import datasets
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from datasets import load_dataset
from transformers import AutoTokenizer
from typing import Optional, Dict
from functools import partial
from tqdm import tqdm


class SemvalDataModule(pl.LightningDataModule):
    def __init__(
            self,
            model_name_or_path: str = 'google/electra-large-discriminator',
            task_name: str = 'DUMA-electra',
            max_seq_length: int = 256,
            train_batch_size: int = 2,
            eval_batch_size: int = 2,
    ):
        super().__init__()
        self.model_name_or_path = model_name_or_path
        self.task_name = task_name
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.eval_batch_size = eval_batch_size

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)
        self.dataset = None

        # self.encoded_dataset = None

    def setup(self,stage: Optional[str] = None):
        preprocessor = partial(self.preprocess, self.tokenizer)
        if stage == 'fit':
          self.dataset = load_dataset('json', data_files={'train':'data/training_data/Task_1_train.jsonl','dev':'data/training_data/Task_2_dev.jsonl'})

          print('Encoding the training datset...')
          #print(preprocessor(self.dataset['train'][0]))
          self.dataset['train'] = self.dataset['train'].map(preprocessor)
          print('Encoding the validation datset...')
          self.dataset['dev'] = self.dataset['dev'].map(preprocessor)
          print(self.dataset)
          #print(self.dataset['dev'][0]['input_ids'])
          self.dataset['train'].set_format(type='torch',columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
          self.dataset['dev'].set_format(type='torch',columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
          print(self.dataset['dev'][0]['input_ids'])

    def train_dataloader(self):  
        return DataLoader(self.dataset['train'],
                          sampler=RandomSampler(self.dataset['train']),
                          batch_size=self.train_batch_size,
                          drop_last=True,
                          )
    def val_dataloader(self):
        return DataLoader(self.dataset['dev'],
                          sampler=RandomSampler(self.dataset['dev']),
                          batch_size=self.eval_batch_size,
                          drop_last=True,
                          )
    @staticmethod
    def preprocess(tokenizer, x: Dict)->Dict:
        
        choices_features = []
        option_names = ['option_0','option_1','option_2','option_3','option_4']
        
        question = x["question"]
        article = x["article"]

        for option in option_names:

            question_option = question.replace("@placeholder", x[option])

            inputs = tokenizer(
                article,
                question_option,
                add_special_tokens=True,
                max_length=256,
                truncation="only_first",
                padding='max_length',
                return_tensors='pt'
            )

            choices_features.append(inputs)


        label = torch.tensor([x["label"]])

        return {
            "label": label,
            "input_ids": torch.cat([cf["input_ids"] for cf in choices_features]).reshape(-1),
            "attention_mask": torch.cat([cf["attention_mask"] for cf in choices_features]).reshape(-1),
            "token_type_ids": torch.cat([cf["token_type_ids"] for cf in choices_features]).reshape(-1),
        }

    
    

# **DUMAForSemval**

In [5]:
import pytorch_lightning as pl
from transformers.modeling_outputs import MultipleChoiceModelOutput
from transformers import AutoConfig,AutoModel
from transformers import AdamW
import torch
import numpy as np
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.nn import MultiheadAttention

class DUMAForSemval(pl.LightningModule):
    def __init__(
            self,
            pretrained_model: str = 'google/electra-large-discriminator',
            learning_rate: float = 1e-4,
            gradient_accumulation_steps: int = 32,
            num_train_epochs: float = 4.0,
            train_batch_size: int = 2,
            train_all: bool = False,
    ):
        super().__init__()
        self.config = AutoConfig.from_pretrained(pretrained_model)
        self.electra = AutoModel.from_pretrained(pretrained_model,config=self.config)
        self.mamc = MultiheadAttention(self.config.hidden_size, self.config.num_attention_heads)
        self.dropouts = nn.ModuleList([
            nn.Dropout(0.5) for _ in range(5)
        ])
        self.classifier = nn.Linear(self.config.hidden_size, 1)

        if not train_all:
            for param in self.electra.parameters():
                param.requires_grad = False


        self.learning_rate = learning_rate
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.num_train_epochs = num_train_epochs
        self.train_batch_size = train_batch_size

    def forward(
            self,
            input_ids=None, #(batch_size,num_choices,sequence_length:256)
            attention_mask=None,
            token_type_ids=None,
            labels=None,
    ):

        input_ids = input_ids.reshape(self.train_batch_size,5,-1)
        attention_mask = attention_mask.reshape(self.train_batch_size,5,-1)
        token_type_ids = token_type_ids.reshape(self.train_batch_size,5,-1)

        #print(input_ids)
        #print(input_ids.shape)

        num_choices = input_ids.shape[1] 


        input_ids = input_ids.view(-1, input_ids.size(-1)) #(batch_size*num_choice,sequence_length:256)
        #print(input_ids)
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) 
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) 

        outputs = self.electra(
            input_ids = input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        last_output = outputs.last_hidden_state #(batch_size, sequence_length:256, hidden_size:256)
        # qa_seq_output, p_seq_output, qa_mask, p_mask = separate_seq2(last_output, input_ids)
        fused_output,weight = self.mamc(last_output, last_output, last_output)
        pooled_output = torch.mean(fused_output, dim=1)
        for i, dropout in enumerate(self.dropouts):
            if i == 0:
                logits = self.classifier(dropout(pooled_output))
            else:
                logits += self.classifier(dropout(pooled_output))
        logits = logits / len(self.dropouts)
        #reshaped_logits = F.softmax(logits.view(-1, num_choices), dim=1)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(reshaped_logits, labels)

        
        # output = (reshaped_logits,) + outputs[2:]
        # return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


    def training_step(self, batch, batch_idx):
        # input training batch, calling DUMA forward() function
        # return loss
        outputs = self(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            token_type_ids=batch['token_type_ids'],
            labels=batch['label'],
        )
        labels_hat = torch.argmax(outputs.logits, dim=1)
        correct_count = torch.sum(batch['label'] == labels_hat)
        loss = outputs.loss
        self.log('train_loss', loss)
        self.log('train_acc', correct_count.float() / len(batch['label']))
        #print('train_acc',correct_count.float() / len(batch['label']))

        return loss

    def validation_step(self, batch, batch_idx):
      # input validation batch, calling DUMA forward() function
      # return loss
        outputs = self(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            token_type_ids=batch['token_type_ids'],
            labels=batch['label'],
        )
        labels_hat = torch.argmax(outputs.logits, dim=1)
        correct_count = torch.sum(batch['label'] == labels_hat)
        loss = outputs.loss

        return {
            "val_loss": loss,
            "correct_count": correct_count,
            "batch_size": len(batch['label'])
        }
    def validation_epoch_end(self, outputs) -> None:
        val_acc = sum([out["correct_count"] for out in outputs]).float() / sum(out["batch_size"] for out in outputs)
        val_loss = sum([out["val_loss"] for out in outputs]) / len(outputs)
        self.log('val_acc', val_acc)
        self.log('val_loss', val_loss)
        print('val_loss', val_loss)
        print('val_acc', val_acc)
    
    def configure_optimizers(self):
        param_optimizer = list(self.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        return AdamW(optimizer_grouped_parameters, lr=self.learning_rate)

# **Trainer**

In [6]:

# For ELECTRA + DUMA
model_name = 'google/electra-large-discriminator'
model = DUMAForSemval(
        pretrained_model= model_name,
        learning_rate=1e-4,
        num_train_epochs=1.0,
        train_batch_size=2,
        train_all=False,
    )
data = SemvalDataModule(
        model_name_or_path= model_name,
        train_batch_size=2,
        eval_batch_size=2,
        max_seq_length=256,
    )
trainer = pl.Trainer(
        gpus= 1 ,
        #auto_scale_batch_size='power',
        #auto_lr_find=True,
        max_epochs=1,
        val_check_interval=0.2,
    )
trainer.fit(model, data)
trainer.save_checkpoint('MAMC_task2/')

Downloading:   0%|          | 0.00/668 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
Using custom data configuration default-2f654cf92693300a


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-2f654cf92693300a/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-2f654cf92693300a/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Encoding the training datset...


  0%|          | 0/3227 [00:00<?, ?ex/s]

Encoding the validation datset...


  0%|          | 0/851 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'question', 'option_0', 'option_1', 'option_2', 'option_3', 'option_4', 'label', 'input_ids', 'attention_mask', 'token_type_ids'],
        num_rows: 3227
    })
    dev: Dataset({
        features: ['article', 'question', 'option_0', 'option_1', 'option_2', 'option_3', 'option_4', 'label', 'input_ids', 'attention_mask', 'token_type_ids'],
        num_rows: 851
    })
})
tensor([  101, 11382, 23169,  ...,     0,     0,     0])



  | Name       | Type               | Params
--------------------------------------------------
0 | electra    | ElectraModel       | 334 M 
1 | mamc       | MultiheadAttention | 4.2 M 
2 | dropouts   | ModuleList         | 0     
3 | classifier | Linear             | 1.0 K 
--------------------------------------------------
4.2 M     Trainable params
334 M     Non-trainable params
338 M     Total params
1,353.167 Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

val_loss tensor(1.6094, device='cuda:0')
val_acc tensor(0.7500, device='cuda:0')




Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

val_loss tensor(1.6087, device='cuda:0')
val_acc tensor(0.8847, device='cuda:0')


Validating: 0it [00:00, ?it/s]

val_loss tensor(1.6054, device='cuda:0')
val_acc tensor(0.8906, device='cuda:0')


Validating: 0it [00:00, ?it/s]

val_loss tensor(1.5849, device='cuda:0')
val_acc tensor(0.8800, device='cuda:0')


Validating: 0it [00:00, ?it/s]

val_loss tensor(1.4191, device='cuda:0')
val_acc tensor(0.8906, device='cuda:0')


Validating: 0it [00:00, ?it/s]

val_loss tensor(1.0331, device='cuda:0')
val_acc tensor(0.8918, device='cuda:0')
