In [1]:
'''
This python file processes sentences.
'''

import spacy
import jsonlines

def get_sent_data(raw_text, clean_text=True):
    """Given a passage, return sentences"""
    
    nlp = spacy.load("en_core_web_sm", disable=["ner", "tagger", "lemmatizer", "attribute_ruler"])
    sent_data = []
    raw_text = raw_text.replace('\n', '')
    raw_text_list = [str(sent_obj).strip() for sent_obj in nlp(raw_text).sents] 
    raw_text_list = unify_sentences([x for x in raw_text_list if len(x) > 1])
    sent_data = ' [CLS] '.join(raw_text_list)
    return sent_data

def unify_sentences(sents, length=10):
    """unifies short sentences."""
    i = 0
    while i < len(sents) - 1:
        if len(sents[i]) < length or len(sents[i+1]) < length:
            sents[i] = sents[i] + " " + sents[i+1]
            sents.pop(i+1)
        else:
            i += 1
    return sents

In [2]:
# import jsonlines
# from tqdm.auto import tqdm
# import json
# import os

# for mode in ["train", "dev"]:
#     jsonl_file = f'/workspace/quality/data/v1.0.1/QuALITY.v1.0.1.htmlstripped.{mode}'
#     with jsonlines.open(jsonl_file, 'r') as jsonl_f:
#         lst = [obj for obj in jsonl_f]
#         results = []
#         for file in tqdm(lst):
#             context = get_sent_data(file['article'])
#             for question in file['questions']:
#                 temp = {}
#                 temp['context'] = context
#                 temp['question'] = question['question']
#                 temp['options'] = question['options'] 
#                 temp['label'] = question['gold_label']
#                 results.append(temp)
                
#         # write to jsonlines file
#         with open(f'../data/{mode}.jsonl', 'w') as outfile:
#             for d in results:
#                 outfile.write(json.dumps(d) + "\n")

In [3]:
############### TRAINING CODE ################
from transformers import AutoTokenizer, AutoModel
import jsonlines 
from datasets import load_dataset

# Option 1. Divide into 512 tokens, and just do it.
# Option 2. Add Sentence Special Tokens like [CLS], and do something
# Option 3. Add Paragraph Special Token i.e. in front of every 512 tokens.

# Current Implementation: Option 1.
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build Dataset
dset = load_dataset('json', data_files="/workspace/new_quality/data/train.jsonl")
dset['valid'] = load_dataset('json', data_files="/workspace/new_quality/data/dev.jsonl")['train']

Using custom data configuration default-50929fa4e6cf828c
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-50929fa4e6cf828c/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-218a2199685916a9
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-218a2199685916a9/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
def convert_to_2d_list(elem, chunk_size, pad_id):
    """converts tokens into chunks of 512."""

    lst = elem["input_ids"]
    att_lst = elem["attention_mask"]

    num_lists = len(lst) // chunk_size
    remainder = len(lst) % chunk_size
    two_d_list = [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)][:num_lists]
    if remainder:
        last_list = lst[-remainder:] + [pad_id] * (chunk_size - remainder)
        two_d_list.append(last_list)

    # Need to check!
    two_d_att_list = [[1 for _ in range(chunk_size)] for i in range(num_lists)]
    if remainder:
        last_list = [1 for _ in range(chunk_size - remainder)] + [0 for _ in range(remainder)]
        two_d_att_list.append(last_list)

    return {"context_input_ids": two_d_list, "context_attention_mask": two_d_att_list}

In [5]:
import torch

def custom_tokenize(ex):
    # CONTEXT, Make it into chuncks of 512
    text = ex['context'].replace(" [CLS] ", " ")
    result = convert_to_2d_list(tokenizer(text), chunk_size=512, pad_id=tokenizer.pad_token_id)

    for k in result:
        ex[k] = result[k]

    # Question options.
    q = tokenizer(ex['question'], padding='max_length', max_length=128)
    o = tokenizer(ex['options'], padding='max_length', max_length=128)
    
    for k in q:
        ex["qo_" + k] = torch.cat([torch.Tensor([q[k]]), torch.Tensor(o[k])])

    ex['label'] = ex['label'] - 1
    return ex

# Tokenize text in each dataset.
for mode in dset:
    dset[mode] = dset[mode].map(lambda x: custom_tokenize(x))

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-50929fa4e6cf828c/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-44f01b7ca2f26374.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-218a2199685916a9/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-35dc6f316b9bf2df.arrow


In [6]:
# Divide into 512 tokens.
import torch.nn as nn
from CoCA import CrossAttention
import pdb

class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()

        # [N, 512]
        self.model = AutoModel.from_pretrained(model_name)

        # [5, M]
        self.post_model = AutoModel.from_pretrained(model_name)

        self.ca_q = CrossAttention(dim=768)
        self.ca_a = CrossAttention(dim=768)
        self.mhatt = nn.MultiheadAttention(embed_dim=768, num_heads=8)

        self.nn = nn.Sequential(
            nn.Linear(768, 768),
            nn.Tanh(),
            nn.Dropout(0.1),
            nn.Linear(768, 1)
        )

    def forward(self, context_input_ids, context_attention_mask, qo_input_ids, qo_attention_mask):
        """context: [N, 512], question_options: [5, M]"""
        context = self.model(input_ids=context_input_ids, attention_mask=context_attention_mask)['last_hidden_state']
        query = self.post_model(input_ids=qo_input_ids, attention_mask=qo_attention_mask)['last_hidden_state']

        # Question = [1, A, 768]
        # Options = [4, A, 768]
        question = query[0,].unsqueeze(0)
        options = query[1:,]

        # Cross attention, Output: [1, 512, 768]
        att_context = self.ca_q(context, question)
        att_context = att_context.mean(dim=0).unsqueeze(0)
                
        # Option: [4, 128, 768] # attention: [1, 512, 768]
        result = self.ca_a(att_context, options) # => [4, 512, 768]
        result = result.mean(dim=1) # Mean Pooling => [4, 768]
        result = self.nn(result).squeeze(-1) # [4, 768]

        return result

In [7]:
import random
import numpy as np
import torch

def simple_accuracy(preds, labels):
    return (preds == labels).mean()

def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

In [8]:
class Args():
    def __init__(self):
        self.local_rank = -1
        self.output_dir = "/workspace/new_quality/output"
        self.per_gpu_train_batch_size = 8
        self.n_gpu = 1
        self.max_steps = -1
        self.gradient_accumulation_steps = 8
        self.num_train_epochs = 10
        self.learning_rate= 3e-5
        self.adam_betas = "(0.9, 0.999)"
        self.adam_epsilon = 1e-8
        self.weight_decay = 0
        self.warmup_steps = 0
        self.warmup_proportion = 0

        self.fp16 = False
        self.fp16_opt_level = "O1"
        self.num_train_epochs= 10
        self.seed = 42

        self.gradient_accumulation_steps = 16
        self.no_clip_grad_norm = True
        self.max_grad_norm = 1.0

        self.logging_steps = 100
        self.save_steps = 1000

In [9]:
import argparse
import glob
import logging
from multiprocessing.spawn import import_main_path
import os
import random

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from transformers import (
    WEIGHTS_NAME,
    AdamW,
    BertConfig,
    BertForMultipleChoice,
    BertTokenizer,
    RobertaConfig,
    RobertaForMultipleChoice,
    RobertaTokenizer,
    DebertaV2Config,
    # DebertaV2ForMultipleChoice,
    DebertaV2Tokenizer,
    XLNetConfig,
    XLNetForMultipleChoice,
    XLNetTokenizer,
    get_linear_schedule_with_warmup,
)

try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter


import logging

logging.basicConfig(filename='./logs/train.log', level=logging.DEBUG)
logger = logging.getLogger(__name__)
device = torch.device("cuda:0")

def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    
    if args.local_rank in [-1, 0]:
        str_list = str(args.output_dir).split('/')
        tb_log_dir = os.path.join('summaries', str_list[-1])
        tb_writer = SummaryWriter(tb_log_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:  # XXX
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    exec('args.adam_betas = ' + args.adam_betas)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, betas=args.adam_betas, eps=args.adam_epsilon)
    assert not ((args.warmup_steps > 0) and (args.warmup_proportion > 0)), "--only can set one of --warmup_steps and --warm_ratio "
    if args.warmup_proportion > 0:
        args.warmup_steps = int(t_total * args.warmup_proportion)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    # if args.n_gpu > 1:
    #     model = torch.nn.DataParallel(model)

    # # Distributed training (should be after apex fp16 initialization)
    # if args.local_rank != -1:   # XXX
    #     model = torch.nn.parallel.DistributedDataParallel(
    #         model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
    #     )
    
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    # UNTIL HERE

    def evaluate_model(train_preds, train_label_ids, tb_writer, args, model, tokenizer, best_steps, best_dev_acc):
        train_acc = simple_accuracy(train_preds, train_label_ids)
        train_preds = None
        train_label_ids = None
        
        results = evaluate(args, model, tokenizer, dset["valid"])

        ### UNTIL HERE ###

        logger.info(
            "train acc: %s, dev acc: %s, loss: %s, global steps: %s",
            str(train_acc),
            str(results["eval_acc"]),
            str(results["eval_loss"]),
            str(global_step),
        )

        tb_writer.add_scalar("training/acc", train_acc, global_step)

        for key, value in results.items():
            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
        if results["eval_acc"] > best_dev_acc:
            best_dev_acc = results["eval_acc"]
            best_steps = global_step
            logger.info("achieve BEST dev acc: %s at global step: %s",
                        str(best_dev_acc),
                        str(best_steps)
            )

            output_dir = args.output_dir
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            
            torch.save(args, os.path.join(output_dir, "training_args.bin"))
            logger.info("Saving model checkpoint to %s", output_dir)
            txt_dir = os.path.join(output_dir, 'best_dev_results.txt')
            with open(txt_dir, 'w') as f:
                rs = 'global_steps: {}; dev_acc: {}'.format(global_step, best_dev_acc)
                f.write(rs)  
                tb_writer.add_text('best_results', rs, global_step)

        return train_preds, train_label_ids, train_acc, best_steps, best_dev_acc

    def save_model(args, model, tokenizer):
        output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_vocabulary(output_dir)
        tokenizer.save_pretrained(output_dir)
        torch.save(args, os.path.join(output_dir, "training_args.bin"))
        logger.info("Saving model checkpoint to %s", output_dir)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_dev_acc = 0.0
    best_steps = 0

    train_preds = []
    train_label_ids = []
    criterion = nn.CrossEntropyLoss()

    model.zero_grad()
    train_iterator = range(int(args.num_train_epochs))
    set_seed(args)  # Added here for reproductibility

    model.to(device)

    for _ in train_iterator:
        epoch_iterator = tqdm(dset["train"], desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()

            # by default, run evaluation ONLY in device 0.
            inputs = {}

            for k in ["context_input_ids", "context_attention_mask", "qo_input_ids", "qo_attention_mask"]:
                inputs[k]= torch.LongTensor(batch[k]).to(device)
            
            label = torch.LongTensor([batch['label']]).to(device)
            outputs = model(**inputs)
                    
            loss = criterion(outputs.unsqueeze(0), label)
            loss = loss / args.gradient_accumulation_steps
            
            train_preds = np.append(train_preds, outputs.argmax(dim=-1).detach().cpu().numpy())
            train_label_ids = np.append(train_label_ids, label.detach().cpu().numpy())
            
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                if not args.no_clip_grad_norm:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                if not args.no_clip_grad_norm:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            epoch_iterator.set_postfix(loss=loss.item())

            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    train_preds, train_label_ids, train_acc, best_steps, best_dev_acc = evaluate_model(train_preds, train_label_ids, tb_writer, args, model, tokenizer, best_steps, best_dev_acc)
                    tb_writer.add_scalar("training/lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("training/loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logger.info(
                        "Average loss: %s, average acc: %s at global step: %s",
                        str((tr_loss - logging_loss) / args.logging_steps),
                        str(train_acc),
                        str(global_step),
                    )
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    save_model(args, model, tokenizer)
            if args.max_steps > 0 and global_step > args.max_steps: # XXX
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps: # XXX
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        train_preds, train_label_ids, train_acc, best_steps, best_dev_acc = evaluate_model(train_preds, train_label_ids, tb_writer, args, model, tokenizer, best_steps, best_dev_acc)
        save_model(args, model, tokenizer)
        tb_writer.close()

    return global_step, tr_loss / global_step, best_steps
    

### EVALUATE ###
def evaluate(args, model, tokenizer, eval_dataset, prefix="", test=False,):

    eval_output_dir = args.output_dir
    criterion = nn.CrossEntropyLoss()

    results = {}
    os.makedirs(eval_output_dir, exist_ok=True)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))

    eval_loss = 0.0
    nb_eval_steps = 0

    preds = None
    out_label_ids = None

    for batch in tqdm(eval_dataset, desc="Evaluating"):
        model.eval()

        # by default, run evaluation ONLY in device 0.
        inputs = {}

        for k in ["context_input_ids", "context_attention_mask", "qo_input_ids", "qo_attention_mask"]:
            inputs[k]= torch.LongTensor(batch[k]).to(device)
        
        label = torch.LongTensor([batch['label']]).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
                
        loss = criterion(outputs.unsqueeze(0), label)
        eval_loss = loss.item()
        
        preds = np.append(preds, outputs.argmax(dim=-1).detach().cpu().numpy())
        out_label_ids = np.append(out_label_ids, label.detach().cpu().numpy())

        nb_eval_steps += 1

    # FROM HERE.
    eval_loss = eval_loss / nb_eval_steps
    acc = simple_accuracy(preds, out_label_ids)

    result = {"eval_acc": acc, "eval_loss": eval_loss}
    results.update(result)

    output_eval_file = os.path.join(eval_output_dir, "is_test_" + str(test).lower() + "_eval_results.txt")

    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(str(prefix) + " is test:" + str(test)))

        if not test:    
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
    if test:
        return results, preds
    else:
        return results


In [None]:
args = Args()
device = torch.device("cuda:0")
model = Network().to(device)
train(args, dset["train"], model, tokenizer)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaMod