# Mount Google Drive

In [0]:
# Colab-only: mount the user's Google Drive under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')
# Make the project folder on Drive the working directory; the relative paths used by
# the cells below (./transformers, ./data_glue, ./result, ...) resolve against it.
%cd '/content/gdrive/My Drive/Colab Notebooks/QQP-PAWS'

# Install libraries

In [0]:
# Run the project's setup script. setup.bash is not shown in this notebook; presumably it
# fetches the ./transformers checkout and the PAWS data used below -- TODO confirm.
!bash setup.bash
# NOTE(review): unpinned install. The cells below call scripts from the local
# ./transformers checkout, so the installed package version should match it -- confirm and pin.
!pip install transformers

# QQP→QQP

In [0]:
# Fetch the QQP task with the GLUE download script from the local transformers checkout;
# the training cell below reads it from ./data_glue/QQP/.
!python ./transformers/utils/download_glue_data.py \
    --data_dir=data_glue/ \
    --tasks=QQP

In [0]:
!python ./transformers/examples/run_glue.py \
    --data_dir=./data_glue/QQP/ \
    --model_type=bert \
    --model_name_or_path="bert-base-uncased" \
    --do_lower_case \
    --task_name=qqp \
    --do_train \
    --do_eval \
    --output_dir=./result/qqp_qqp \
    --overwrite_output_dir \
    --num_train_epochs=3 \
    --per_gpu_train_batch_size=64 \
    --per_gpu_eval_batch_size=64 \
    --save_steps=5000 \

# PAWS paper (Zhang et al., 2019): https://arxiv.org/abs/1904.01130

# QQP→PAWSQQP

In [0]:
from transformers import DataProcessor, InputExample
import os
class PawsQqpProcessor(DataProcessor):
    """Processor for the Paws QQP data set.

    Reads ``train.tsv`` and ``dev_and_test.tsv`` from a data directory and turns
    every usable row into an ``InputExample`` labelled ``"0"`` or ``"1"``.
    """

    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        return InputExample(
            tensor_dict["idx"].numpy(),
            tensor_dict["question1"].numpy().decode("utf-8"),
            tensor_dict["question2"].numpy().decode("utf-8"),
            str(tensor_dict["label"].numpy()),
        )

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_and_test.tsv")), "dev")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets.

        Args:
            lines: Iterable of rows, each a sequence of column values. The first
                row is treated as a header and skipped. Columns 0-3 are used as
                id, first text, second text and label.
            set_type: Prefix for the generated guid (e.g. ``"train"`` or ``"dev"``).

        Returns:
            A list of ``InputExample``. Rows with fewer than four columns are
            skipped, including completely empty rows.
        """
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            # Check the column count before touching any column: the guid needs
            # line[0], so an empty row must be rejected here rather than crash.
            if len(line) < 4:
                continue
            guid = "%s-%s" % (set_type, line[0])
            text_a = line[1]
            text_b = line[2]
            label = line[3]
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples

In [0]:
from transformers import glue_convert_examples_to_features as convert_examples_to_features
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import logging
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    """Build a ``TensorDataset`` of PAWS QQP features, reusing an on-disk cache if present.

    Relies on ``os``, ``torch``, ``output_modes`` and ``PawsQqpProcessor`` being defined
    in the notebook namespace by other cells before this function is called.

    Args:
        args: Namespace; the attributes read here are ``local_rank``, ``data_dir``,
            ``model_name_or_path``, ``max_seq_length``, ``overwrite_cache`` and
            ``model_type``.
        task: Task name; used as the key into ``output_modes`` and as part of the
            cache file name.
        tokenizer: Tokenizer handed to ``convert_examples_to_features``; its
            ``pad_token`` is used for padding.
        evaluate: When true, load the dev examples; otherwise the train examples.

    Returns:
        ``TensorDataset(input_ids, attention_mask, token_type_ids, labels)``.
    """
    # Bind the logger before its first use. Assigning `logger` anywhere in this
    # function makes it a local name, so logging before the assignment can never work.
    logger = logging.getLogger(__name__)
    logger.info("Load PAWS QQP data")
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    processor = PawsQqpProcessor()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
            str(task),
        ),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        # NOTE(review): torch.load unpickles the file; only point data_dir at caches you created.
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta", "xlmroberta"]:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            # NOTE(review): PawsQqpProcessor.get_labels() returns only two labels, so this
            # swap would raise IndexError if the branch were ever taken.
            label_list[1], label_list[2] = label_list[2], label_list[1]
        examples = (
            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        )
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

In [0]:
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer
from transformers import glue_output_modes as output_modes
import argparse
import torch
import sys
sys.path.append("./transformers/examples")
from run_glue import evaluate

# Evaluate the QQP-fine-tuned BERT checkpoint on the PAWS QQP dev data.
config_class = BertConfig
model_class = BertForSequenceClassification
tokenizer_class = BertTokenizer

# Weights and tokenizer both come from the output directory of the QQP→QQP run.
model = model_class.from_pretrained("./result/qqp_qqp/")
tokenizer = tokenizer_class.from_pretrained("./result/qqp_qqp/", do_lower_case=True)

# Hand-built stand-in for the command-line arguments the run_glue helpers read.
args = argparse.Namespace(
    task_name="qqp",
    model_type="bert",
    model_name_or_path="bert-base-uncased",
    data_dir="./data_paws/paws_qqp/output",
    output_dir="./result/qqp_pawsqqp",
    max_seq_length=128,
    per_gpu_eval_batch_size=64,
    overwrite_cache=False,
    local_rank=-1,
    n_gpu=1,
    output_mode=output_modes["qqp"],
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

model.to(args.device).eval()

with torch.no_grad():
    # The returned dataset is intentionally discarded: the call is made for its side
    # effect of writing the cached dev features under args.data_dir.
    # NOTE(review): presumably run_glue.evaluate then picks up that cache file instead of
    # reading GLUE-format QQP files -- confirm against run_glue.py.
    load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True)
    result = evaluate(args, model, tokenizer)
print("\n", result)

# QQP+PAWSQQP→PAWSQQP

In [0]:
from run_glue import train

# Continue training the QQP model on the PAWS QQP train split, save it, then re-evaluate.
# Reuses `args`, `model` and `tokenizer` from the evaluation cell above.

# add default parameters
# (values that run_glue.train reads from args but the evaluation-only namespace lacked)
args.max_steps = -1
args.gradient_accumulation_steps = 1
args.learning_rate = 5e-05
args.adam_epsilon = 1e-08
args.warmup_steps = 0
args.weight_decay = 0.0
args.max_grad_norm = 1.0
args.logging_steps = 500
args.fp16 = False
args.seed = 42
args.evaluate_during_training = False

# Fine-tuning
args.output_dir = "./result/pawsqqp_pawsqqp"
args.num_train_epochs = 3
args.per_gpu_train_batch_size = 64
args.save_steps = 5000
model.train()
train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
print("\n", "global_step = %s, average loss = %s"%(global_step, tr_loss))

# Nothing above is guaranteed to have created the new output directory (train() is only
# expected to create it when a checkpoint is written), so create it before saving into it.
os.makedirs(args.output_dir, exist_ok=True)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = (
    model.module if hasattr(model, "module") else model
)  # Take care of distributed/parallel training
model_to_save.save_pretrained(args.output_dir)
tokenizer.save_pretrained(args.output_dir)
# Good practice: save your training arguments together with the trained model
torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

# Evaluate the fine-tuned model; the evaluation results go to the new args.output_dir.
model.eval()
with torch.no_grad():
    result = evaluate(args, model, tokenizer)
    print("\n", result)