In [1]:
import dataclasses
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Dict, Optional

import numpy as np

In [2]:
from transformers import AutoConfig, AutoModelForSequenceClassification, BertForNextSentencePrediction, \
        AutoTokenizer, EvalPrediction, SuperGlueDataset, BertForNLI

In [3]:
from transformers import SuperGlueDataTrainingArguments as DataTrainingArguments
from transformers import HfArgumentParser, Trainer, TrainingArguments
from transformers import superglue_compute_metrics, superglue_output_modes, superglue_tasks_num_labels, set_seed

In [4]:
@dataclass
class ModelArguments:
    """Options selecting the pretrained model, config and tokenizer to fine-tune.

    ``model_name_or_path`` is the only field without a default; every other
    field defaults to ``None``.
    """

    # Required: the only field declared without a default value.
    model_name_or_path: str = field(
        metadata={
            "help": "Path to pretrained model or model identifier from huggingface.co/models"
        },
    )

    # Optional override for where the configuration is loaded from.
    config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "Pretrained config name or path if not the same as model_name"
        },
    )

    # Optional override for where the tokenizer is loaded from.
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "Pretrained tokenizer name or path if not the same as model_name"
        },
    )

    # Optional cache location forwarded to the from_pretrained() calls.
    cache_dir: Optional[str] = field(
        default=None,
        metadata={
            "help": "Where do you want to store the pretrained models downloaded from s3"
        },
    )

# Module-level logger, plus one parser that populates the model, data and
# training argument dataclasses together.
logger = logging.getLogger(__name__)
parser = HfArgumentParser(
    (ModelArguments, DataTrainingArguments, TrainingArguments),
)

# Experiment configuration: the values a reader is most likely to tune.
MODEL_NAME = "bert-base-cased"
DATESTAMP = "20200805"
TASK_NAME = "RTE"
PER_DEVICE_BATCH_SIZE = 48

# Machine-specific locations. The defaults are the paths this notebook was
# written against; set the SUPER_GLUE_DIR / GLUE_EXPERIMENT_ROOT environment
# variables to run it elsewhere without editing the code.
SUPER_GLUE_DIR = os.environ.get("SUPER_GLUE_DIR", "/home/keyur/medhas/superglue_data/")
EXPERIMENT_ROOT = os.environ.get("GLUE_EXPERIMENT_ROOT", "/mnt/data/medhas/glue_experiments")

# Outputs are grouped per model and per run date: <root>/<model>/<datestamp>.
EXPERIMENT_DIR = os.path.join(EXPERIMENT_ROOT, MODEL_NAME, DATESTAMP)

# Arguments handed to the parser in place of sys.argv, built from the
# configuration constants defined earlier in this cell.
task_output_dir = os.path.join(EXPERIMENT_DIR, TASK_NAME)

custom_sysargv = [
    f"--model_name_or_path={MODEL_NAME}",
    f"--task_name={TASK_NAME}",
    "--do_train",
    "--do_eval",
    f"--data_dir={os.path.join(SUPER_GLUE_DIR, TASK_NAME)}",
    "--max_seq_length=256",
    f"--per_device_train_batch_size={PER_DEVICE_BATCH_SIZE}",
    # NOTE(review): 10e-5 is 1e-4. The training log recorded in this notebook
    # reports learning_rate 1.8e-05 at step 26 of 260, which does not look
    # like a 1e-4 peak -- confirm the intended value before re-running.
    "--learning_rate=10e-5",
    "--num_train_epochs=10",
    f"--output_dir={task_output_dir}",
    f"--logging_dir={task_output_dir}/logs",
    "--logging_steps=26",
    "--evaluate_during_training",
    # Spelled out in full: "--eval_step" was only accepted because argparse
    # expands unambiguous option prefixes.
    "--eval_steps=26",
    "--save_total_limit=2",
    "--save_steps=1000",
    "--gradient_accumulation_steps=1",
    "--overwrite_output_dir",
]

# One dataclass instance comes back per class registered on the parser.
model_args, data_args, training_args = parser.parse_args_into_dataclasses(args=custom_sysargv)

# Setup logging.
# Every process logs at WARNING: the previous level expression branched on
# local_rank but selected the same level on both sides, so it is stated
# directly here (using the documented WARNING name rather than the WARN alias).
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.WARNING,
)

# Emitted at WARNING so the run's device layout is always visible.
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    training_args.local_rank,
    training_args.device,
    training_args.n_gpu,
    training_args.local_rank != -1,  # local_rank of -1 means not distributed
    training_args.fp16,
)

# Below the configured WARNING threshold, so this stays silent unless the
# level above is lowered to INFO.
logger.info("Training/evaluation parameters %s", training_args)

# Seed the run from the parsed training arguments before any model is built.
set_seed(training_args.seed)

# Per-task metadata, looked up by the parsed task name.
num_labels = superglue_tasks_num_labels[data_args.task_name]
output_mode = superglue_output_modes[data_args.task_name]
print("Task:", data_args.task_name, "Labels:", num_labels, ', Output', output_mode)




Task: rte Labels: 2 , Output classification


In [5]:
# Load the model configuration from --config_name when it is set, otherwise
# from the model identifier itself.
config_source = model_args.config_name or model_args.model_name_or_path
config = AutoConfig.from_pretrained(
    config_source,
    num_labels=num_labels,
    finetuning_task=data_args.task_name,
    cache_dir=model_args.cache_dir,
)

In [6]:
# Load the tokenizer from --tokenizer_name when it is set, otherwise from the
# model identifier itself.
tokenizer_source = model_args.tokenizer_name or model_args.model_name_or_path
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_source,
    cache_dir=model_args.cache_dir,
)

In [7]:
# Instantiate the sequence-classification model from the chosen checkpoint.
# from_tf is switched on when the checkpoint name contains ".ckpt".
model_checkpoint = model_args.model_name_or_path
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    from_tf=".ckpt" in model_checkpoint,
    config=config,
    cache_dir=model_args.cache_dir,
)
# Alternative model class, currently disabled:
#model = BertForNLI.from_pretrained(model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir)

- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Build only the splits that the run flags ask for; the others stay None.
train_dataset = None
eval_dataset = None
test_dataset = None

if training_args.do_train:
    train_dataset = SuperGlueDataset(data_args, tokenizer=tokenizer)
if training_args.do_eval:
    eval_dataset = SuperGlueDataset(data_args, tokenizer=tokenizer, mode="dev")
if training_args.do_predict:
    test_dataset = SuperGlueDataset(data_args, tokenizer=tokenizer, mode="test")

In [9]:
def compute_metrics(p: EvalPrediction) -> Dict:
    """Convert raw model outputs into task metrics.

    Reads the notebook-level ``output_mode`` and ``data_args`` set up in an
    earlier cell.

    Args:
        p: Evaluation output; ``p.predictions`` and ``p.label_ids`` are read.

    Returns:
        Whatever ``superglue_compute_metrics`` returns for the current task.

    Raises:
        ValueError: If ``output_mode`` is neither ``"classification"`` nor
            ``"regression"``.
    """
    if output_mode == "classification":
        # Pick the highest-scoring entry along axis 1 for each prediction row.
        preds = np.argmax(p.predictions, axis=1)
    elif output_mode == "regression":
        # Drop singleton dimensions from the predictions array.
        preds = np.squeeze(p.predictions)
    else:
        # Fail with a clear message instead of hitting an unbound ``preds`` below.
        raise ValueError(f"Unsupported output mode: {output_mode!r}")
    return superglue_compute_metrics(data_args.task_name, preds, p.label_ids)

In [10]:
# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [11]:
if training_args.do_train:
    # model_path is the model identifier only when that identifier is an
    # existing local directory; otherwise None is passed.
    local_model_dir = model_args.model_name_or_path
    if not os.path.isdir(local_model_dir):
        local_model_dir = None

    trainer.train(model_path=local_model_dir)
    trainer.save_model()

    # Also store the tokenizer in the output directory (world master only),
    # so the saved model can be shared easily on huggingface.co/models.
    if trainer.is_world_master():
        tokenizer.save_pretrained(training_args.output_dir)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=26.0, style=ProgressStyle(description_wid…



{'loss': 0.7023925368602459, 'learning_rate': 1.8e-05, 'epoch': 1.0, 'step': 26}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=18.0, style=ProgressStyle(description_wi…


{'eval_loss': 0.672224779923757, 'eval_acc': 0.5869565217391305, 'epoch': 1.0, 'step': 26}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=26.0, style=ProgressStyle(description_wid…

{'loss': 0.638630190720925, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.0, 'step': 52}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=18.0, style=ProgressStyle(description_wi…


{'eval_loss': 0.6500389095809724, 'eval_acc': 0.6521739130434783, 'epoch': 2.0, 'step': 52}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=26.0, style=ProgressStyle(description_wid…

{'loss': 0.5455349649374301, 'learning_rate': 1.4e-05, 'epoch': 3.0, 'step': 78}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=18.0, style=ProgressStyle(description_wi…


{'eval_loss': 0.6657789233658049, 'eval_acc': 0.644927536231884, 'epoch': 3.0, 'step': 78}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=26.0, style=ProgressStyle(description_wid…

{'loss': 0.4305335512528053, 'learning_rate': 1.2e-05, 'epoch': 4.0, 'step': 104}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=18.0, style=ProgressStyle(description_wi…


{'eval_loss': 0.7382265072729852, 'eval_acc': 0.6304347826086957, 'epoch': 4.0, 'step': 104}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=26.0, style=ProgressStyle(description_wid…

{'loss': 0.31726909371522755, 'learning_rate': 1e-05, 'epoch': 5.0, 'step': 130}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=18.0, style=ProgressStyle(description_wi…


{'eval_loss': 0.7875059536761708, 'eval_acc': 0.6521739130434783, 'epoch': 5.0, 'step': 130}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=26.0, style=ProgressStyle(description_wid…

{'loss': 0.2440424498457175, 'learning_rate': 8.000000000000001e-06, 'epoch': 6.0, 'step': 156}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=18.0, style=ProgressStyle(description_wi…


{'eval_loss': 0.9307400600777732, 'eval_acc': 0.6304347826086957, 'epoch': 6.0, 'step': 156}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=26.0, style=ProgressStyle(description_wid…

{'loss': 0.18724276411991853, 'learning_rate': 6e-06, 'epoch': 7.0, 'step': 182}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=18.0, style=ProgressStyle(description_wi…


{'eval_loss': 0.8657586905691359, 'eval_acc': 0.644927536231884, 'epoch': 7.0, 'step': 182}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=26.0, style=ProgressStyle(description_wid…

{'loss': 0.1584947444498539, 'learning_rate': 4.000000000000001e-06, 'epoch': 8.0, 'step': 208}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=18.0, style=ProgressStyle(description_wi…


{'eval_loss': 0.9848082649211088, 'eval_acc': 0.6231884057971014, 'epoch': 8.0, 'step': 208}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=26.0, style=ProgressStyle(description_wid…

{'loss': 0.12609536229417875, 'learning_rate': 2.0000000000000003e-06, 'epoch': 9.0, 'step': 234}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=18.0, style=ProgressStyle(description_wi…


{'eval_loss': 0.9826193393932449, 'eval_acc': 0.6304347826086957, 'epoch': 9.0, 'step': 234}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=26.0, style=ProgressStyle(description_wid…

{'loss': 0.11493157795988597, 'learning_rate': 0.0, 'epoch': 10.0, 'step': 260}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=18.0, style=ProgressStyle(description_wi…


{'eval_loss': 1.033199080162578, 'eval_acc': 0.6340579710144928, 'epoch': 10.0, 'step': 260}




In [12]:
import torch

# Device sanity check: place a tiny float tensor on the GPU when one is
# available, and fall back to the CPU instead of raising on CPU-only machines.
sanity_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sanity_tensor = torch.tensor([1.0, 3.0, 4.0], device=sanity_device)
sanity_tensor

tensor([1., 3., 4.], device='cuda:0')

In [13]:
# Final evaluation: run the trainer on each dev set, write one results file
# per dataset into the output directory, and merge the metrics into a dict.
eval_results = {}
if training_args.do_eval:
    # A one-element list keeps the loop shape; only the single dev set is used.
    eval_datasets = [eval_dataset]

    # A distinct loop variable keeps the notebook-level `eval_dataset` binding
    # from being reassigned by the loop.
    for dataset in eval_datasets:
        eval_result = trainer.evaluate(eval_dataset=dataset)

        output_eval_file = os.path.join(
            training_args.output_dir, f"eval_results_{dataset.args.task_name}.txt"
        )
        # Only the world master writes the results file.
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                for key, value in eval_result.items():
                    writer.write("%s = %s\n" % (key, value))

        # Metrics with the same key from a later dataset overwrite earlier ones.
        eval_results.update(eval_result)

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=18.0, style=ProgressStyle(description_wi…


{'eval_loss': 1.033199080162578, 'eval_acc': 0.6340579710144928, 'epoch': 10.0, 'step': 260}


In [14]:
# Display the metrics dict assembled by the evaluation cell.
eval_results

{'eval_loss': 1.033199080162578, 'eval_acc': 0.6340579710144928, 'epoch': 10.0}

In [15]:
# Scratch tensor of shape (4, 2, 3), inspected and reduced in the following cells.
a = torch.randn(4, 2, 3)

In [16]:
# Show the scratch tensor's values.
a

tensor([[[-0.2522, -0.0436, -0.1934],
         [-0.1738,  0.4931, -0.2159]],

        [[ 1.0716, -0.3169,  0.6174],
         [-0.3430, -0.6397, -1.6969]],

        [[-0.8621, -0.9359, -1.0578],
         [-0.3730, -1.1221, -1.1382]],

        [[-0.9230,  0.4700, -0.6956],
         [-1.0813, -0.1037,  0.7374]]])

In [17]:
# Averaging over dim=1 removes that axis: shape (4, 2, 3) becomes (4, 3).
a.mean(dim=1).shape

torch.Size([4, 3])