In [1]:
import dataclasses
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Dict, Optional

import numpy as np

In [2]:
from transformers import AutoConfig, AutoModelForSequenceClassification, BertForNextSentencePrediction, \
        AutoTokenizer, EvalPrediction, SuperGlueDataset, BertForNLI

In [3]:
from transformers import SuperGlueDataTrainingArguments as DataTrainingArguments
from transformers import HfArgumentParser, Trainer, TrainingArguments
from transformers import superglue_compute_metrics, superglue_output_modes, superglue_tasks_num_labels, set_seed

In [4]:
@dataclass
class ModelArguments:
    """
    Command-line options naming the pretrained checkpoint to fine-tune.

    Only ``model_name_or_path`` is mandatory. The config and tokenizer names are
    optional and default to ``None``; ``cache_dir`` is likewise optional.
    """

    # Required: declared without a default, so it must always be supplied.
    model_name_or_path: str = field(
        metadata=dict(help="Path to pretrained model or model identifier from huggingface.co/models"),
    )
    # Optional override for the configuration source.
    config_name: Optional[str] = field(
        metadata=dict(help="Pretrained config name or path if not the same as model_name"),
        default=None,
    )
    # Optional override for the tokenizer source.
    tokenizer_name: Optional[str] = field(
        metadata=dict(help="Pretrained tokenizer name or path if not the same as model_name"),
        default=None,
    )
    # Optional directory for downloaded pretrained files.
    cache_dir: Optional[str] = field(
        metadata=dict(help="Where do you want to store the pretrained models downloaded from s3"),
        default=None,
    )

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
logger = logging.getLogger(__name__)

# Experiment configuration: edit these constants to change the run.
MODEL_NAME = "bert-base-cased"
DATESTAMP = "20200805"
SUPER_GLUE_DIR = "/home/keyur/medhas/superglue_data/"
TASK_NAME = "BoolQ"
PER_DEVICE_BATCH_SIZE = 48
EXPERIMENT_DIR = "/mnt/data/medhas/glue_experiments/%s/%s" % (MODEL_NAME, DATESTAMP)

# Per-task output location; the logging directory is placed underneath it.
TASK_OUTPUT_DIR = os.path.join(EXPERIMENT_DIR, TASK_NAME)

# Argument list handed to the parser explicitly (via `args=`) instead of
# letting it read the process command line.
custom_sysargv = [
    f"--model_name_or_path={MODEL_NAME}",
    f"--task_name={TASK_NAME}",
    "--do_train",
    "--do_eval",
    f"--data_dir={os.path.join(SUPER_GLUE_DIR, TASK_NAME)}",
    "--max_seq_length=256",
    f"--per_device_train_batch_size={PER_DEVICE_BATCH_SIZE}",
    "--learning_rate=2e-5",
    "--num_train_epochs=10",
    f"--output_dir={TASK_OUTPUT_DIR}",
    f"--logging_dir={os.path.join(TASK_OUTPUT_DIR, 'logs')}",
    "--logging_steps=99",
    "--evaluate_during_training",
    # Spelled out in full: a truncated option name is only accepted through
    # argparse's prefix abbreviation, which breaks as soon as another option
    # shares the prefix.
    # NOTE(review): assumes the installed TrainingArguments names this field
    # `eval_steps` (as upstream Hugging Face does) — confirm for this build.
    "--eval_steps=99",
    "--save_total_limit=2",
    "--save_steps=1000",
    "--gradient_accumulation_steps=1",
    "--overwrite_output_dir",
]

model_args, data_args, training_args = parser.parse_args_into_dataclasses(args=custom_sysargv)

# Setup logging.
# Every process logs at WARNING regardless of its local rank, so the level is
# stated once rather than through a rank check whose branches are identical.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.WARNING,
)
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    training_args.local_rank,
    training_args.device,
    training_args.n_gpu,
    bool(training_args.local_rank != -1),
    training_args.fp16,
)
# NOTE: INFO is below the WARNING threshold requested above, so this record
# stays silent unless that level is lowered.
logger.info("Training/evaluation parameters %s", training_args)

# Seed the random number generators before any model or data is created.
set_seed(training_args.seed)

# Look up the label count and output mode registered for the selected task.
num_labels = superglue_tasks_num_labels[data_args.task_name]
output_mode = superglue_output_modes[data_args.task_name]
print("Task:", data_args.task_name, "Labels:", num_labels, ', Output', output_mode)




Task: boolq Labels: 2 , Output classification


In [5]:
# Build the model configuration. An explicitly given config name wins;
# otherwise the model identifier is used as the config source.
config = AutoConfig.from_pretrained(
    model_args.config_name or model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
    finetuning_task=data_args.task_name,
    num_labels=num_labels,
)

In [6]:
# Load the tokenizer. An explicitly given tokenizer name wins; otherwise the
# model identifier is used as the tokenizer source.
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name or model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
)

In [7]:
# BertForNLI is loaded here in place of the generic
# AutoModelForSequenceClassification loader (the previously disabled
# alternative, which additionally passed `from_tf` for ".ckpt" paths).
model = BertForNLI.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
    config=config,
)

- This IS expected if you are initializing BertForNLI from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForNLI from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Each split is only built when the matching --do_* flag was given; a split
# that is not requested stays None.
train_dataset = None
if training_args.do_train:
    train_dataset = SuperGlueDataset(data_args, tokenizer=tokenizer)

eval_dataset = None
if training_args.do_eval:
    eval_dataset = SuperGlueDataset(data_args, tokenizer=tokenizer, mode="dev")

test_dataset = None
if training_args.do_predict:
    test_dataset = SuperGlueDataset(data_args, tokenizer=tokenizer, mode="test")

In [9]:
def compute_metrics(p: EvalPrediction) -> Dict:
    """
    Convert raw model outputs into the metrics for the current task.

    Reads the module-level ``output_mode`` and ``data_args`` set up earlier in
    the notebook.

    Args:
        p: Evaluation output; ``p.predictions`` holds the model outputs and
            ``p.label_ids`` the reference labels.

    Returns:
        The result of ``superglue_compute_metrics`` for ``data_args.task_name``.

    Raises:
        ValueError: If ``output_mode`` is neither ``"classification"`` nor
            ``"regression"``.
    """
    if output_mode == "classification":
        # Take the index of the largest value along axis 1 as the prediction.
        preds = np.argmax(p.predictions, axis=1)
    elif output_mode == "regression":
        # Drop singleton dimensions from the predictions.
        preds = np.squeeze(p.predictions)
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError(f"Unsupported output mode: {output_mode!r}")
    return superglue_compute_metrics(data_args.task_name, preds, p.label_ids)

In [10]:
# Initialize our Trainer
# Wire the model, the datasets and the metric callback into the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

In [11]:
if training_args.do_train:
    # `model_path` is only supplied when the model identifier is a local
    # directory; for anything else it is None.
    local_model_dir = None
    if os.path.isdir(model_args.model_name_or_path):
        local_model_dir = model_args.model_name_or_path

    trainer.train(model_path=local_model_dir)
    trainer.save_model()

    # Store the tokenizer in the output directory as well, so the saved model
    # can be shared together with it. Only the world-master process writes.
    if trainer.is_world_master():
        tokenizer.save_pretrained(training_args.output_dir)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=99.0, style=ProgressStyle(description_wid…



{'loss': 0.6426598959498935, 'learning_rate': 1.8e-05, 'epoch': 1.0, 'step': 99}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=205.0, style=ProgressStyle(description_w…


{'eval_loss': 0.6082989236203635, 'eval_acc': 0.6647292750076476, 'epoch': 1.0, 'step': 99}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=99.0, style=ProgressStyle(description_wid…

{'loss': 0.5564364890257517, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.0, 'step': 198}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=205.0, style=ProgressStyle(description_w…


{'eval_loss': 0.5727023165400436, 'eval_acc': 0.7066381156316917, 'epoch': 2.0, 'step': 198}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=99.0, style=ProgressStyle(description_wid…

{'loss': 0.44741966778581793, 'learning_rate': 1.4e-05, 'epoch': 3.0, 'step': 297}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=205.0, style=ProgressStyle(description_w…


{'eval_loss': 0.5819953637152183, 'eval_acc': 0.7148975221780361, 'epoch': 3.0, 'step': 297}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=99.0, style=ProgressStyle(description_wid…

{'loss': 0.3333263055543707, 'learning_rate': 1.2e-05, 'epoch': 4.0, 'step': 396}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=205.0, style=ProgressStyle(description_w…


{'eval_loss': 0.6503210460267416, 'eval_acc': 0.7191801774242887, 'epoch': 4.0, 'step': 396}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=99.0, style=ProgressStyle(description_wid…

{'loss': 0.24021312820188928, 'learning_rate': 1e-05, 'epoch': 5.0, 'step': 495}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=205.0, style=ProgressStyle(description_w…


{'eval_loss': 0.7897936291810943, 'eval_acc': 0.7032731722239217, 'epoch': 5.0, 'step': 495}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=99.0, style=ProgressStyle(description_wid…

{'loss': 0.16839161131418112, 'learning_rate': 8.000000000000001e-06, 'epoch': 6.0, 'step': 594}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=205.0, style=ProgressStyle(description_w…


{'eval_loss': 0.8969934217450095, 'eval_acc': 0.7170388498011624, 'epoch': 6.0, 'step': 594}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=99.0, style=ProgressStyle(description_wid…

{'loss': 0.1270706830075895, 'learning_rate': 6e-06, 'epoch': 7.0, 'step': 693}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=205.0, style=ProgressStyle(description_w…


{'eval_loss': 1.0273771340527185, 'eval_acc': 0.7115325787702661, 'epoch': 7.0, 'step': 693}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=99.0, style=ProgressStyle(description_wid…

{'loss': 0.08479353809973808, 'learning_rate': 4.000000000000001e-06, 'epoch': 8.0, 'step': 792}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=205.0, style=ProgressStyle(description_w…


{'eval_loss': 1.164140074173125, 'eval_acc': 0.7136739063933925, 'epoch': 8.0, 'step': 792}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=99.0, style=ProgressStyle(description_wid…

{'loss': 0.06413723598939902, 'learning_rate': 2.0000000000000003e-06, 'epoch': 9.0, 'step': 891}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=205.0, style=ProgressStyle(description_w…


{'eval_loss': 1.249064887805683, 'eval_acc': 0.712144386662588, 'epoch': 9.0, 'step': 891}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=99.0, style=ProgressStyle(description_wid…

{'loss': 0.0499358701171598, 'learning_rate': 0.0, 'epoch': 10.0, 'step': 990}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=205.0, style=ProgressStyle(description_w…


{'eval_loss': 1.2894211150524093, 'eval_acc': 0.7103089629856225, 'epoch': 10.0, 'step': 990}




In [12]:
import torch

# Device smoke test: place a small tensor on the GPU when CUDA is available,
# and fall back to the CPU otherwise so the cell does not raise on a machine
# without a GPU.
probe_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cuda_probe = torch.Tensor([1, 3, 4]).to(probe_device)
cuda_probe

tensor([1., 3., 4.], device='cuda:0')

In [13]:
eval_results = {}
if training_args.do_eval:
    # A list keeps the loop shape usable for more than one evaluation set;
    # here it holds only the dev split built earlier.
    for current_eval_set in [eval_dataset]:
        metrics = trainer.evaluate(eval_dataset=current_eval_set)

        results_path = os.path.join(
            training_args.output_dir,
            f"eval_results_{current_eval_set.args.task_name}.txt",
        )
        # Only the world-master process writes the results file,
        # one "key = value" line per metric.
        if trainer.is_world_master():
            with open(results_path, "w") as writer:
                writer.writelines("%s = %s\n" % (key, value) for key, value in metrics.items())

        eval_results.update(metrics)

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=205.0, style=ProgressStyle(description_w…


{'eval_loss': 1.2894211150524093, 'eval_acc': 0.7103089629856225, 'epoch': 10.0, 'step': 990}


In [14]:
# Display the metrics collected by the evaluation cell.
eval_results

{'eval_loss': 1.2894211150524093,
 'eval_acc': 0.7103089629856225,
 'epoch': 10.0}

In [15]:
# Scratch tensor of shape (4, 2, 3) filled with random normal samples.
a = torch.randn(4, 2, 3)

In [16]:
# Show the sampled values.
a

tensor([[[ 0.7827, -0.6216, -0.0256],
         [-1.1225,  2.0533,  0.9891]],

        [[ 1.0652,  1.2909,  0.0879],
         [ 2.0127, -0.2890,  0.4066]],

        [[ 0.8808, -0.8394,  0.0947],
         [-0.0637, -0.0680, -1.1299]],

        [[-0.3843, -1.7386, -0.9365],
         [-2.5077, -0.4506, -1.2704]]])

In [17]:
# Averaging over dim=1 removes the middle axis: (4, 2, 3) -> (4, 3).
a.mean(dim=1).shape

torch.Size([4, 3])