In [1]:
# Command-line flags this notebook is meant to reproduce (values in [] are the allowed choices):
# --task_name [esnli, cos_e] 
# --do_train 
# --num_train_epochs 200 
# --per_device_train_batch_size 64 
# --per_device_eval_batch_size 64 
# --logging_first_step True 
# --logging_steps 1 
# --save_steps 1 
# --save_total_limit 11 
# --seed 42 
# --early_stopping_threshold 10 
# --version_name [for cos_e, specify v1.0 or v1.11]

In [2]:
!pip install nlp

You should consider upgrading via the '/home/huangyongfeng/miniconda3/envs/py3.7pytorch1.8new/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
"""
Fine-tunes a model to jointly generate labels + rationales given input.
Partially based on https://github.com/huggingface/transformers/tree/7cb203fae4e7964e9e99400b375d660ebce765ee/examples/language-modeling/run_language_modeling.py (Huggingface Transformers v2.9.1)
See Huggingface repository for licensing agreement.

Code formatted using https://github.com/psf/black
"""

import logging
import math
import os
# Expose only GPU index 5 to this process. Assigned before the transformers/torch
# imports below. NOTE(review): the index is machine-specific — confirm before
# running on another host.
os.environ["CUDA_VISIBLE_DEVICES"] = "5"
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    HfArgumentParser,
    TrainingArguments,
    set_seed,
)
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

# Project-local modules (expected to sit next to this notebook).
from feature_conversion_methods import input_to_explanation_plus_label
from modeling_t5 import T5ForConditionalGeneration as NoisyT5ForConditionalGeneration
from trainer import Trainer
from custom_args import (
    DataTrainingArguments,
    ModelArguments,
    compute_metrics,
    compare_models_with_noise,
)
import torch
import nlp
import git
import time
from datetime import datetime
import sys
import json
import random

# Seed Python's built-in `random` module with a fixed value.
random.seed(10)

# Logger for this notebook/module; handlers are attached later via logging.basicConfig.
logger = logging.getLogger(__name__)


class SequenceCollator:
    """Collates a list of feature dicts into a dict of ``torch.long`` tensors.

    Only the keys listed in ``self.columns`` are batched; every other key found
    on the examples is dropped. List-valued features are right-padded to the
    longest sequence in the batch using the per-key value in
    ``self.pad_token_mapping``.
    """

    def __init__(self, pad_token):
        """
        Args:
            pad_token: id used to right-pad ``input_ids``.
        """
        # Per-column padding value.
        # NOTE(review): -100 for "lm_labels" is presumably the index ignored by
        # the loss — confirm against the model being trained.
        self.pad_token_mapping = {
            "lm_labels": -100,
            "attention_mask": 0,
            "decoder_attention_mask": 0,
            "input_ids": pad_token,
        }
        # Columns that are kept and turned into tensors.
        self.columns = [
            "input_ids",
            "attention_mask",
            "lm_labels",
            "decoder_attention_mask",
        ]

    def __call__(self, examples):
        """Alias for :meth:`collate_batch` so the instance can be used as a callable."""
        return self.collate_batch(examples)

    def collate_batch(self, examples):
        """Build one padded batch.

        Args:
            examples: sequence of dicts mapping feature name to a value
                (typically a list of ints).

        Returns:
            dict mapping each kept column name to a ``torch.long`` tensor.
            An empty ``examples`` sequence yields an empty dict.
        """
        # An empty batch has no first element to read the keys from.
        if len(examples) == 0:
            return {}

        batch = {}
        # The first example decides which keys are present in the batch.
        for key in examples[0].keys():
            if key not in self.columns:
                continue

            values = [item[key] for item in examples]

            # pad lists to max length (new lists are built; inputs are not mutated)
            if isinstance(values[0], list):
                max_length = max(map(len, values))
                pad_value = self.pad_token_mapping[key]
                values = [el + [pad_value] * (max_length - len(el)) for el in values]

            batch[key] = torch.tensor(values, dtype=torch.long)
        return batch




In [4]:
# !pip list |grep tran
import transformers
from  transformers.models.t5 import configuration_t5

In [5]:
# Notebook stand-in for the script's main(): the flags are handed to the parser
# as an explicit list (as a script this would be the no-argument call
# parser.parse_args_into_dataclasses()).
# All available options are listed in src/transformers/training_args.py,
# or printed by the --help flag when run as a script.

og_start_time = time.time()

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

# One flag/value pair per line.
model_args, data_args, training_args = parser.parse_args_into_dataclasses(
    [
        "--task_name", "cos_e",
        "--do_train", "True",
        "--num_train_epochs", "200",
        "--per_gpu_train_batch_size", "64",
        "--per_gpu_eval_batch_size", "64",
        "--logging_first_step", "True",
        "--logging_steps", "1",
        "--save_steps", "1",
        "--save_total_limit", "11",
        "--seed", "42",
        "--early_stopping_threshold", "10",
        "--version_name", "v1.11",
        "--output_dir", "./output_dir",
    ]
)

In [6]:
# An evaluation-only run needs something to evaluate: either a trained model
# directory or a file of previously generated outputs.
if not training_args.do_train:
    if (not model_args.pretrained_model_file) and (
        not data_args.generations_filepath
    ):
        raise Exception(
            "if not training a model from scratch, must specify a trained model to load for evaluation or generations in a file to evaluate"
        )

# make sure only one dataset split pick if manually specifying evaluation file
# NOTE(review): the split is chosen by substring match on the path, tested in the
# order train -> test -> validation; a path matching none of them leaves the
# *_predict flags untouched.
if data_args.generations_filepath is not None:
    training_args.do_train = False
    training_args.do_eval = False
    if "train" in data_args.generations_filepath:
        data_args.train_predict = True
        data_args.test_predict = False
        data_args.dev_predict = False
    elif "test" in data_args.generations_filepath:
        data_args.train_predict = False
        data_args.test_predict = True
        data_args.dev_predict = False
    elif "validation" in data_args.generations_filepath:
        data_args.train_predict = False
        data_args.test_predict = False
        data_args.dev_predict = True

# create a new directory if fine-tuning an existing checkpoint or training/evaluating a HF pretrained model
# do not do this when re-evaluating a pretrained_model_file
if training_args.do_train or (
    not model_args.pretrained_model_file and not data_args.generations_filepath
):
    # create a save directory and a logfile
    save_path = training_args.output_dir
    run_dir = os.path.join(save_path, datetime.now().strftime("%m%d%y_%H%M%S"))

    # Validate before touching training_args so a failed check leaves it unchanged.
    assert os.path.exists(save_path), (
        f"Base output directory ({save_path}) does not exist"
    )
    assert not os.path.exists(run_dir), (
        f"Run directory ({run_dir}) already exists"
    )
    os.makedirs(run_dir)

    # The run directory was created just above and is therefore empty, so no
    # "output directory is not empty" check is needed here.
    training_args.output_dir = run_dir
    training_args.logging_dir = run_dir

    handlers = [
        logging.FileHandler(os.path.join(training_args.output_dir, "logger.log")),
        logging.StreamHandler(),
    ]
else:
    # don't overwrite existing logfile or create new directory
    training_args.output_dir = model_args.pretrained_model_file
    handlers = [logging.StreamHandler()]

if data_args.encoder_noise_variance is not None:
    # must be in evaluation mode
    assert not training_args.do_train, "encoder noise requires evaluation mode"
    assert (
        model_args.pretrained_model_file is not None
    ), "encoder noise requires a pretrained_model_file"
    assert (
        data_args.test_predict or data_args.dev_predict
    ), "encoder noise requires test_predict or dev_predict"
    assert (
        40 > data_args.encoder_noise_variance > 0
    ), "encoder_noise_variance must be strictly between 0 and 40"

In [7]:
# Setup logging
# Only the main process (local_rank -1 or 0) logs at INFO level; others log at WARN.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    handlers=handlers,
)
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    training_args.local_rank,
    training_args.device,
    training_args.n_gpu,
    bool(training_args.local_rank != -1),
    training_args.fp16,
)
logger.info("Save path: %s", training_args.output_dir)

# get git hash and branch where deployed
repo = git.Repo(search_parent_directories=True)
git_hash = repo.head.object.hexsha
try:
    git_branch = repo.active_branch.name
except TypeError:
    # GitPython raises TypeError when HEAD is detached (no active branch).
    git_branch = "(detached HEAD)"
logger.info("Git branch: %s", git_branch)
logger.info("Git hash: %s", git_hash)

assert data_args.task_name in {"cos_e", "esnli"}

# set gradient accumulation steps to always use batch size == 64
# The deprecated --per_gpu_train_batch_size wins when given; otherwise fall back to
# --per_device_train_batch_size so that either flag can be used.
train_batch_size_per_device = training_args.per_gpu_train_batch_size or getattr(
    training_args, "per_device_train_batch_size", None
)
if 64 % train_batch_size_per_device != 0:
    raise Exception(
        "Batch size is not a divisor of 64, resulting in inconsistent gradient-accumulation behavior"
    )
training_args.gradient_accumulation_steps = 64 // train_batch_size_per_device



08/22/2022 21:07:27 - INFO - __main__ -   Save path: ./output_dir/082222_210727
08/22/2022 21:07:27 - INFO - __main__ -   Git branch: dev
08/22/2022 21:07:27 - INFO - __main__ -   Git hash: 640aa0d57986f7e8295dabb0e8ab189542f32d53


In [8]:
if training_args.do_train:
    # write command and args to file
    # NOTE(review): inside a notebook sys.argv holds the kernel's launch arguments,
    # not the flag list handed to the parser — confirm this is what should be recorded.
    with open(
        os.path.join(training_args.output_dir, "commandline_args.txt"), "w"
    ) as f:
        f.write("Git branch: " + git_branch + "\n")
        f.write("Git hash: " + git_hash + "\n")
        f.write("Command:\n")
        f.write("\n".join(sys.argv[1:]))
        # Each section header starts on its own line.
        f.write("\nTraining args:\n")
        # Work on a shallow copy: popping from training_args.__dict__ directly
        # would strip these fields from the live training_args object.
        tmp = dict(training_args.__dict__)
        # Drop the entries the original code excluded before JSON serialization.
        for unserializable_key in (
            "__cached__setup_devices",
            "evaluation_strategy",
            "lr_scheduler_type",
            "logging_strategy",
            "save_strategy",
        ):
            tmp.pop(unserializable_key, None)
        json.dump(tmp, f, indent=2)
        f.write("\nData args:\n")
        json.dump(data_args.__dict__, f, indent=2)
        f.write("\nModel args:\n")
        json.dump(model_args.__dict__, f, indent=2)
        f.write("\n")



In [9]:
# Set seed
set_seed(training_args.seed)

# Load pretrained model and tokenizer
logger.info("Loading pretrained tokenizer...")
if model_args.pretrained_model_file:
    # load pretrained tokenizer from directory
    tokenizer = T5Tokenizer.from_pretrained(model_args.pretrained_model_file)
else:
    # load pretrained tokenizer from Huggingface
    tokenizer = T5Tokenizer.from_pretrained("t5-base")

# found better/more controllable generation using own EOS token
tokenizer.add_special_tokens({"eos_token": "[EOS]"})
# The custom EOS token is expected to be the last id in the vocabulary (32100).
assert (
    len(tokenizer) - 1
    == tokenizer.eos_token_id
    == tokenizer.encode(["[EOS]"])[0]
    == 32100
)

# A model is only needed when generations are produced here rather than read from a file.
if data_args.generations_filepath is None:
    if model_args.pretrained_model_file:
        # load pretrained model from directory at best checkpoint
        ckpts = [
            name
            for name in os.listdir(model_args.pretrained_model_file)
            if PREFIX_CHECKPOINT_DIR in name
        ]
        if len(ckpts) != 1:
            # Covers both "none found" and "several found".
            raise Exception(
                f"expected exactly 1 checkpoint directory in {model_args.pretrained_model_file}, found {len(ckpts)}. revisit save directory"
            )
        # Fail before the (expensive) model load rather than after it.
        if model_args.dropout_rate:
            raise Exception(
                "can't update/specify dropout currently when load pretrained model from directory"
            )
        model_load_path = os.path.join(model_args.pretrained_model_file, ckpts[0])
        if data_args.encoder_noise_variance is not None:
            # noise evaluation requested: use the project's T5 variant from modeling_t5
            model = NoisyT5ForConditionalGeneration.from_pretrained(model_load_path)
        else:
            model = T5ForConditionalGeneration.from_pretrained(model_load_path)

    else:
        # load pretrained model from HuggingFace
        logger.info("Loading pretrained model")
        # Only override the dropout rate when one was explicitly requested.
        pretrained_kwargs = {}
        if model_args.dropout_rate:
            pretrained_kwargs["dropout_rate"] = model_args.dropout_rate
        model = T5ForConditionalGeneration.from_pretrained(
            "t5-base", **pretrained_kwargs
        )

    # Make room in the embedding matrix for the added [EOS] token.
    model.resize_token_embeddings(len(tokenizer))
else:
    model = None



08/22/2022 21:07:27 - INFO - __main__ -   Loading pretrained tokenizer...
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
08/22/2022 21:07:34 - INFO - __main__ -   Loading pretrained model


In [10]:
# CoS-E needs an explicit version passed to the loader; any other task passes None.
version_arg = None
if data_args.task_name == "cos_e":
    assert data_args.version_name in {"v1.11", "v1.0"}
    version_arg = data_args.version_name

# Fetch every split of the requested task.
dataset = nlp.load_dataset(data_args.task_name, version_arg)


def _to_features(x):
    """Convert one raw example with the project's feature-conversion method."""
    return input_to_explanation_plus_label(
        x,
        tokenizer,
        datasource=data_args.task_name,
        expl_only=model_args.rationale_only,
        label_only=model_args.label_only,
    )


# Convert each split one example at a time. Batching and cache reuse stay
# disabled: enabling them caused replicability issues.
for split in dataset.keys():
    dataset[split] = dataset[split].map(
        _to_features,
        batched=False,
        load_from_cache_file=False,
    )

train_dataset = dataset["train"]
eval_dataset = dataset["validation"]
# Only e-SNLI's test split is used; for CoS-E test_dataset is None.
if data_args.task_name == "esnli":
    test_dataset = dataset["test"]
else:
    test_dataset = None

# Sanity-check the split sizes against the expected counts.
if data_args.task_name == "esnli":
    assert len(train_dataset) == 549367
    assert len(eval_dataset) == 9842
    assert len(test_dataset) == 9824
elif data_args.task_name == "cos_e":
    if data_args.version_name == "v1.11":
        assert len(train_dataset) == 9741
        assert len(eval_dataset) == 1221
    elif data_args.version_name == "v1.0":
        assert len(train_dataset) == 7610
        assert len(eval_dataset) == 950
    assert test_dataset is None



08/22/2022 21:07:43 - INFO - nlp.load -   Checking /home/huangyongfeng/.cache/huggingface/datasets/6f5af2f9ee4124cd3ebff0ac93fa2e94ad4bd20a828fb796875668a2c86cd09d.c298e1f9b6e456b221e78032db5488e2e3693908eecf4bb3babe8632f5a15c58.py for additional imports.
08/22/2022 21:07:43 - INFO - filelock -   Lock 139747575242704 acquired on /home/huangyongfeng/.cache/huggingface/datasets/6f5af2f9ee4124cd3ebff0ac93fa2e94ad4bd20a828fb796875668a2c86cd09d.c298e1f9b6e456b221e78032db5488e2e3693908eecf4bb3babe8632f5a15c58.py.lock
08/22/2022 21:07:43 - INFO - nlp.load -   Found main folder for dataset https://s3.amazonaws.com/datasets.huggingface.co/nlp/datasets/cos_e/cos_e.py at /home/huangyongfeng/miniconda3/envs/py3.7pytorch1.8new/lib/python3.7/site-packages/nlp/datasets/cos_e
08/22/2022 21:07:43 - INFO - nlp.load -   Found specific version folder for dataset https://s3.amazonaws.com/datasets.huggingface.co/nlp/datasets/cos_e/cos_e.py at /home/huangyongfeng/miniconda3/envs/py3.7pytorch1.8new/lib/python

  0%|          | 0/9741 [00:00<?, ?it/s]

08/22/2022 21:08:03 - INFO - nlp.arrow_writer -   Done writing 9741 examples in 15485109 bytes /home/huangyongfeng/.cache/huggingface/datasets/cos_e/v1.11/1.11.0/b7bc6748714e9af308ab02e900cb2b020b953b3cc865f7901164bb963a7e694b/tmpj0v23g94.
08/22/2022 21:08:03 - INFO - nlp.arrow_dataset -   Caching processed dataset at /home/huangyongfeng/.cache/huggingface/datasets/cos_e/v1.11/1.11.0/b7bc6748714e9af308ab02e900cb2b020b953b3cc865f7901164bb963a7e694b/cache-34a579c7f4f154a1967186fa19e1b06f.arrow


  0%|          | 0/1221 [00:00<?, ?it/s]

08/22/2022 21:08:05 - INFO - nlp.arrow_writer -   Done writing 1221 examples in 1900029 bytes /home/huangyongfeng/.cache/huggingface/datasets/cos_e/v1.11/1.11.0/b7bc6748714e9af308ab02e900cb2b020b953b3cc865f7901164bb963a7e694b/tmp4fdagrf6.


In [None]:
# Report the split sizes before anything else happens.
logger.info("****LOG****")
for split_ds in (train_dataset, eval_dataset):
    logger.info(len(split_ds))
if data_args.task_name == "esnli":
    logger.info(len(test_dataset))

# A trainer is only built when no generations file was supplied.
if data_args.generations_filepath is None:
    collator = SequenceCollator(pad_token=tokenizer.pad_token_id)
    trainer = Trainer(
        model=model,
        args=training_args,
        data_args=data_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
        data_collator=collator,
    )

# Training
if training_args.do_train:
    start_time = time.time()

    trainer.train()
    trainer.save_model()
    # The tokenizer is saved next to the model so the output directory is self-contained.
    tokenizer.save_pretrained(training_args.output_dir)

    # Elapsed time covers training plus both save calls above.
    train_time = time.time() - start_time
    model = trainer.model



08/22/2022 21:08:05 - INFO - __main__ -   ****LOG****
08/22/2022 21:08:05 - INFO - __main__ -   9741
08/22/2022 21:08:05 - INFO - __main__ -   1221
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
08/22/2022 21:08:13 - INFO - trainer -   ***** Running training *****
08/22/2022 21:08:13 - INFO - trainer -     Num examples = 9741
08/22/2022 21:08:13 - INFO - trainer -     Num Epochs = 200
08/22/2022 21:08:13 - INFO - trainer -     Instantaneous batch size per device = 64
08/22/2022 21:08:13 - INFO - trainer -     Total train batch size (w. accumulation) = 64
08/22/2022 21:08:13 - INFO - trainer -

Epoch:   0%|          | 0/200 [00:00<?, ?it/s]

Iteration:   0%|          | 0/153 [00:00<?, ?it/s]

> [0;32m/cognitive_comp/huangyongfeng/evaluate_LM_with_rationalization/label_rationale_association/trainer.py[0m(398)[0;36m_training_step[0;34m()[0m
[0;32m    396 [0;31m            [0minputs[0m[0;34m[[0m[0mk[0m[0;34m][0m [0;34m=[0m [0mv[0m[0;34m.[0m[0mto[0m[0;34m([0m[0mself[0m[0;34m.[0m[0margs[0m[0;34m.[0m[0mdevice[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m    397 [0;31m        [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m--> 398 [0;31m        [0moutputs[0m [0;34m=[0m [0mmodel[0m[0;34m([0m[0;34m**[0m[0minputs[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m    399 [0;31m        [0;31m# model outputs are a tuple[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    400 [0;31m        [0mloss[0m [0;34m=[0m [0moutputs[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m[0m[0m
[0m
ipdb> outputs = model(**inputs)
*** ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds
ipdb> inputs