In [1]:
import dataclasses
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Dict, Optional

import numpy as np

In [2]:
from transformers import (
    AutoConfig,
    AutoTokenizer,
    T5Tokenizer,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    T5ForConditionalGeneration,
    set_seed )

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [4]:
from utils_glue_T5_classification import (
    T5GlueDataset, 
    T5_glue_compute_metrics, 
    T5_glue_output_modes, 
    T5_glue_tasks_num_labels,
    processors,
    Split
)

In [5]:
@dataclass
class ModelArguments:
    """
    Selects the pretrained checkpoint to fine-tune and, optionally, a separate
    config, tokenizer, and download cache location.
    """

    # Required: no default, so the argument parser must be given a value.
    model_name_or_path: str = field(
        metadata={
            "help": "Path to pretrained model or model identifier from huggingface.co/models"
        }
    )
    # Optional overrides; each one defaults to None.
    config_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained config name or path if not the same as model_name"},
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from s3"},
    )
        
@dataclass
class DataTrainingArguments:
    """
    Data-side options: which task to run, where its files live, the maximum
    tokenized sequence length, and whether cached datasets are rebuilt.
    """

    # Required. The help text lists the keys of `processors` at class-definition time.
    task_name: str = field(
        metadata={"help": "The name of the task to train on: " + ", ".join(processors.keys())}
    )
    # Required.
    data_dir: str = field(
        metadata={"help": "Should contain the data files for the task."}
    )
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached training and evaluation sets"},
    )

    def __post_init__(self):
        # Store the task name lower-cased so later lookups keyed on it
        # (e.g. "SST-2" -> "sst-2") are case-insensitive with respect to user input.
        lowered_name = self.task_name.lower()
        self.task_name = lowered_name

# Parser that fills ModelArguments, DataTrainingArguments and TrainingArguments
# from one flat list of command-line style strings.
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
logger = logging.getLogger(__name__)

# --- Experiment configuration -------------------------------------------------
# NOTE(review): the two absolute paths below are specific to the original
# author's machine; adjust them before running elsewhere.
MODEL_NAME = "t5-small"
DATESTAMP = "20200814"
SUPER_GLUE_DIR = "/home/keyur/medhas/glue_data/"
TASK_NAME = "SST-2"
PER_DEVICE_BATCH_SIZE = 48
EXPERIMENT_DIR = "/mnt/data/medhas/glue_experiments/%s/%s" % (MODEL_NAME, DATESTAMP)

# Arguments are supplied as a list instead of sys.argv so the notebook can be
# run without a command line.
custom_sysargv = [
    "--model_name_or_path=%s" % MODEL_NAME,
    "--task_name=%s" % TASK_NAME,
    "--do_train",
    "--do_eval",
    "--data_dir=%s" % os.path.join(SUPER_GLUE_DIR, TASK_NAME),
    "--max_seq_length=256",
    "--per_device_train_batch_size=%s" % PER_DEVICE_BATCH_SIZE,
    "--per_device_eval_batch_size=%s" % PER_DEVICE_BATCH_SIZE,
    "--learning_rate=2e-5",
    "--num_train_epochs=5",
    "--output_dir=%s" % os.path.join(EXPERIMENT_DIR, TASK_NAME),
    "--logging_dir=%s/logs" % os.path.join(EXPERIMENT_DIR, TASK_NAME),
    "--logging_steps=702",
    "--evaluate_during_training",
    # Spelled in full: the shortened "--eval_step" was only accepted through
    # argparse's prefix-abbreviation matching, which breaks as soon as another
    # option shares that prefix or abbreviations are disabled.
    "--eval_steps=702",
    "--save_total_limit=2",
    "--save_steps=1000",
    "--gradient_accumulation_steps=1",
    "--overwrite_output_dir",
]

model_args, data_args, training_args = parser.parse_args_into_dataclasses(args=custom_sysargv)

# Setup logging
# The level is WARN for every process rank (the earlier conditional returned
# WARN on both branches), so INFO records such as the parameter dump below are
# filtered out unless this level is lowered.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.WARN,
)
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    training_args.local_rank,
    training_args.device,
    training_args.n_gpu,
    bool(training_args.local_rank != -1),
    training_args.fp16,
)
logger.info("Training/evaluation parameters %s", training_args)

# Seed the run from the parsed training arguments.
set_seed(training_args.seed)

# Per-task metadata looked up by the lower-cased task name.
num_labels = T5_glue_tasks_num_labels[data_args.task_name]
output_mode = T5_glue_output_modes[data_args.task_name]
print("Task:", data_args.task_name, "Labels:", num_labels, ', Output', output_mode)




Task: sst-2 Labels: 2 , Output classification


In [6]:
# Display the task-name -> label-count mapping imported from utils_glue_T5_classification.
T5_glue_tasks_num_labels

{'sst-2': 2, 'cola': 2}

In [7]:
# Use the explicit config name when one was given; otherwise fall back to the
# model name/path.
config_source = model_args.config_name or model_args.model_name_or_path
config = AutoConfig.from_pretrained(
    config_source,
    num_labels=num_labels,
    finetuning_task=data_args.task_name,
    cache_dir=model_args.cache_dir,
)

In [8]:
# Use the explicit tokenizer name when one was given; otherwise fall back to the
# model name/path.
tokenizer_source = model_args.tokenizer_name or model_args.model_name_or_path
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_source,
    cache_dir=model_args.cache_dir,
)

In [9]:
# from_tf is switched on when the model name/path contains ".ckpt".
load_from_tf = ".ckpt" in model_args.model_name_or_path
model = T5ForConditionalGeneration.from_pretrained(
    model_args.model_name_or_path,
    from_tf=load_from_tf,
    config=config,
    cache_dir=model_args.cache_dir,
)

In [10]:
def _load_split(mode):
    """Build the T5GlueDataset for one split.

    Every split shares the same data directory, tokenizer, task name, maximum
    sequence length and cache policy; only ``mode`` differs.

    Args:
        mode: A ``Split`` member (``Split.train``, ``Split.dev`` or ``Split.test``).

    Returns:
        The ``T5GlueDataset`` constructed for that split.
    """
    return T5GlueDataset(
        data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=mode,
    )


# A split is only built when its training-argument flag is set; otherwise the
# corresponding variable is None.
train_dataset = _load_split(Split.train) if training_args.do_train else None
eval_dataset = _load_split(Split.dev) if training_args.do_eval else None
test_dataset = _load_split(Split.test) if training_args.do_predict else None

In [11]:
def compute_metrics(p: EvalPrediction) -> Dict:
    """Turn raw model predictions into the metrics for the configured task.

    Args:
        p: Evaluation output; ``p.predictions`` and ``p.label_ids`` are read.

    Returns:
        Whatever ``T5_glue_compute_metrics`` returns for
        ``data_args.task_name`` (annotated as a dict).

    Raises:
        ValueError: If the module-level ``output_mode`` is neither
            ``"classification"`` nor ``"regression"``.
    """
    if output_mode == "classification":
        # argmax over axis 2 -- assumes predictions are shaped
        # (batch, sequence, vocabulary); TODO confirm against the model output.
        preds = np.argmax(p.predictions, axis=2)
    elif output_mode == "regression":
        preds = np.squeeze(p.predictions)
    else:
        # Previously this case fell through and failed with an
        # UnboundLocalError on `preds`; fail with a clear message instead.
        raise ValueError(
            "Unsupported output_mode %r; expected 'classification' or 'regression'" % (output_mode,)
        )
    return T5_glue_compute_metrics(data_args.task_name, preds, p.label_ids)

In [12]:
# Build the Trainer from the model, the parsed training arguments, the
# train/eval datasets (either may be None when its flag is off) and the
# metric callback.
trainer = Trainer(
    model=model, args=training_args,
    train_dataset=train_dataset, eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
if training_args.do_train:
    # Pass the model location on to train() only when it is a local
    # directory; otherwise pass None.
    if os.path.isdir(model_args.model_name_or_path):
        checkpoint_dir = model_args.model_name_or_path
    else:
        checkpoint_dir = None
    trainer.train(model_path=checkpoint_dir)
    trainer.save_model()
    # Also write the tokenizer into the output directory (world-master
    # process only) so the saved model can be shared on huggingface.co/models.
    if trainer.is_world_master():
        tokenizer.save_pretrained(training_args.output_dir)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=702.0, style=ProgressStyle(description_wi…

