In [1]:
from transformers import (RobertaConfig, RobertaModel, RobertaTokenizer, RobertaForCausalLM)

from models import CustomEncoderDecoderModel
from data_collator import DataCollatorForSeq2Seq
from trainer import CustomTrainer, CustomTrainingArguments

from tqdm.notebook import tqdm
tqdm.pandas()
from typing import Optional, Any, Union, List, Dict, Tuple
from datasets import Dataset, DatasetDict, load_metric
import torch
import numpy as np
import pandas as pd
import random
import os

In [9]:
from transformers import EncoderDecoderModel
from tqdm.notebook import tqdm
tqdm.pandas()

from transformers import (RobertaConfig, RobertaModel, RobertaTokenizer, RobertaForCausalLM)
from trainer import CustomTrainer, CustomTrainingArguments
from data_collator import DataCollatorForSeq2Seq

from datasets import Dataset, DatasetDict, load_metric
import random
import os
import numpy as np
import torch
import pandas as pd

# Fix

## Experiment variables

In [5]:
def set_seed(seed):
    """Seed the `random`, NumPy and PyTorch generators with the same value."""
    # Same order as before: Python stdlib, then NumPy, then PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
# Seed the RNGs before any data sampling or model initialisation happens.
set_seed(4321)

# Maps a decoder checkpoint name to its (model class, config class) pair; it is
# passed as `model_dict` when the model is built without a checkpoint.
DECODER_CLASSES = {'roberta-base': (RobertaForCausalLM, RobertaConfig)}

# Root directory of the dataset pickles. The existence check used to be a bare
# expression whose result was thrown away; assert it so a wrong path fails here
# rather than at the first `pd.read_pickle` call.
DATASET_PATH = "dataset-ifttt-zenodo"
assert os.path.exists(DATASET_PATH), f"Dataset directory not found: {DATASET_PATH}"

# Selects which pretrained tokenizer `load_tokenizer` loads.
MODEL = "roberta"
assert MODEL in ('roberta', 'codebert')

# Selects which dataset files `get_dataset_path` returns.
EXPERIMENT = "chen"
assert EXPERIMENT in ('chen', 'mi', 'merged')

# First positional argument of CustomTrainingArguments.
OUTPUT_DIR = "rob2rand_chen"

# When true the model is restored from `ckpt`. Note that `ckpt` is only bound
# inside this branch.
LOAD_FROM_CKPT = True
if LOAD_FROM_CKPT:
    ckpt = "models/rob2rand_chen/checkpoint-35000"
    assert os.path.exists(ckpt), f"Checkpoint directory not found: {ckpt}"

# A truthy DEBUG limits every split to DATA_NUM rows and switches the training
# arguments to per-step logging/saving and 3 epochs.
DEBUG = None
DATA_NUM = 8 if DEBUG else None

# Generation settings forwarded to the training arguments
# (`generation_num_beams` and `num_return_sequences`).
NUM_BEAMS = 10
RETURN_TOP_K = 10

# Which inference cells run at the end of the notebook.
DO_INFERENCE_GOLD = True
DO_INFERENCE_NOISY = True
DO_INFERENCE_TEST_FIELD = False

# Truncation limits (in tokens) used by the tokenizer; MAX_TARGET_LENGTH is
# also the generation max length.
MAX_INPUT_LENGTH = 100
MAX_TARGET_LENGTH = 100

In [6]:
# Settings for the project-local trainer. A truthy DEBUG switches to per-step
# logging/checkpointing and a 3-epoch schedule; enabling any DO_INFERENCE_*
# flag lowers the per-device batch size from 64 to 32.
args = CustomTrainingArguments(
    str(OUTPUT_DIR),
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=1 if DEBUG else 500,
    logging_steps=1 if DEBUG else 500,
    do_eval=True,
    do_train=True,
    learning_rate=5e-6,
    per_device_train_batch_size=32 if any((DO_INFERENCE_GOLD, DO_INFERENCE_NOISY, DO_INFERENCE_TEST_FIELD)) else 64,
    per_device_eval_batch_size=32 if any((DO_INFERENCE_GOLD, DO_INFERENCE_NOISY, DO_INFERENCE_TEST_FIELD)) else 64,
    weight_decay=0.0,
    warmup_steps=1000,
    save_total_limit=3,
    num_train_epochs=3 if DEBUG else 50,
    predict_with_generate=True,
    fp16=True,
    optim='adamw_torch',
    # A falsy NUM_BEAMS is passed on as None.
    generation_num_beams=NUM_BEAMS or None,
    generation_max_length=MAX_TARGET_LENGTH,
    num_return_sequences=RETURN_TOP_K,
    metrics_to_check=[
        ('eval_bleu', True),
        ('eval_bleu_em', True),
        ('eval_em', True),
    ],
)

## load dataset

In [7]:
def get_dataset_path(root=DATASET_PATH, exp=EXPERIMENT):
    """Return the pickle path of every dataset split for an experiment.

    Args:
        root: Directory that contains the dataset folders.
        exp: Experiment name. Only "chen" has paths defined in this function.

    Returns:
        dict with the keys "train", "val", "gold" and "noisy", each mapped to
        a path below ``root``.

    Raises:
        ValueError: If ``exp`` is an experiment without defined paths.
    """
    if exp != "chen":
        # Any other experiment used to fall through to the return statement
        # and fail with an UnboundLocalError on `train_path`.
        raise ValueError(f"No dataset paths defined for experiment {exp!r}")

    return {
        "train": os.path.join(root, "ready-train-chen-only/train-chen.pkl"),
        "val": os.path.join(root, "ready-train-val-noisy/validation-noisy.pkl"),
        "gold": os.path.join(root, "ready-test-clean/test_gold_clean.pkl"),
        "noisy": os.path.join(root, "ready-test-clean/test_intel_clean.pkl"),
    }

path_dict = get_dataset_path()

In [10]:
def load_dataset(path_dict=path_dict, number=None):
    """Read every split listed in ``path_dict`` into a pandas DataFrame.

    Args:
        path_dict: dict mapping a split name to the path of a pickled
            DataFrame.
        number: If truthy, keep a random sample of at most this many rows per
            split (fixed ``random_state=1234``); otherwise keep all rows.

    Returns:
        dict mapping each split name to its DataFrame.
    """
    assert isinstance(path_dict, dict)
    df_dict = {}
    for split, path in path_dict.items():
        # SECURITY: unpickling can execute arbitrary code -- only point this
        # at dataset files from a trusted source.
        frame = pd.read_pickle(path)
        if number:
            # Cap the request: sampling more rows than a split holds raises.
            sample_size = min(number, len(frame))
            frame = frame.sample(n=sample_size, random_state=1234).copy()
        df_dict[split] = frame
    return df_dict

# DATA_NUM is None outside debug mode, which loads every row.
df_dict = load_dataset(number=DATA_NUM)

df_dict['train'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45003 entries, 0 to 45005
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   source  45003 non-null  object
 1   target  45003 non-null  object
dtypes: object(2)
memory usage: 1.0+ MB


In [11]:
def convert_to_dataset(df_dict=df_dict):
    """Wrap each DataFrame of ``df_dict`` in a ``Dataset`` and bundle them.

    The pandas index is dropped with ``preserve_index=False``. The previous
    code removed the ``__index_level_0__`` column after conversion, which only
    exists when the frame carries a non-default index and otherwise makes
    ``remove_columns`` fail.

    Args:
        df_dict: dict mapping a split name to a DataFrame.

    Returns:
        DatasetDict with one entry per key of ``df_dict``, in the same order.
    """
    return DatasetDict({
        split: Dataset.from_pandas(frame, preserve_index=False)
        for split, frame in df_dict.items()
    })

dataset = convert_to_dataset()

if DEBUG:
    print(dataset.column_names)
    print([dataset['train'][0]])

## load tokenizer

In [12]:
def load_tokenizer(model=MODEL):
    """Return the RobertaTokenizer for the requested model family.

    'roberta' loads "roberta-base" and 'codebert' loads
    "microsoft/codebert-base"; any other value raises ValueError.
    """
    known_checkpoints = (
        ('roberta', "roberta-base"),
        ('codebert', "microsoft/codebert-base"),
    )
    for family, checkpoint in known_checkpoints:
        if model == family:
            return RobertaTokenizer.from_pretrained(checkpoint)
    raise ValueError("Undefined model type")

tokenizer = load_tokenizer()

In [13]:
def preprocess_function(examples):
    """Tokenize one batch of source/target pairs.

    Args:
        examples: batch with a "source" and a "target" column.

    Returns:
        The tokenizer output for the sources, extended with a "labels" entry
        holding the "input_ids" of the tokenized targets.
    """
    sources = list(examples["source"])
    targets = list(examples["target"])

    encoded = tokenizer(sources, max_length=MAX_INPUT_LENGTH, truncation=True, padding=False)

    # Targets are tokenized inside the tokenizer's target context.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(targets, max_length=MAX_TARGET_LENGTH, truncation=True, padding=False)

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded


# Tokenize every split batch-wise; the columns of the raw training split are
# removed from the result.
tokenized_datasets = dataset.map(
    preprocess_function,
    remove_columns=dataset["train"].column_names,
    batched=True,
)

if DEBUG:
    # Print the input ids of the first eight training examples, one per line.
    for token_ids in tokenized_datasets['train'][:8]['input_ids']:
        print(token_ids)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=46.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




## load model

In [14]:
# Either restore the full encoder-decoder model from the configured checkpoint
# or assemble a new one from two "roberta-base" checkpoints.
if LOAD_FROM_CKPT:
    model = EncoderDecoderModel.from_pretrained(ckpt)
    print(f"Loading from {ckpt}")
else:
    # NOTE(review): `random_decoder` and `model_dict` are not parameters of the
    # stock transformers `EncoderDecoderModel`; presumably the project's
    # `CustomEncoderDecoderModel` was meant here -- confirm before training
    # from scratch.
    model = EncoderDecoderModel.from_encoder_decoder_pretrained("roberta-base", "roberta-base", random_decoder=True, model_dict=DECODER_CLASSES)
    print("Loading not from checkpoint")
    # Generation starts from the tokenizer's CLS token and pads with its pad token.
    model.config.decoder_start_token_id = tokenizer.cls_token_id
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.vocab_size = model.config.decoder.vocab_size
    # `architectures` holds a list of class names, not a bare string.
    model.config.architectures = ["EncoderDecoderModel"]
    # Keep the default generation length in sync with the tokenizer/target limit.
    model.config.max_length = MAX_TARGET_LENGTH

Loading from models/rob2rand_chen/checkpoint-35000


## data collator

In [15]:
# Batch builder handed to the trainer (project-local DataCollatorForSeq2Seq).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
if DEBUG:
    # Collate training examples 1 and 2 and show the resulting label batch.
    batch = data_collator([tokenized_datasets["train"][idx] for idx in (1, 2)])
    batch.keys()
    print(batch["labels"])

# metric

In [16]:
# Metric objects read by `compute_metrics`: "sacrebleu" and "exact_match",
# each loaded once at module level.
bleu = load_metric("sacrebleu")
em = load_metric("exact_match")

def compute_metrics(eval_preds):
    """Score generated sequences against their references.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. When
            ``predictions`` is a tuple only its first element is used.

    Returns:
        dict with "bleu" (the sacreBLEU "score"), "em" (the "exact_match"
        value) and "bleu_em" (the mean of the two).
    """

    def decode_preds(eval_preds):
        """Decode both id arrays into stripped strings.

        Returns the predictions as a flat list and the references as a list of
        single-element lists (the shape the BLEU metric is given).
        """
        preds, labels = eval_preds
        # In case the model returns more than the prediction logits
        if isinstance(preds, tuple):
            preds = preds[0]

        # -100 is a padding/ignore marker, not a vocabulary id, so it cannot
        # be decoded. Replace it in the predictions as well as in the labels.
        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Some simple post-processing
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [[label.strip()] for label in decoded_labels]
        return decoded_preds, decoded_labels

    decoded_preds, decoded_labels = decode_preds(eval_preds)

    bleu_dict = bleu.compute(predictions=decoded_preds, references=decoded_labels)

    # Exact match compares against flat strings, so unwrap the references.
    flat_labels = [label[0] for label in decoded_labels]
    em_dict = em.compute(predictions=decoded_preds, references=flat_labels)

    bleu_score = bleu_dict["score"]
    em_score = em_dict['exact_match']
    return {"bleu": bleu_score,
            "em": em_score,
            "bleu_em": (bleu_score + em_score) / 2}

# custom trainer

In [17]:
# Build the project-local trainer from the model and the training arguments.
# "train" is the training split and "val" the default evaluation split;
# `compute_metrics` supplies the bleu / em / bleu_em values.
trainer = CustomTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Results on the latest checkpoint

In [18]:
# No dataset is passed here; presumably this scores the trainer's default
# eval_dataset (the "val" split) -- confirm against CustomTrainer.
trainer.evaluate()



{'eval_loss': 0.07814013957977295,
 'eval_bleu': 81.21997005616592,
 'eval_em': 50.22452504317789,
 'eval_bleu_em': 65.7222475496719,
 'eval_runtime': 185.1399,
 'eval_samples_per_second': 15.637,
 'eval_steps_per_second': 0.492}

In [15]:
# Evaluate on the "gold" test split.
# NOTE(review): the recorded eval_bleu_em below equals eval_bleu + eval_em,
# while compute_metrics returns their mean, so this output appears to come
# from an earlier version of the metric code -- re-run this cell.
trainer.evaluate(eval_dataset=tokenized_datasets['gold'])

{'eval_loss': 0.01039204653352499,
 'eval_bleu': 97.7771768701106,
 'eval_em': 91.14754098360656,
 'eval_bleu_em': 188.92471785371714,
 'eval_runtime': 17.9924,
 'eval_samples_per_second': 16.952,
 'eval_steps_per_second': 0.278}

In [16]:
# Evaluate on the "noisy" test split.
# NOTE(review): the recorded eval_bleu_em below equals eval_bleu + eval_em,
# while compute_metrics returns their mean, so this output appears to come
# from an earlier version of the metric code -- re-run this cell.
trainer.evaluate(eval_dataset=tokenized_datasets['noisy'])

{'eval_loss': 0.06291293352842331,
 'eval_bleu': 85.8551097560776,
 'eval_em': 53.18595578673602,
 'eval_bleu_em': 139.0410655428136,
 'eval_runtime': 47.1894,
 'eval_samples_per_second': 16.296,
 'eval_steps_per_second': 0.275}

# inference

In [19]:
if DO_INFERENCE_GOLD:
    # `ckpt` is only bound when LOAD_FROM_CKPT is true; fall back to OUTPUT_DIR
    # so this cell does not fail with a NameError otherwise.
    INFERENCE_DIR_GOLD = f"{ckpt if LOAD_FROM_CKPT else OUTPUT_DIR}/inference/gold"
    # exist_ok replaces the separate exists() check.
    os.makedirs(INFERENCE_DIR_GOLD, exist_ok=True)
    # Run the trainer's inference on the "gold" split with this output directory.
    trainer.inference(eval_dataset=tokenized_datasets['gold'], output_dir_inference=INFERENCE_DIR_GOLD)

In [20]:
if DO_INFERENCE_NOISY:
    # `ckpt` is only bound when LOAD_FROM_CKPT is true; fall back to OUTPUT_DIR
    # so this cell does not fail with a NameError otherwise.
    INFERENCE_DIR_NOISY = f"{ckpt if LOAD_FROM_CKPT else OUTPUT_DIR}/inference/noisy"
    # exist_ok replaces the separate exists() check.
    os.makedirs(INFERENCE_DIR_NOISY, exist_ok=True)
    # Run the trainer's inference on the "noisy" split with this output directory.
    trainer.inference(eval_dataset=tokenized_datasets['noisy'], output_dir_inference=INFERENCE_DIR_NOISY)

In [None]:
if DO_INFERENCE_TEST_FIELD:
    # The dataset cells only build the train/val/gold/noisy splits, so check
    # for the split before creating any directory and fail with a clear message.
    if 'test_field' not in tokenized_datasets:
        raise KeyError(
            "tokenized_datasets has no 'test_field' split; add it to the "
            "dataset paths before enabling DO_INFERENCE_TEST_FIELD"
        )
    # `ckpt` is only bound when LOAD_FROM_CKPT is true; fall back to OUTPUT_DIR
    # so this cell does not fail with a NameError otherwise.
    INFERENCE_DIR_TEST_FIELD = f"{ckpt if LOAD_FROM_CKPT else OUTPUT_DIR}/inference/test_field"
    # exist_ok replaces the separate exists() check.
    os.makedirs(INFERENCE_DIR_TEST_FIELD, exist_ok=True)
    trainer.inference(eval_dataset=tokenized_datasets['test_field'], output_dir_inference=INFERENCE_DIR_TEST_FIELD)