In [None]:
# !export CUDA_VISIBLE_DEVICES=""

In [None]:
from transformers import (RobertaConfig, RobertaModel, RobertaTokenizer, RobertaForCausalLM, EncoderDecoderModel)

from models import CustomEncoderDecoderModel
from data_collator import DataCollatorForSeq2Seq
from trainer import CustomTrainer, CustomTrainingArguments

from tqdm.notebook import tqdm
tqdm.pandas()
from typing import Optional, Any, Union, List, Dict, Tuple
from datasets import Dataset, DatasetDict, load_metric
import torch
import numpy as np
import pandas as pd
import random
import os
import copy
import json
import re

# Fix

## Experiment variables

In [None]:
def set_seed(seed):
    """Seed the stdlib, NumPy and PyTorch global random generators with ``seed``."""
    # Seeding order is kept: `random`, then NumPy, then PyTorch.
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)
set_seed(4321)

# Maps a pretrained decoder name to its (model class, config class) pair;
# passed as `model_dict` when a fresh encoder-decoder model is built.
DECODER_CLASSES = {'roberta-base': (RobertaForCausalLM, RobertaConfig)}
# DATASET_PATH = "dataset-ifttt-zenodo"
DATASET_PATH = "dataset-original"
# The bare `os.path.exists(...)` call used to discard its result; surface a
# missing dataset directory here instead of only failing later in `pd.read_csv`.
if not os.path.exists(DATASET_PATH):
    print(f"WARNING: dataset directory '{DATASET_PATH}' does not exist; loading the data will fail.")

# specify pretrained model
MODEL = "roberta"
assert MODEL in ('roberta', 'codebert'), f"unsupported MODEL: {MODEL!r}"

# specify training data
EXPERIMENT = "merged-prefix-ch-fc-field-interactive"
assert EXPERIMENT in ('merged-prefix-ch-fc-field-oneshot',
                      'merged-prefix-ch-fc-field-interactive'), f"unsupported EXPERIMENT: {EXPERIMENT!r}"

OUTPUT_DIR = "models/rob2rand_merged_w_prefix_interactive_5-6-2022"

# `ckpt` is always bound (None when not resuming) so that later cells can
# reference it without risking a NameError.
LOAD_FROM_CKPT = True
ckpt = "models/rob2rand_merged_w_prefix_interactive_5-6-2022/checkpoint-427050" if LOAD_FROM_CKPT else None
# assert(os.path.exists(ckpt) == True)

DEBUG = None
DATA_NUM = 128 if DEBUG else None
NUM_BEAMS = 10
RETURN_TOP_K = 10

# setting for the tokenizer
MAX_INPUT_LENGTH = 250
MAX_TARGET_LENGTH = 150

In [None]:
# Trainer configuration; the generation settings reuse the constants from the
# experiment-variables cell.
args = CustomTrainingArguments(
    f"{OUTPUT_DIR}",
    # --- what to run / when to evaluate and save ---
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    save_strategy="steps",
    # save_steps=5000 if not DEBUG else 1,
    # logging_steps=500 if not DEBUG else 1,
    save_total_limit=3,
    # --- optimisation ---
    optim='adamw_torch',
    learning_rate=5e-6,
    weight_decay=0.0,
    warmup_steps=1000,
    num_train_epochs=3,  # same value with or without DEBUG
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # fp16=True,
    no_cuda=False,
    # --- generation / model selection ---
    predict_with_generate=True,
    generation_num_beams=NUM_BEAMS or None,
    generation_max_length=MAX_TARGET_LENGTH,
    num_return_sequences=RETURN_TOP_K,
    metrics_to_check=[('eval_bleu_em', True)],
)

## load dataset

In [None]:
def get_dataset_path(root=DATASET_PATH, exp=EXPERIMENT):
    """Load ``processed.csv`` from ``root`` and build the evaluation frames for ``exp``.

    Despite its name, this returns data frames rather than a path.

    Args:
        root: directory that contains ``processed.csv``.
        exp: ``"merged-prefix-ch-fc-field-oneshot"`` or
            ``"merged-prefix-ch-fc-field-interactive"``.

    Returns:
        dict mapping a split name to a ``pandas.DataFrame``. Both experiments
        provide ``gold_all`` and ``noisy_all``. The one-shot experiment adds
        ``{gold,noisy}_{function,channel,field}``; the interactive experiment
        adds ``{gold,noisy}_{tc,tf,ac,af}``.

    Raises:
        ValueError: if ``exp`` is not one of the two supported experiments
            (previously this surfaced as an ``UnboundLocalError``).
    """
    known_experiments = ("merged-prefix-ch-fc-field-oneshot",
                         "merged-prefix-ch-fc-field-interactive")
    if exp not in known_experiments:
        raise ValueError(f"Unknown experiment {exp!r}; expected one of {known_experiments}")

    splits = ("gold", "noisy")

    def build_step(function_df, prefix, position):
        """Build the frame for one interactive step.

        The target becomes the ``position``-th ``<sep>``-separated segment of
        the full target. For every step after the first, the source is
        extended with `` <out> `` followed by the already-produced segments.
        """
        segments = [[piece.strip() for piece in target.split("<sep>")]
                    for target in function_df["target"]]
        step_df = function_df.copy()
        step_df["target"] = [pieces[position] for pieces in segments]
        step_df["source"] = [
            prefix + source + (" <out> " + " ".join(pieces[:position]) if position else "")
            for source, pieces in zip(function_df["source"], segments)
        ]
        return step_df

    df = pd.read_csv(os.path.join(root, "processed.csv"))

    if exp == "merged-prefix-ch-fc-field-oneshot":
        # Insertion order is the concatenation order: channel, function, field.
        prefixes = {
            "channel": "GENERATE CHANNEL ONLY WITHOUT FUNCTION <pf> ",
            "function": "GENERATE CHANNEL AND FUNCTION FOR BOTH TRIGGER AND ACTION <pf> ",
            "field": "GENERATE ON THE FIELD-LEVEL GRANULARITY <pf> ",
        }
        parts = []
        for granularity, prefix in prefixes.items():
            part = df[df.granularity == granularity].copy()
            part["source"] = part.source.apply(lambda text, prefix=prefix: prefix + text)
            parts.append(part)
        df = pd.concat(parts)
        # df.drop(columns=["granularity"], inplace=True)

        df_dict = {f"{split}_all": df[df["split"] == split].copy() for split in splits}
        for split in splits:
            for granularity in ("function", "channel", "field"):
                selected = (df["split"] == split) & (df.granularity == granularity)
                df_dict[f"{split}_{granularity}"] = df[selected].copy()
        return df_dict

    # exp == "merged-prefix-ch-fc-field-interactive"
    # One entry per generation step, in the order the segments appear in the
    # target: trigger channel, trigger function, action channel, action function.
    steps = (
        ("tc", "GENERATE TRIGGER CHANNEL <pf> "),
        ("tf", "GENERATE TRIGGER FUNCTION <pf> "),
        ("ac", "GENERATE ACTION CHANNEL <pf> "),
        ("af", "GENERATE ACTION FUNCTION <pf> "),
    )
    df = df[df.granularity == "function"].copy()
    step_frames = {name: build_step(df, prefix, position)
                   for position, (name, prefix) in enumerate(steps)}
    df = pd.concat(list(step_frames.values()))

    df_dict = {f"{split}_all": df[df["split"] == split].copy() for split in splits}
    for split in splits:
        for name, frame in step_frames.items():
            df_dict[f"{split}_{name}"] = frame[frame["split"] == split].copy()
    return df_dict
df_dict = get_dataset_path()

In [None]:
# df_dict['gold_af'].sample(n=10, random_state=1234)

In [None]:
def convert_to_dataset(exp=EXPERIMENT, df_dict=df_dict):
    """Convert the per-split data frames of ``exp`` into a ``DatasetDict``.

    Each frame is turned into a ``Dataset`` and the helper columns
    ``__index_level_0__``, ``split`` and ``granularity`` are removed.

    Note that the ``df_dict`` default is bound when this ``def`` is executed,
    so re-run this cell after rebuilding ``df_dict``.

    Args:
        exp: ``"merged-prefix-ch-fc-field-oneshot"`` or
            ``"merged-prefix-ch-fc-field-interactive"``.
        df_dict: mapping from split name to ``pandas.DataFrame``.

    Returns:
        ``DatasetDict`` holding one ``Dataset`` per split of the experiment.

    Raises:
        ValueError: if ``exp`` is not a supported experiment (previously the
            function silently returned ``None``).
        KeyError: if ``df_dict`` lacks one of the experiment's splits.
    """
    split_names = {
        "merged-prefix-ch-fc-field-oneshot": (
            'gold_all', 'noisy_all',
            'gold_function', 'gold_channel', 'gold_field',
            'noisy_function', 'noisy_channel', 'noisy_field',
        ),
        "merged-prefix-ch-fc-field-interactive": (
            'gold_all', 'noisy_all',
            'gold_tc', 'gold_tf', 'gold_ac', 'gold_af',
            'noisy_tc', 'noisy_tf', 'noisy_ac', 'noisy_af',
        ),
    }
    if exp not in split_names:
        raise ValueError(f"Unknown experiment {exp!r}; expected one of {tuple(split_names)}")

    helper_columns = ['__index_level_0__', 'split', 'granularity']
    return DatasetDict({
        name: Dataset.from_pandas(df_dict[name]).remove_columns(helper_columns)
        for name in split_names[exp]
    })

dataset = convert_to_dataset()

print(dataset.column_names)
# NOTE: the 'noisy_af' split is only built for the interactive experiment.
print([dataset['noisy_af'][0]])

## load tokenizer

In [None]:
def load_tokenizer(model=MODEL):
    """Return the ``RobertaTokenizer`` for this run.

    When ``LOAD_FROM_CKPT`` is set the tokenizer stored in ``ckpt`` is used and
    ``model`` is ignored; otherwise ``model`` selects the pretrained tokenizer.

    Raises:
        ValueError: if no checkpoint is used and ``model`` is not a known type.
    """
    if LOAD_FROM_CKPT:
        return RobertaTokenizer.from_pretrained(ckpt)

    pretrained_names = (
        ('roberta', "roberta-base"),
        ('codebert', "microsoft/codebert-base"),
    )
    for model_type, pretrained_name in pretrained_names:
        if model == model_type:
            return RobertaTokenizer.from_pretrained(pretrained_name)
    raise ValueError("Undefined model type")

tokenizer = load_tokenizer()

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of ``source``/``target`` texts for seq2seq training.

    Sources are truncated to ``MAX_INPUT_LENGTH`` tokens and targets to
    ``MAX_TARGET_LENGTH`` tokens; no padding is applied here. The target token
    ids are returned under the ``labels`` key.
    """
    source_texts = list(examples["source"])
    target_texts = list(examples["target"])

    encoded = tokenizer(source_texts, max_length=MAX_INPUT_LENGTH, truncation=True, padding=False)

    # Targets are tokenized inside the tokenizer's target context.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(target_texts, max_length=MAX_TARGET_LENGTH, truncation=True, padding=False)

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded


# Tokenize every split, dropping the raw text columns.
tokenized_datasets = dataset.map(
    preprocess_function,
    remove_columns=dataset["gold_all"].column_names,
    batched=True,
)


# Preview the input ids of the first eight noisy examples.
for item in tokenized_datasets['noisy_all'][:8]['input_ids']:
    print(item)

## load model

In [None]:
# Either resume from the fine-tuned checkpoint or build a fresh encoder-decoder
# model from two "roberta-base" checkpoints (via the project's custom class).
if LOAD_FROM_CKPT:
    model = EncoderDecoderModel.from_pretrained(ckpt)
    print(f"Loading from {ckpt}")
else:
    model = CustomEncoderDecoderModel.from_encoder_decoder_pretrained("roberta-base", "roberta-base", random_decoder=True, model_dict=DECODER_CLASSES)
    print("Loading not from checkpoint")

# Generation-related settings on the combined config.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size
# `architectures` is documented as a list of class names, not a bare string.
model.config.architectures = ["EncoderDecoderModel"]
model.config.max_length = MAX_TARGET_LENGTH

## data collator

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
if DEBUG:
    # Smoke-test the collator on two examples. The notebook never builds a
    # "train" split, so use "gold_all" (the trainer's train_dataset) instead.
    batch = data_collator([tokenized_datasets["gold_all"][i] for i in range(1, 3)])
    print(batch.keys())
    print(batch["labels"])

# custom trainer

In [None]:
# Not actually used for the reported numbers, but the trainer misbehaves when
# no compute_metrics callback is supplied.
def compute_metrics(eval_preds):
    """Compute BLEU, exact match and their mean for a batch of generations.

    Relies on the module-level ``tokenizer`` and on the ``bleu`` / ``em``
    metric objects, which this notebook only assigns in the "compute metrics"
    section further down. A second function with the same name is defined
    there as well.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token ids; if
            ``predictions`` is a tuple only its first element is decoded.

    Returns:
        dict with the keys ``bleu``, ``em`` and ``bleu_em`` (their mean).
    """
    predictions, label_ids = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    pred_texts = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # -100 marks ignored label positions and cannot be decoded; swap in the pad id.
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Keep only what follows the last "<pf>" marker.
    pred_texts = [text.split("<pf>")[-1].strip() for text in pred_texts]
    label_texts = [text.split("<pf>")[-1].strip() for text in label_texts]

    # BLEU is given one single-element reference list per prediction.
    bleu_score = bleu.compute(predictions=pred_texts,
                              references=[[text] for text in label_texts])["score"]
    em_score = em.compute(predictions=pred_texts, references=label_texts)['exact_match']
    return {"bleu": bleu_score,
            "em": em_score,
            "bleu_em": (bleu_score + em_score) / 2}
###

In [None]:
# Build the trainer that the cells below use for `trainer.inference(...)` and
# `trainer.push_to_hub()`.
trainer = CustomTrainer(
    model,
    args,
    # NOTE(review): the "gold" split is passed as training data here; the
    # visible cells only run inference - confirm this is intentional.
    train_dataset=tokenized_datasets["gold_all"],
    # Each inference call below passes its own `eval_dataset`.
    eval_dataset=tokenized_datasets["noisy_all"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Run inference once per interactive split, in place of eight copy-pasted
# cells. The outputs of each split go to "<checkpoint>/interactive_<split>".
INFERENCE_ROOT = "models/rob2rand_merged_w_prefix_interactive_5-6-2022/checkpoint-427050"
INFERENCE_SPLITS = (
    "gold_tc", "gold_tf", "gold_ac", "gold_af",
    "noisy_tc", "noisy_tf", "noisy_ac", "noisy_af",
)

for split_name in INFERENCE_SPLITS:
    output_dir_inference = f"{INFERENCE_ROOT}/interactive_{split_name}"
    trainer.inference(output_dir_inference=output_dir_inference,
                      eval_dataset=tokenized_datasets[split_name])

# compute metrics

In [None]:
# Metric objects referenced as globals by both `compute_metrics` definitions
# in this notebook.
bleu = load_metric("sacrebleu")
em = load_metric("exact_match")

In [None]:
def atoi(text):
    """Return ``text`` as an ``int`` if it consists only of decimal digits, else unchanged."""
    # isdecimal() rejects characters such as superscript digits, which
    # isdigit() accepts but int() cannot parse.
    return int(text) if text.isdecimal() else text

def natural_keys(text):
    '''
    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    return [atoi(chunk) for chunk in re.split(r'(\d+)', text)]

def get_predictions(path):
    """Read every ``*.pred`` and ``*.gold`` file directly inside ``path``.

    File names are visited in natural ("human") order so that, for example,
    ``2.pred`` comes before ``10.pred``. Each file contributes one list of its
    whitespace-stripped lines. Entries with any other suffix, and entries that
    are not regular files, are skipped without being opened, so a stray binary
    file can no longer break loading.

    Args:
        path: directory holding the prediction and reference files.

    Returns:
        ``(preds, refs)``: two lists of lists of strings, one inner list per
        ``.pred`` / ``.gold`` file respectively.
    """
    preds = []
    refs = []
    for file_name in sorted(os.listdir(path), key=natural_keys):
        if file_name.endswith(".pred"):
            bucket = preds
        elif file_name.endswith(".gold"):
            bucket = refs
        else:
            continue
        file_path = os.path.join(path, file_name)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r") as f:
            bucket.append([line.strip() for line in f])
    return preds, refs

def compute_metrics(preds, refs):
    """Score ranked predictions with BLEU, exact match and MRR@3/5/10.

    Relies on the module-level ``bleu`` and ``em`` metric objects.

    Args:
        preds: one list per example with its candidate strings; index 0 is
            treated as the top candidate.
        refs: one list per example with its reference string(s); index 0 is
            the reference used for BLEU and exact match.

    Returns:
        dict with ``bleu``, ``em``, ``bleu_em`` (mean of the two) and
        ``mrr_3`` / ``mrr_5`` / ``mrr_10``, each rounded to three decimals.
    """
    def compute_mrr(preds, refs, top_k):
        """Mean reciprocal rank of the first matching candidate within ``top_k``."""
        temp_preds = [x[:top_k] for x in preds]
        temp_refs = [x[:top_k] for x in refs]
        # Compare against the truncated references. The original compared
        # against the untruncated `refs`, which only has a compatible shape
        # when every reference list holds a single entry.
        hits = np.array(temp_preds) == np.array(temp_refs)
        reciprocal_ranks = []
        for row in hits:
            positions = np.asarray(row).nonzero()[0]
            # Rank is 1-based; an example without any match contributes 0.
            reciprocal_ranks.append(1. / (positions[0] + 1) if positions.size else 0.)
        return round(np.mean(reciprocal_ranks), 3)

    def prepare_for_bleu(input_list):
        """Take each example's first entry and join its stripped ``<sep>`` segments with spaces."""
        return [' '.join(segment.strip() for segment in item[0].split("<sep>"))
                for item in input_list]

    mrr_3 = compute_mrr(preds, refs, 3)
    mrr_5 = compute_mrr(preds, refs, 5)
    mrr_10 = compute_mrr(preds, refs, 10)

    preds_bleu = prepare_for_bleu(preds)
    # BLEU is given one single-element reference list per prediction.
    refs_bleu = [[x] for x in prepare_for_bleu(refs)]
    bleu_dict = bleu.compute(predictions=preds_bleu, references=refs_bleu)

    # Exact match is computed on the raw (un-normalised) top-1 strings.
    preds = [pred[0] for pred in preds]
    refs = [label[0] for label in refs]
    em_dict = em.compute(predictions=preds, references=refs)
    return {"bleu": round(bleu_dict["score"], 3),
           "em": round(em_dict['exact_match'], 3),
           "bleu_em": round((bleu_dict['score']+em_dict['exact_match'])/2, 3),
           "mrr_3": mrr_3,
           "mrr_5": mrr_5,
           "mrr_10": mrr_10}

In [None]:
# Score the saved predictions of every interactive split in one loop, in place
# of sixteen copy-pasted cells, and show the results as a single table.
PREDICTION_ROOT = "models/rob2rand_merged_w_prefix_interactive_5-6-2022/checkpoint-427050"
EVALUATED_SPLITS = (
    "gold_tc", "gold_tf", "gold_ac", "gold_af",
    "noisy_tc", "noisy_tf", "noisy_ac", "noisy_af",
)

metrics_by_split = {}
for split_name in EVALUATED_SPLITS:
    path = f"{PREDICTION_ROOT}/interactive_{split_name}"
    preds, refs = get_predictions(path=path)
    metrics_by_split[split_name] = compute_metrics(preds, refs)

# One row per split, one column per metric.
pd.DataFrame.from_dict(metrics_by_split, orient="index")

# push model to the hub

In [None]:
from huggingface_hub import notebook_login

In [None]:
# Log in to the Hugging Face Hub from the notebook before pushing the model.
notebook_login()

In [None]:
# Re-point the trainer's output directory before pushing.
# NOTE(review): presumably the Hub repository name is derived from output_dir - confirm.
trainer.args.output_dir = "rob2rand_merged_w_prefix_c_fc_interactive"

In [None]:
# Display the model configuration for a final check before uploading.
trainer.model.config

In [None]:
# Upload via the trainer; requires the Hub login performed above.
trainer.push_to_hub()