In [1]:
from transformers import (RobertaConfig, RobertaModel, RobertaTokenizer, RobertaForCausalLM, EncoderDecoderModel)

from models import CustomEncoderDecoderModel
from data_collator import DataCollatorForSeq2Seq
from trainer import CustomTrainer, CustomTrainingArguments

from tqdm.notebook import tqdm
tqdm.pandas()
from typing import Optional, Any, Union, List, Dict, Tuple
from datasets import Dataset, DatasetDict, load_metric
import torch
import numpy as np
import pandas as pd
import random
import os
import copy

In [2]:
import json

# Fix

## experiment variable

In [3]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators with `seed`."""
    # Same three seeders as before, applied in the same order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)


set_seed(4321)

# Maps a decoder name to the (model class, config class) pair passed to the custom
# encoder-decoder loader as `model_dict`.
DECODER_CLASSES = {'roberta-base': (RobertaForCausalLM, RobertaConfig)}
# DATASET_PATH = "dataset-ifttt-zenodo"
DATASET_PATH = "dataset-original"
# Fail fast: a bare `os.path.exists(...)` in the middle of a cell is evaluated and
# silently discarded, so a missing dataset directory would go unnoticed here.
assert os.path.exists(DATASET_PATH), f"Dataset directory not found: {DATASET_PATH}"

# specify pretrained model
MODEL = "roberta"
assert MODEL in ('roberta', 'codebert')

# specify training data
EXPERIMENT = "merged-prefix-ch-fc-field"
assert EXPERIMENT in ('chen', 'mi', 'merged', 'chen-prefix', 'chen-prefix-ch', 'chen-prefix-fc',
                      'merged-prefix-ch-fc-field')

OUTPUT_DIR = "models/rob2rand_merged_w_prefix_2-6-22"

# `ckpt` is always defined so that cells referencing it (e.g. the evaluation cells that
# pass `output_dir=ckpt`) do not raise NameError on a fresh kernel. The tokenizer and
# model are only loaded from it when LOAD_FROM_CKPT is True.
LOAD_FROM_CKPT = False
ckpt = "models/rob2rand_chen_w_prefix_26-5-22/checkpoint-70000"
# assert(os.path.exists(ckpt) == True)

# Any truthy DEBUG value switches the notebook to 1-step save/log intervals and extra prints.
DEBUG = None
DATA_NUM = 128 if DEBUG else None
NUM_BEAMS = 3
RETURN_TOP_K = 1

# setting for the tokenizer
MAX_INPUT_LENGTH = 100
MAX_TARGET_LENGTH = 100

In [4]:
# Training / evaluation / generation settings for the custom trainer.
# A truthy DEBUG shortens the save and logging intervals to a single step.
args = CustomTrainingArguments(
    OUTPUT_DIR,
    # what to run
    do_train=True,
    do_eval=True,
    # evaluation / checkpoint / logging cadence
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=1 if DEBUG else 5000,
    logging_steps=1 if DEBUG else 500,
    save_total_limit=3,
    # optimisation
    optim='adamw_torch',
    learning_rate=5e-6,
    weight_decay=0.0,
    warmup_steps=1000,
    num_train_epochs=3,  # the DEBUG and non-DEBUG values were both 3
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    fp16=True,
    # generation during evaluation
    predict_with_generate=True,
    generation_num_beams=NUM_BEAMS or None,
    generation_max_length=MAX_TARGET_LENGTH,
    num_return_sequences=RETURN_TOP_K,
    # (metric name, flag) pairs forwarded to the custom trainer
    metrics_to_check=[('eval_bleu_em', True)],
)

## load dataset

In [5]:
def get_dataset_path(root=DATASET_PATH, exp=EXPERIMENT):
    """Load `processed.csv` from `root` and build the per-split DataFrames for `exp`.

    Every row's `source` text is prefixed with an instruction string selected by the
    row's `granularity` value ("channel", "function" or "field"). The `granularity`
    column is then dropped and the rows are grouped by their `split` value.

    Args:
        root: directory that contains `processed.csv`.
        exp: experiment name; only "merged-prefix-ch-fc-field" is implemented here.

    Returns:
        dict mapping 'train', 'val', 'gold' and 'noisy' to DataFrame copies.

    Raises:
        ValueError: if `exp` is not an implemented experiment (this previously surfaced
            as an UnboundLocalError on the return statement).
    """
    if exp != "merged-prefix-ch-fc-field":
        raise ValueError(f"Unsupported experiment for dataset loading: {exp!r}")

    # Insertion order matters: rows are concatenated channel -> function -> field.
    prefixes = {
        "channel": "GENERATE CHANNEL ONLY WITHOUT FUNCTION <pf> ",
        "function": "GENERATE CHANNEL AND FUNCTION FOR BOTH TRIGGER AND ACTION <pf> ",
        "field": "GENERATE ON THE FIELD-LEVEL GRANULARITY <pf> ",
    }

    df = pd.read_csv(os.path.join(root, "processed.csv"))

    parts = []
    for granularity, prefix in prefixes.items():
        part = df[df["granularity"] == granularity].copy()
        # Bind `prefix` as a default so each lambda keeps its own value.
        part["source"] = part["source"].progress_apply(lambda text, prefix=prefix: prefix + text)
        parts.append(part)

    merged = pd.concat(parts).drop(columns=["granularity"])

    return {name: merged[merged["split"] == name].copy()
            for name in ('train', 'val', 'gold', 'noisy')}
df_dict = get_dataset_path()

  0%|          | 0/138714 [00:00<?, ?it/s]

  0%|          | 0/138714 [00:00<?, ?it/s]

  0%|          | 0/138714 [00:00<?, ?it/s]

In [6]:
# Preview the 'noisy' split to check that the instruction prefixes were prepended to `source`.
df_dict['noisy']

Unnamed: 0,source,split,target
138106,GENERATE CHANNEL ONLY WITHOUT FUNCTION <pf> ne...,noisy,Gmail <sep> Google_Calendar
138107,GENERATE CHANNEL ONLY WITHOUT FUNCTION <pf> ma...,noisy,Weather_Underground <sep> Gmail
138108,GENERATE CHANNEL ONLY WITHOUT FUNCTION <pf> te...,noisy,Weather_Underground <sep> SMS
138109,GENERATE CHANNEL ONLY WITHOUT FUNCTION <pf> te...,noisy,Weather_Underground <sep> SMS
138110,GENERATE CHANNEL ONLY WITHOUT FUNCTION <pf> if...,noisy,Weather_Underground <sep> Gmail
...,...,...,...
416137,GENERATE ON THE FIELD-LEVEL GRANULARITY <pf> w...,noisy,RSS_Feed <sep> RSS_Feed.New_feed_item <sep> Fe...
416138,GENERATE ON THE FIELD-LEVEL GRANULARITY <pf> n...,noisy,iOS_Contacts <sep> iOS_Contacts.Any_new_contac...
416139,GENERATE ON THE FIELD-LEVEL GRANULARITY <pf> n...,noisy,RSS_Feed <sep> RSS_Feed.New_feed_item <sep> Fe...
416140,GENERATE ON THE FIELD-LEVEL GRANULARITY <pf> n...,noisy,RSS_Feed <sep> RSS_Feed.New_feed_item <sep> Fe...


In [7]:
# def load_dataset(path_dict=path_dict, number=None):
#     assert(type(path_dict)==dict)
#     df_dict = {}
#     for split, path in path_dict.items():
#         if number:
#             df_dict[split] = pd.read_pickle(path).sample(n=number, random_state=1234).copy()
#         else:
#             df_dict[split] = pd.read_pickle(path)
#     return df_dict

# if DATA_NUM:
#     df_dict = load_dataset(number=DATA_NUM)
# else:
#     df_dict = load_dataset()

# df_dict['train'].info()

In [8]:
# df_dict['train'].head(3)

In [9]:
def convert_to_dataset(df_dict=df_dict):
    """Convert the per-split pandas DataFrames into a `DatasetDict`.

    Args:
        df_dict: mapping with the keys 'train', 'val', 'gold' and 'noisy', each holding
            a DataFrame that has a `split` column.

    Returns:
        DatasetDict with the same four keys; the `split` column and the pandas index
        are not carried over.

    Raises:
        KeyError: if one of the four expected splits is missing from `df_dict`.
    """
    # `preserve_index=False` keeps the pandas index out of the dataset. The previous
    # code removed '__index_level_0__' afterwards, which fails when the frame has a
    # default RangeIndex and that column is never created.
    return DatasetDict({
        name: Dataset.from_pandas(df_dict[name], preserve_index=False).remove_columns(['split'])
        for name in ('train', 'val', 'gold', 'noisy')
    })

dataset = convert_to_dataset()

if DEBUG:
    print(dataset.column_names)
    print([dataset['train'][0]])

## load tokenizer

In [10]:
def load_tokenizer(model=MODEL):
    """Return the `RobertaTokenizer` used for this run.

    When LOAD_FROM_CKPT is set, the tokenizer stored under `ckpt` is returned and
    `model` is ignored. Otherwise `model` picks the pretrained vocabulary:
    'roberta' -> "roberta-base", 'codebert' -> "microsoft/codebert-base".

    Raises:
        ValueError: if no checkpoint is used and `model` is neither of the two names.
    """
    if LOAD_FROM_CKPT:
        return RobertaTokenizer.from_pretrained(ckpt)

    if model == 'roberta':
        pretrained_name = "roberta-base"
    elif model == 'codebert':
        pretrained_name = "microsoft/codebert-base"
    else:
        raise ValueError("Undefined model type")
    return RobertaTokenizer.from_pretrained(pretrained_name)

tokenizer = load_tokenizer()

In [11]:
def preprocess_function(examples):
    """Tokenize one batch of examples for seq2seq training.

    `examples["source"]` is encoded (truncated to MAX_INPUT_LENGTH, no padding) and
    returned as the model inputs. `examples["target"]` is encoded inside the
    tokenizer's target-tokenizer context (truncated to MAX_TARGET_LENGTH, no padding)
    and its `input_ids` are attached under the "labels" key.
    """
    source_texts = list(examples["source"])
    target_texts = list(examples["target"])

    encoded = tokenizer(source_texts, max_length=MAX_INPUT_LENGTH, truncation=True, padding=False)

    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(target_texts, max_length=MAX_TARGET_LENGTH, truncation=True, padding=False)

    encoded["labels"] = target_encoding["input_ids"]
    return encoded


# Tokenize every split in batches. The columns present in the 'train' split are removed,
# so only the fields returned by preprocess_function remain.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# Debug aid: print the token ids of the first 8 training examples.
if DEBUG:
    for item in tokenized_datasets['train'][:8]['input_ids']:
        print(item)

  0%|          | 0/410 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

## load model

In [12]:
# Either resume from the checkpoint in `ckpt`, or build a new encoder-decoder from
# "roberta-base" with `random_decoder=True` passed to the custom loader.
if LOAD_FROM_CKPT:
    model = EncoderDecoderModel.from_pretrained(ckpt)
    print(f"Loading from {ckpt}")
else:
    model = CustomEncoderDecoderModel.from_encoder_decoder_pretrained(
        "roberta-base", "roberta-base", random_decoder=True, model_dict=DECODER_CLASSES)
    print("Loading not from checkpoint")

# Generation-related settings taken from the tokenizer and the decoder config.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size
# Hugging Face configs store `architectures` as a list of class names, not a bare string.
model.config.architectures = ["EncoderDecoderModel"]
# Same value as before (100), now tied to the shared target-length constant.
model.config.max_length = MAX_TARGET_LENGTH

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading not from checkpoint


## data collator

In [13]:
# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
if DEBUG:
    # Collate training examples 1 and 2 to inspect the batch layout.
    batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
    # A bare `batch.keys()` inside an `if` block displays nothing, so print it explicitly.
    print(batch.keys())
    print(batch["labels"])

# metric

In [14]:
# Metric objects used by compute_metrics: sacreBLEU and exact match, loaded via `load_metric`.
bleu = load_metric("sacrebleu")
em = load_metric("exact_match")

def compute_metrics(eval_preds):
    """Score generated sequences against the reference labels.

    Args:
        eval_preds: `(predictions, labels)` pair of token-id arrays; `predictions` may
            also be a tuple whose first element holds the token ids.

    Returns:
        dict with "bleu" (sacreBLEU score), "em" (exact-match score) and "bleu_em"
        (the mean of the two).
    """
    predictions, label_ids = eval_preds
    # The model may return extra outputs next to the predictions; keep the first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pred_texts = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # -100 marks ignored label positions and cannot be decoded; swap in the pad id.
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Keep only the text after the last "<pf>" marker, without surrounding whitespace.
    hypotheses = [text.split("<pf>")[-1].strip() for text in pred_texts]
    targets = [text.split("<pf>")[-1].strip() for text in label_texts]

    # sacreBLEU takes a list of references per prediction; exact match takes one string.
    bleu_score = bleu.compute(predictions=hypotheses,
                              references=[[target] for target in targets])["score"]
    em_score = em.compute(predictions=hypotheses, references=targets)["exact_match"]

    return {"bleu": bleu_score,
            "em": em_score,
            "bleu_em": (bleu_score + em_score) / 2}

# custom trainer

In [15]:
# Build the trainer: trains on the 'train' split, evaluates on 'val' by default, and
# scores generations with compute_metrics.
trainer = CustomTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [16]:
def evaluate_trainer(output_dir, split=None, suffix=None):
    """Evaluate the global `trainer` on one tokenized split and log the metrics to a file.

    The metrics dict is printed and appended as one JSON line to
    `<output_dir>/metrics_<split>.txt`, or `metrics_<split>_<suffix>.txt` when a
    suffix is given.

    Args:
        output_dir: directory for the metrics file; created if it does not exist.
        split: key of `tokenized_datasets` to evaluate. Required despite the None default.
        suffix: optional tag appended to the metrics file name.

    Returns:
        The metrics dict returned by `trainer.evaluate`.

    Raises:
        AssertionError: if `split` is not a key of `tokenized_datasets`.
    """
    assert split in tokenized_datasets, f"Unknown split: {split!r}"

    # Create the directory up front, so a bad path fails before the (slow) evaluation
    # pass instead of after it, when the results would be lost.
    os.makedirs(output_dir, exist_ok=True)

    res = trainer.evaluate(eval_dataset=tokenized_datasets[split])
    print(res)

    stem = f"metrics_{split}" if suffix is None else f"metrics_{split}_{suffix}"
    filename = os.path.join(output_dir, f"{stem}.txt")
    # Append mode: repeated evaluations accumulate one JSON object per line.
    with open(filename, "a") as f:
        json.dump(res, f)
        f.write("\n")
    return res

In [56]:
# res = evaluate_trainer(output_dir=f"{ckpt}",
                       # split="val")

In [17]:
# Start training with the trainer configured above.
trainer.train()



Step,Training Loss,Validation Loss,Bleu,Em,Bleu Em
500,7.315,3.639773,15.265869,0.0,7.632935
1000,2.9986,1.724522,20.12176,2.65215,11.386955
1500,1.6252,0.941548,52.124041,15.04746,33.58575


Saving model checkpoint to models/rob2rand_merged_w_prefix_2-6-22/best_eval_bleu_em
Configuration saved in models/rob2rand_merged_w_prefix_2-6-22/best_eval_bleu_em/config.json
Model weights saved in models/rob2rand_merged_w_prefix_2-6-22/best_eval_bleu_em/pytorch_model.bin
tokenizer config file saved in models/rob2rand_merged_w_prefix_2-6-22/best_eval_bleu_em/tokenizer_config.json
Special tokens file saved in models/rob2rand_merged_w_prefix_2-6-22/best_eval_bleu_em/special_tokens_map.json
Saving model checkpoint to models/rob2rand_merged_w_prefix_2-6-22/best_eval_bleu_em
Configuration saved in models/rob2rand_merged_w_prefix_2-6-22/best_eval_bleu_em/config.json
Model weights saved in models/rob2rand_merged_w_prefix_2-6-22/best_eval_bleu_em/pytorch_model.bin
tokenizer config file saved in models/rob2rand_merged_w_prefix_2-6-22/best_eval_bleu_em/tokenizer_config.json
Special tokens file saved in models/rob2rand_merged_w_prefix_2-6-22/best_eval_bleu_em/special_tokens_map.json
Saving model

KeyboardInterrupt: 

In [51]:
# Evaluate the 'val' split; metrics are appended to metrics_val_fc.txt under `ckpt`.
res = evaluate_trainer(split='val',
                       output_dir=ckpt,
                       suffix="fc")



{'eval_loss': 0.1106535941362381, 'eval_bleu': 80.44846791315841, 'eval_em': 48.80829015544042, 'eval_bleu_em': 64.62837903429941, 'eval_runtime': 77.5655, 'eval_samples_per_second': 37.323, 'eval_steps_per_second': 0.593}


In [52]:
# Evaluate the 'gold' split; metrics are appended to metrics_gold_fc.txt under `ckpt`.
res = evaluate_trainer(split='gold',
                       output_dir=ckpt,
                       suffix="fc")

{'eval_loss': 0.009913727641105652, 'eval_bleu': 97.71500511168604, 'eval_em': 90.49180327868852, 'eval_bleu_em': 94.10340419518728, 'eval_runtime': 8.2387, 'eval_samples_per_second': 37.02, 'eval_steps_per_second': 0.607}


In [53]:
# Evaluate the 'noisy' split; metrics are appended to metrics_noisy_fc.txt under `ckpt`.
res = evaluate_trainer(split='noisy',
                       output_dir=ckpt,
                       suffix="fc")

{'eval_loss': 0.09393603354692459, 'eval_bleu': 84.45299770854542, 'eval_em': 52.015604681404426, 'eval_bleu_em': 68.23430119497493, 'eval_runtime': 21.0016, 'eval_samples_per_second': 36.616, 'eval_steps_per_second': 0.619}


# push model to the hub

In [54]:
from huggingface_hub import notebook_login

In [55]:
# Log in to the Hugging Face Hub from within the notebook (renders a login widget).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [56]:
# Point the trainer at a new output directory before pushing to the Hub.
trainer.args.output_dir = "rob2rand_chen_w_prefix_c_fc"

In [59]:
# Display the configuration of the model held by the trainer.
trainer.model.config

EncoderDecoderConfig {
  "_name_or_path": "models/rob2rand_chen_w_prefix_26-5-22/checkpoint-70000",
  "architectures": "EncoderDecoderModel",
  "decoder": {
    "_name_or_path": "roberta-base",
    "add_cross_attention": true,
    "architectures": [
      "RobertaForMaskedLM"
    ],
    "attention_probs_dropout_prob": 0.1,
    "bad_words_ids": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "classifier_dropout": null,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 2,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_range": 0.02,
    "intermedia

In [60]:
# Upload the model and tokenizer files from the trainer's output directory to the Hub.
trainer.push_to_hub()

Cloning https://huggingface.co/imamnurby/rob2rand_chen_w_prefix_c_fc into local empty directory.
Saving model checkpoint to rob2rand_chen_w_prefix_c_fc
Configuration saved in rob2rand_chen_w_prefix_c_fc/config.json
Model weights saved in rob2rand_chen_w_prefix_c_fc/pytorch_model.bin
tokenizer config file saved in rob2rand_chen_w_prefix_c_fc/tokenizer_config.json
Special tokens file saved in rob2rand_chen_w_prefix_c_fc/special_tokens_map.json


HBox(children=(HTML(value='Upload file pytorch_model.bin'), FloatProgress(value=0.0, max=1111040466.0), HTML(v…

HBox(children=(HTML(value='Upload file training_args.bin'), FloatProgress(value=0.0, max=3247.0), HTML(value='…

remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/imamnurby/rob2rand_chen_w_prefix_c_fc
   c8341d3..aa930dc  main -> main







Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}}
remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/imamnurby/rob2rand_chen_w_prefix_c_fc
   aa930dc..5910bc6  main -> main



'https://huggingface.co/imamnurby/rob2rand_chen_w_prefix_c_fc/commit/aa930dc20280e00ff37eaaca6bd41a3b24b1417d'