In [1]:
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
from transformers import (
    BartTokenizer,
    BartForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    AutoModelForSeq2SeqLM,
)
from peft import get_peft_model, LoraConfig, TaskType
import numpy as np
import sacrebleu

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_old = load_dataset("KomeijiForce/Text2Emoji")

ROW_NUMBER = 60000
VALIDATION_SIZE = (ROW_NUMBER * 20) // 100

In [3]:
sliced_train_dataset = dataset_old["train"].select(range(ROW_NUMBER))

dataset = dataset_old.copy()
dataset["train"] = sliced_train_dataset
dataset

{'train': Dataset({
     features: ['text', 'emoji', 'topic'],
     num_rows: 60000
 })}

In [4]:
dct = {"text": [], "emoji": [], "topic": []}
index = 0
with open("./data/gpt_translate_2.txt", 'r', encoding='utf-8') as file:
    for line in file:
        inx = line.rfind(",")
        text, emoji = line[:inx], line[inx+1:]
        dct["text"].append(text.strip())
        dct["emoji"].append(emoji.strip())
        dct['topic'].append("None")
        index += 1
        if index >= 5000:
          break

In [5]:
print(len(dct['text']))

5000


In [6]:
extension_data = Dataset.from_dict(dct)
extended_dataset = concatenate_datasets([extension_data, dataset['train']])
dataset['train'] = extended_dataset
dataset

{'train': Dataset({
     features: ['text', 'emoji', 'topic'],
     num_rows: 65000
 })}

In [7]:
NEW_ROW = dataset["train"].num_rows
VALIDATION = (NEW_ROW * 20) // 100

In [8]:
def transform_features(example):
    return {
        "output": example["emoji"],
        "input": example["text"]
    }

transformed_train = dataset["train"].map(transform_features, remove_columns=["topic", "emoji", 'text'])
train_test_split = transformed_train.shuffle(seed=42).train_test_split(test_size=VALIDATION)

final_data = DatasetDict({
    "train": train_test_split["train"].select(range(NEW_ROW - VALIDATION)),
    "validation": train_test_split["test"]
})

print(final_data)

DatasetDict({
    train: Dataset({
        features: ['output', 'input'],
        num_rows: 52000
    })
    validation: Dataset({
        features: ['output', 'input'],
        num_rows: 13000
    })
})


### Tokenizer training

In [9]:
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

In [10]:
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

model = get_peft_model(model, lora_config)

In [11]:
model.print_trainable_parameters()

trainable params: 1,179,648 || all params: 407,471,104 || trainable%: 0.2895


In [5]:
tokenizer_input = BartTokenizer.from_pretrained('facebook/bart-large')
tokenizer_output = BartTokenizer.from_pretrained('facebook/bart-large')



In [13]:
tokenizer_output.pad_token = tokenizer_input.pad_token

In [14]:
final_data = final_data.filter(lambda example: all(value is not None for value in example.values()))

Filter: 100%|██████████| 52000/52000 [00:00<00:00, 152227.61 examples/s]
Filter: 100%|██████████| 13000/13000 [00:00<00:00, 166516.37 examples/s]


In [15]:
def tokenize_dataset(sample):
    max_length = 64
    input = tokenizer_input(
        sample["input"], padding="max_length", max_length=max_length, truncation=True
    )
    label_tokens = [
        tokenizer_output.encode(
            " ".join(list(output_str)),
            padding="max_length",
            max_length=max_length,
            truncation=True,
        )
        for output_str in sample["output"]
    ]
    input["labels"] = label_tokens
    return input


In [16]:
shuffled_dataset = final_data.shuffle(seed=42)
tokenized_dataset = shuffled_dataset.map(tokenize_dataset, batched=True)

Map: 100%|██████████| 52000/52000 [00:08<00:00, 5936.49 examples/s]
Map: 100%|██████████| 13000/13000 [00:02<00:00, 5695.57 examples/s]


In [17]:
cleared_dataset = tokenized_dataset.map(lambda x: x, remove_columns=["output", 'input'])

Map: 100%|██████████| 52000/52000 [00:01<00:00, 42508.95 examples/s]
Map: 100%|██████████| 13000/13000 [00:00<00:00, 35213.32 examples/s]


In [18]:
cleared_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 52000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 13000
    })
})

In [19]:
len(cleared_dataset["train"][0]["input_ids"]), len(cleared_dataset["train"][0]["labels"])

(64, 64)

In [None]:
def compute_metrics_factory(tokenizer):
    """Return a metrics function for Seq2SeqTrainer."""

    def _postprocess(preds, labels):
        preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        return preds, labels

    def _compute(eval_preds):
        preds, labels = eval_preds
        preds, labels = _postprocess(preds, labels)

        seq_correct = np.sum([p == l for p, l in zip(preds, labels)])
        seq_acc = seq_correct / len(preds)

        tok_correct, tok_total = 0, 0
        for p, l in zip(preds, labels):
            p_toks, l_toks = list(p), list(l)
            tok_total += max(len(p_toks), len(l_toks))
            tok_correct += sum(pt == lt for pt, lt in zip(p_toks, l_toks))
        tok_acc = tok_correct / tok_total if tok_total else 0.0
        bleu = (
            sacrebleu.corpus_bleu(
                [" ".join(list(p)) for p in preds],
                [[" ".join(list(l)) for l in labels]],
                smooth_method="exp",
            ).score
            / 100.0
        )

        return {"seq_exact": seq_acc, "tok_acc": tok_acc, "bleu4": bleu}

    return _compute

In [3]:
save_directory = "./bart_finetuned_15k_samples"
device = "cuda"
model = AutoModelForSeq2SeqLM.from_pretrained(save_directory).to(device)

In [22]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    evaluation_strategy="steps",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    predict_with_generate=True,
    logging_steps=1000,
    save_steps=500,
    eval_steps=8000,
    warmup_steps=2000,
    overwrite_output_dir=True,
    save_total_limit=3,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=cleared_dataset["train"],
    eval_dataset=cleared_dataset["validation"],
    compute_metrics=compute_metrics_factory(tokenizer_output)
)



In [23]:
trainer.evaluate(cleared_dataset["validation"])

100%|██████████| 6500/6500 [25:12<00:00,  4.30it/s]


{'eval_loss': 4.837183475494385,
 'eval_model_preparation_time': 0.0064,
 'eval_seq_exact': 7.692307692307693e-05,
 'eval_tok_acc': 0.30691555795595377,
 'eval_bleu4': 0.03738506932347838,
 'eval_runtime': 1513.5918,
 'eval_samples_per_second': 8.589,
 'eval_steps_per_second': 4.294}

In [8]:
input_text = "Travelling around the world."
inputs = tokenizer_input(input_text, return_tensors="pt").to(device)
outputs = model.generate(
    inputs.input_ids.to(device), max_length=50, temperature=0.7, do_sample=True
)
outputs = outputs.cpu()
output_ids = outputs[0].tolist()

print("Input:")
print(input_text)
generated_text = tokenizer_output.decode(output_ids, skip_special_tokens=True)
generated_text = "".join([text.strip() for text in generated_text.split(" ")])
print("Generated Output:")
print(generated_text)

Input:
Travelling around the world.
Generated Output:
🚶‍♀️🌍🌆🌇
