In [1]:
import os
# Check which GPU is free with `nvidia-smi` and put its index here.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import transformers
from evaluate import load
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

import json

import torch
from torch.utils.data import Dataset, DataLoader

import numpy as np

from tqdm import tqdm

import re

In [3]:
# Use the GPU when PyTorch reports one as available, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Name of the metric handed to `evaluate.load`.
METRIC_NAME = "bleu"

# Pretrained checkpoint used for both the tokenizer and the seq2seq model.
MODEL_CHECKPOINT = "facebook/bart-large"

# Truncation limit (in tokens) applied when tokenizing the model inputs.
MAX_INPUT_LENGTH = 1024

# Truncation limit (in tokens) applied when tokenizing the target responses.
MAX_TARGET_LENGTH = 256

In [5]:
# Tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Metric object used for scoring, resolved by name through `evaluate.load`.
metric = load(METRIC_NAME)


In [6]:
# Sanity check: tokenize two sample sentences as targets (`text_target`) and
# print the resulting `input_ids` / `attention_mask` lists.
print(tokenizer(text_target=["Hello, this one sentence!", "This is another sentence."]))

{'input_ids': [[0, 31414, 6, 42, 65, 3645, 328, 2], [0, 713, 16, 277, 3645, 4, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}


In [7]:
# Look up the text of token id 15483; the recorded output shows it is '|',
# the character used to join "info" and "current" in the datasets below.
tokenizer.decode([15483])

'|'

In [8]:
# Reverse check: encode the '|' separator on its own. The recorded output
# shows the id 15483 wrapped by ids 0 and 2.
tokenizer.encode("|")

[0, 15483, 2]

In [6]:
# Read the raw examples. The encoding is given explicitly so that decoding
# does not depend on the platform's default locale encoding.
with open("test.json", encoding="utf-8") as f:
    raw_datasets = json.load(f)

In [7]:
class bf_Dataset(Dataset):
    """Dataset over one slice of the raw examples, tokenized lazily per item.

    The raw example list is cut by position into three parts: the first 80%
    (``train_val_test == 0``), the next 10% (``train_val_test == 1``) and the
    remaining tail (any other value).

    Each example must provide the keys ``"info"``, ``"current"`` and
    ``"response"``. The model input is ``info + " | " + current`` and the
    target is ``response``.

    Args:
        raw_datasets: Sliceable sequence of example dicts. Defaults to the
            module-level ``raw_datasets`` bound when the class is defined.
        train_val_test: Split selector, see above.
        tokenizer: Callable tokenizer applied in ``__getitem__``.
    """

    # Forwarded to the tokenizer as ``return_tensors``. ``None`` is the
    # tokenizer's own default, so this class behaves as if the argument
    # were not passed at all. Subclasses override it to change the output type.
    return_tensors = None

    def __init__(self, raw_datasets = raw_datasets, train_val_test=0, tokenizer = None):
        n_total = len(raw_datasets)
        train_end = int(0.8 * n_total)
        val_end = int(0.9 * n_total)

        if train_val_test == 0:
            self.data = raw_datasets[:train_end]
        elif train_val_test == 1:
            self.data = raw_datasets[train_end:val_end]
        else:
            self.data = raw_datasets[val_end:]

        self.tokenizer = tokenizer
        self.inputs = [ex["info"] + " | " + ex["current"] for ex in self.data]
        self.outputs = [ex["response"] for ex in self.data]

    def __len__(self):
        """Return the number of examples in the selected split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and attach the target ids as ``labels``.

        Inputs are truncated to ``MAX_INPUT_LENGTH`` tokens and targets to
        ``MAX_TARGET_LENGTH`` tokens.
        """
        embeddings = self.tokenizer(
            self.inputs[idx],
            max_length=MAX_INPUT_LENGTH,
            truncation=True,
            return_tensors=self.return_tensors,
        )
        embeddings['labels'] = self.tokenizer(
            self.outputs[idx],
            max_length=MAX_TARGET_LENGTH,
            truncation=True,
            return_attention_mask=False,
            return_tensors=self.return_tensors,
        )["input_ids"]
        return embeddings


class test_Dataset(bf_Dataset):
    """Same data handling as ``bf_Dataset``, but items are PyTorch tensors.

    The tokenizer is called with ``return_tensors="pt"``. This was the only
    difference between the two previously copy-pasted classes.
    """

    return_tensors = "pt"

In [8]:
# Train (selector 0) and validation (selector 1) splits share the same class.
train_ds, val_ds = (
    bf_Dataset(raw_datasets=raw_datasets, train_val_test=split, tokenizer=tokenizer)
    for split in (0, 1)
)

# The held-out split (selector 2) uses the tensor-returning variant.
test_ds = test_Dataset(
    raw_datasets=raw_datasets,
    train_val_test=2,
    tokenizer=tokenizer,
)

In [9]:
# Load the pretrained sequence-to-sequence model named by MODEL_CHECKPOINT.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [10]:
# Keep only the part after the last "/" of the checkpoint id; it is used to
# build the output directory name.
model_name = MODEL_CHECKPOINT.rsplit("/", 1)[-1]

In [13]:
# Per-device batch size (used for both training and evaluation) and the
# number of passes over the training split.
batch_size = 16
epochs = 6

# Training configuration. The first positional argument is the output
# directory where checkpoints are written.
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-dailydialogue",
    # Evaluate and checkpoint on the same 800-step schedule so that
    # `load_best_model_at_end` always has a checkpoint for every evaluation.
    evaluation_strategy = "steps",
    save_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    load_best_model_at_end=True,
    save_steps=800,
    eval_steps=800,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=epochs,
    # Generate full sequences during evaluation so `compute_metrics` receives
    # token ids it can decode.
    predict_with_generate=True,
    # Mixed precision needs a CUDA device. Enabling it only when one is
    # available keeps this cell usable with the CPU fallback chosen for
    # `device`, instead of failing while the arguments are constructed.
    fp16=torch.cuda.is_available(),
)

In [14]:
# Batch collator handed to the trainer below; it receives both the tokenizer
# and the model. Presumably it pads inputs and labels per batch (see the
# DataCollatorForSeq2Seq documentation to confirm the exact behaviour).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [15]:

def compute_metrics(eval_pred):
    """Return the mean sentence-level 2-gram BLEU (scaled to 0-100).

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. The
            predictions may also arrive wrapped in a tuple, in which case the
            first element is used.

    Returns:
        ``{"bleu": score}`` where ``score`` is the average over all label
        rows of the per-sentence BLEU (``max_order=2``) multiplied by 100.
        An empty evaluation set yields ``0.0``.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is an ignore/padding marker, not a vocabulary id. Swap it for the
    # pad token on BOTH arrays so decoding never sees a negative id.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    if not decoded_labels:
        return {"bleu": 0.0}

    total = 0.0
    for pred, ref in zip(decoded_preds, decoded_labels):
        # An empty hypothesis or reference makes the BLEU length ratio divide
        # by zero; such a pair simply contributes a score of 0.
        if not pred.strip() or not ref.strip():
            continue
        result = metric.compute(predictions=[pred], references=[[ref]], max_order = 2)
        total += result["bleu"]

    return {"bleu": total / len(decoded_labels) * 100}

In [16]:
# Wire model, arguments, data splits, collator and metric callback together.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [17]:
# Run fine-tuning. The recorded log below shows an evaluation row every
# 800 steps with training loss, validation loss and the BLEU score.
trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Bleu
800,2.2194,1.969312,22.118048
1600,1.8735,1.921019,21.940068
2400,1.7144,1.931875,22.712701
3200,1.5569,2.017502,22.828269
4000,1.4052,1.939788,23.114262


TrainOutput(global_step=4656, training_loss=1.6838335187984086, metrics={'train_runtime': 1930.9203, 'train_samples_per_second': 38.565, 'train_steps_per_second': 2.411, 'total_flos': 3.474168347118797e+16, 'train_loss': 1.6838335187984086, 'epoch': 6.0})

In [18]:
# Deliberate hard stop. An explicit raise carries a reason and, unlike a bare
# `assert False`, is not stripped when Python runs with -O.
# NOTE(review): presumably this keeps "Run All" from continuing into the
# checkpoint-evaluation cells right after training; confirm the intent.
raise AssertionError("Stopping here on purpose; the cells below evaluate a saved checkpoint.")

AssertionError: 

In [11]:
# Replace `model` with the weights saved at step 4000 of the fine-tuning run.
model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{model_name}-finetuned-dailydialogue/checkpoint-4000"
)

# Switch to evaluation mode; as the last expression this also displays the
# module structure.
model.eval()


BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((102

In [12]:
# Generate a response for every test example, log input / prediction /
# ground truth to a text file, and report the mean sentence-level 2-gram
# BLEU (scaled to 0-100) in `pp`.
filename = "./output.txt"

pp = 0

model.to(device)

# The log is opened once, in write mode (which truncates any previous run),
# instead of being reopened in append mode for every example. UTF-8 is set
# explicitly because the log lines contain non-ASCII text.
with torch.no_grad(), open(filename, "w", encoding="utf-8") as f:
    for i, embeddings in tqdm(enumerate(test_ds), total=len(test_ds)):
        # Each item already has a leading batch dimension of size 1, so the
        # first (only) generated sequence is taken.
        output = model.generate(embeddings["input_ids"].to(device))[0]

        pred = tokenizer.decode(output.cpu(), skip_special_tokens=True)
        gt = tokenizer.decode(embeddings["labels"][0], skip_special_tokens=True)
        source_text = tokenizer.decode(embeddings["input_ids"][0], skip_special_tokens=True)

        f.write(f"{i} 번째 문장\ninput_ids : \n{source_text}\nprediction: \n{pred}\ngt        : \n{gt}\n\n")

        # An empty hypothesis or reference makes the BLEU length ratio divide
        # by zero; such a pair simply contributes a score of 0.
        if pred.strip() and gt.strip():
            pp += metric.compute(predictions=[pred], references=[[gt]], max_order = 2)["bleu"]

# Average over the whole split; an empty split yields 0.0 instead of raising.
pp = pp / len(test_ds) * 100 if len(test_ds) else 0.0
print("Bleu 2-gram : ",pp)

        

1552it [04:54,  5.27it/s]

Bleu 2-gram :  4.7228476029966355





In [13]:
# Show the averaged BLEU value computed by the evaluation loop again.
print(pp)

4.7228476029966355


# NOTE(review): stray line outside the evaluation loop. It scores `pred` and
# `gt` after dropping everything up to the first ":" in each, and adds the
# result to `pp`. Presumably a leftover alternative for the `pp +=` line inside
# the loop; run on its own it would add a raw BLEU fraction to the already
# averaged percentage. Confirm whether it should be removed.
pp += metric.compute(predictions=["".join(pred.split(":")[1:])], references=[["".join(gt.split(":")[1:])]], max_order = 2)["bleu"]