In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1' # nvidia-smi로 비어있는 gpu 확인하고 여기서 선택할것!

In [3]:
import transformers
from evaluate import load
from transformers import AutoTokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorWithPadding,TrainingArguments, Trainer

import json

import torch
from torch.utils.data import Dataset, DataLoader

import numpy as np

from tqdm import tqdm

import re

from datasets import load_dataset


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
METRIC_NAME = "bleu"
MODEL_CHECKPOINT = "microsoft/DialoGPT-large"
MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 256

In [6]:
metric = load(METRIC_NAME)


tokenizer = GPT2Tokenizer.from_pretrained(MODEL_CHECKPOINT)


In [7]:
raw_datasets = load_dataset("daily_dialog")

Found cached dataset daily_dialog (/home/leadawon5/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
100%|██████████| 3/3 [00:00<00:00, 396.80it/s]


In [8]:
class bf_Dataset(Dataset):
    def __init__(self, raw_datasets = raw_datasets, train_val_test=0, tokenizer = None):
        if train_val_test == 0:
            self.data = raw_datasets['train']["dialog"]
        elif train_val_test == 1:
            self.data = raw_datasets['validation']["dialog"]
            
        else:
            self.data = raw_datasets['test']["dialog"]
        
        self.tokenizer = tokenizer
        self.inputs = []
        self.outputs = []
        for dg in self.data:
            for turns in range(len(dg)//2,len(dg)):
                input_string = ""
                for input_turns in dg[:turns]:
                    input_string += input_turns.strip() + " "
                self.inputs.append(input_string)
                self.outputs.append(dg[turns].strip())
        
    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        embeddings = self.tokenizer(self.inputs[idx], max_length=MAX_INPUT_LENGTH, truncation=True)
        embeddings['labels'] = self.tokenizer(self.outputs[idx], max_length=MAX_TARGET_LENGTH, truncation=True, return_attention_mask=False)["input_ids"]
        return embeddings

class test_Dataset(Dataset):
    def __init__(self, raw_datasets = raw_datasets, train_val_test=0, tokenizer = None):
        self.data = raw_datasets['test']["dialog"]
        
        self.tokenizer = tokenizer
        self.inputs = []
        self.outputs = []
        for dg in self.data:
            for turns in range(len(dg)//2,len(dg)):
                input_string = ""
                for input_turns in dg[:turns]:
                    input_string += input_turns.strip() + " "
                self.inputs.append(input_string)
                self.outputs.append(dg[turns].strip())
        
    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        embeddings = self.tokenizer(self.inputs[idx], max_length=MAX_INPUT_LENGTH, truncation=True, return_tensors="pt")
        embeddings['labels'] = self.tokenizer(self.outputs[idx], max_length=MAX_TARGET_LENGTH, truncation=True, return_attention_mask=False,return_tensors="pt")["input_ids"]
        return embeddings

In [9]:
train_ds = bf_Dataset(raw_datasets=raw_datasets, train_val_test=0, tokenizer=tokenizer)
val_ds = bf_Dataset(raw_datasets=raw_datasets, train_val_test=1, tokenizer=tokenizer)
test_ds = test_Dataset(raw_datasets=raw_datasets, train_val_test=2, tokenizer=tokenizer)

In [10]:
model = GPT2LMHeadModel.from_pretrained(MODEL_CHECKPOINT)

In [11]:
model_name = MODEL_CHECKPOINT.split("/")[-1]

In [13]:
batch_size = 8
epochs = 4


args = TrainingArguments(
    f"{model_name}-finetuned-dailydialogue-vanila",
    evaluation_strategy = "steps",
    save_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    load_best_model_at_end=True,
    save_steps=3000,
    eval_steps=3000,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=epochs,
    fp16=True,
)

In [19]:
tokenizer.pad_token = tokenizer.eos_token

In [20]:
data_collator = DataCollatorWithPadding(tokenizer)

In [16]:

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    

    
    rt = {"bleu":0}
    for i,v in enumerate(decoded_labels): 
        result = metric.compute(predictions=[decoded_preds[i]], references=[[v]], max_order = 2)
        rt["bleu"] += result["bleu"]
    rt["bleu"] = rt["bleu"] / len(decoded_labels) * 100
    return rt

In [21]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [22]:
trainer.train()

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [15]:
assert False

AssertionError: 

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(f"{model_name}-finetuned-dailydialogue-vanila/checkpoint-21000")

model.eval()


In [None]:
filename = "./output_dd.txt"
with open(filename,"w") as f:
    pass

pp = 0

with torch.no_grad():
    # suma = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
    model.to(device)
    for i,embeddings in enumerate(test_ds):
        output = model.generate(embeddings["input_ids"].to(device))[0]
        
        pred = tokenizer.decode(output.cpu(), skip_special_tokens=True)
        
        gt = tokenizer.decode(embeddings["labels"][0], skip_special_tokens=True)
        stringt = "input_ids"
        with open(filename,"a") as f:
            f.write(f"{i} 번째 문장\ninput_ids : \n{tokenizer.decode(embeddings[stringt][0],skip_special_tokens=True)}\nprediction: \n{pred}\ngt        : \n{gt}\n\n")

        pp += metric.compute(predictions=[pred], references=[[gt]], max_order = 2)["bleu"]
    pp = pp / len(test_ds) * 100
print("Bleu 2-gram : ",pp)

        

In [None]:
print(pp)