In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0' # nvidia-smi로 비어있는 gpu 확인하고 여기서 선택할것!

In [2]:
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

import json

import torch
from torch.utils.data import Dataset, DataLoader

import numpy as np

from tqdm import tqdm

import re

from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
MODEL_CHECKPOINT = "facebook/bart-large"
MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 512

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [8]:
print(tokenizer(text_target=["Hello, this one sentence!", "This is another sentence."]))

{'input_ids': [[0, 31414, 6, 42, 65, 3645, 328, 2], [0, 713, 16, 277, 3645, 4, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}


In [9]:
train_lists = []
with open("./data/train_10.jsonl") as f:
    for json_line in f:
        json_file = json.loads(json_line)
        data_list = []
        if len(json_file["previous_dialogs"]) > 1:
            print("many history")

        prev_dic_list=json_file["previous_dialogs"][0]["dialog"]
        for i,dic in enumerate(prev_dic_list):
            if i%2==0:
                data_list.append("Speaker 1: "+dic["text"])
            else:
                data_list.append("Speaker 2: "+dic["text"])

        curr_dic_list = json_file["dialog"]
        for i,dic in enumerate(curr_dic_list):
            data_list.append(dic["id"]+" "+dic["text"])
    train_lists.append(data_list)


In [10]:
val_lists = []
with open("./data/val_10.jsonl") as f:
    for json_line in f:
        json_file = json.loads(json_line)
        data_list = []
        if len(json_file["previous_dialogs"]) > 1:
            print("many history")

        prev_dic_list=json_file["previous_dialogs"][0]["dialog"]
        for i,dic in enumerate(prev_dic_list):
            if i%2==0:
                data_list.append("Speaker 1: "+dic["text"])
            else:
                data_list.append("Speaker 2: "+dic["text"])

        curr_dic_list = json_file["dialog"]
        for i,dic in enumerate(curr_dic_list):
            data_list.append(dic["id"]+" "+dic["text"])
    val_lists.append(data_list)


In [11]:
test_lists = []
with open("./data/test_10.jsonl") as f:
    for json_line in f:
        json_file = json.loads(json_line)
        data_list = []
        if len(json_file["previous_dialogs"]) > 1:
            print("many history")

        prev_dic_list=json_file["previous_dialogs"][0]["dialog"]
        for i,dic in enumerate(prev_dic_list):
            if i%2==0:
                data_list.append("Speaker 1: "+dic["text"])
            else:
                data_list.append("Speaker 2: "+dic["text"])

        curr_dic_list = json_file["dialog"]
        for i,dic in enumerate(curr_dic_list):
            data_list.append(dic["id"]+" "+dic["text"])
    test_lists.append(data_list)

In [12]:
class bf_Dataset(Dataset):
    def __init__(self, dataset,  tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.inputs = []
        self.outputs = []
        for dialog in self.dataset:
            for turn_number in range(1,len(dialog)):
                input_string = ""
                for input_turns in dialog[:turn_number]:
                    input_string += input_turns.strip() + " "
                self.inputs.append(input_string)
                self.outputs.append(dialog[turn_number].strip())
        
    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        embeddings = self.tokenizer(self.inputs[idx], max_length=MAX_INPUT_LENGTH, truncation=True)
        embeddings['labels'] = self.tokenizer(self.outputs[idx], max_length=MAX_TARGET_LENGTH, truncation=True, return_attention_mask=False)["input_ids"]
        return embeddings

In [13]:
train_ds = bf_Dataset(dataset=train_lists, tokenizer=tokenizer)
val_ds = bf_Dataset(dataset=val_lists , tokenizer=tokenizer)
test_ds = bf_Dataset(dataset=test_lists , tokenizer=tokenizer)

In [14]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [15]:
model_name = MODEL_CHECKPOINT.split("/")[-1]

In [14]:
batch_size = 8
epochs = 600


args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-msc",
    evaluation_strategy = "steps",
    save_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    load_best_model_at_end=True,
    save_steps=80,
    eval_steps=80,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=epochs,
    predict_with_generate=True,
    fp16=True,
)

In [15]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [16]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [17]:
trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
80,No log,0.225304
160,No log,0.302731
240,No log,0.287855
320,No log,0.196573
400,No log,0.110708
480,No log,0.07225
560,0.275900,0.087158
640,0.275900,0.124891
720,0.275900,0.075046
800,0.275900,0.066771


TrainOutput(global_step=2400, training_loss=0.06646799753109614, metrics={'train_runtime': 1137.8178, 'train_samples_per_second': 13.183, 'train_steps_per_second': 2.109, 'total_flos': 1.5314303910764544e+16, 'train_loss': 0.06646799753109614, 'epoch': 600.0})

In [18]:
assert False

AssertionError: 

In [16]:
model = AutoModelForSeq2SeqLM.from_pretrained(f"{model_name}-finetuned-msc/checkpoint-2400")

model.eval()

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((102

In [17]:
filename = "./output.txt"
with open(filename,"w") as f:
    pass
pp = 0
pred_arr = []
ref_arr = []
with torch.no_grad():
    model.to(device)
    for i,embeddings in enumerate(test_ds):
        
        output = model.generate(torch.tensor([embeddings["input_ids"]]).to(device))[0]    
        pred = tokenizer.decode(output.cpu(), skip_special_tokens=True)  
        gt = tokenizer.decode(torch.tensor(embeddings["labels"]), skip_special_tokens=True)
    
        
        
        stringt = "input_ids"
        with open(filename,"a") as f:
            f.write(f"{i} 번째 문장\ninput_ids : \n{tokenizer.decode(embeddings[stringt],skip_special_tokens=True)}\nprediction: \n{pred}\ngt        : \n{gt}\n\n")




In [None]:
print(pp)