In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '4' # nvidia-smi로 비어있는 gpu 확인하고 여기서 선택할것!

In [2]:
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

import json

import torch
from torch.utils.data import Dataset, DataLoader

import numpy as np

from tqdm import tqdm

import re

from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
MODEL_CHECKPOINT = "facebook/bart-base"
MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 512

In [5]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [6]:
print(tokenizer(text_target=["Hello, this one sentence!", "This is another sentence."]))

{'input_ids': [[0, 31414, 6, 42, 65, 3645, 328, 2], [0, 713, 16, 277, 3645, 4, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}


In [7]:
train_lists = []
label_lists = []
with open("../bigdata/utterance_persona_dpr.jsonl") as f:
    for json_line in f:
        json_file = json.loads(json_line)
        prev_strs = ""
        dic_list=json_file["extracted_persona_list"]
        for i,dic in enumerate(dic_list):
            prev_strs += dic["utterance"] + "\n" 
            if i!=0 and i!=len(dic_list)-1:
                train_lists.append("History :\n"+prev_strs + "Persona :\n" + dic["3persona"][0][0] + f"\nResponse :\nspeaker {(i+1)%2+1}: ") #[0][1]이 점수
            if i!=0 and i!=1:
                label_lists.append(' '.join(dic["utterance"].split()[2:]))
            
            

In [8]:
lengdata=len(train_lists)

In [9]:
valid_lists = train_lists[int(lengdata*0.7):int(lengdata*0.9)]
valid_label_lists = label_lists[int(lengdata*0.7):int(lengdata*0.9)]

test_lists = train_lists[int(lengdata*0.9):]
test_label_lists = label_lists[int(lengdata*0.9):]

train_lists = train_lists[:int(lengdata*0.7)]
train_label_lists = label_lists[:int(lengdata*0.7)]

In [10]:
class bf_Dataset(Dataset):
    def __init__(self, dataset, labelset, tokenizer):
        self.tokenizer = tokenizer
        self.inputs = []
        self.outputs = []
        for data,label in zip(dataset,labelset):
                self.inputs.append(data)
                self.outputs.append(label)
        
    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        embeddings = self.tokenizer(self.inputs[idx], max_length=MAX_INPUT_LENGTH, truncation=True)
        embeddings['labels'] = self.tokenizer(self.outputs[idx], max_length=MAX_TARGET_LENGTH, truncation=True, return_attention_mask=False)["input_ids"]
        return embeddings

In [11]:
train_ds = bf_Dataset(dataset=train_lists, labelset = train_label_lists,tokenizer=tokenizer)
val_ds = bf_Dataset(dataset=valid_lists ,labelset = valid_label_lists ,tokenizer=tokenizer)
test_ds = bf_Dataset(dataset=test_lists ,labelset = test_label_lists ,tokenizer=tokenizer)

In [12]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [13]:
model_name = MODEL_CHECKPOINT.split("/")[-1]

In [14]:
batch_size = 16
epochs = 40


args = Seq2SeqTrainingArguments(
    f"./bigdata/{model_name}-finetuned-msc_2-perplexity",
    evaluation_strategy = "steps",
    save_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    load_best_model_at_end=True,
    save_steps=1600,
    eval_steps=1600,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=epochs,
    predict_with_generate=True,
    fp16=True,
)

In [15]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [16]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [17]:
trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
1600,3.1174,2.855793
3200,2.9623,2.823014
4800,2.8568,2.811067
6400,2.7571,2.80496
8000,2.6706,2.817553
9600,2.596,2.834256
11200,2.5268,2.818084
12800,2.4464,2.852538
14400,2.3803,2.875482
16000,2.3216,2.88935


KeyboardInterrupt: 

In [18]:
assert False

AssertionError: 

In [20]:
model = AutoModelForSeq2SeqLM.from_pretrained(f"./bigdata/{model_name}-finetuned-msc_2-perplexity/checkpoint-33600")

model.eval()

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05,

In [21]:
from rouge import Rouge
rouge = Rouge()

filename = "./output_persona.txt"
with open(filename,"w") as f:
    pass
pp = 0
pred_arr = []
ref_arr = []
with torch.no_grad():
    model.to(device)
    for i,embeddings in enumerate(test_ds):
        
        output = model.generate(torch.tensor([embeddings["input_ids"]]).to(device))[0]    
        pred = tokenizer.decode(output.cpu(), skip_special_tokens=True)  
        gt = tokenizer.decode(torch.tensor(embeddings["labels"]), skip_special_tokens=True)
    
        
        
        stringt = "input_ids"
        with open(filename,"a") as f:
            f.write(f"{i} 번째 문장\ninput_ids : \n{tokenizer.decode(embeddings[stringt],skip_special_tokens=True)}\nprediction: \n{pred}\ngt        : \n{gt}\n\n")
        pp += rouge.get_scores(pred,gt,avg=True)['rouge-1']['f']


In [22]:
pp /= len(test_ds)
print(pp)

0.26489487094005554


In [21]:
import evaluate
perplexity = evaluate.load("perplexity", module_type = "metric")
input_texts = ["hello"]

results = perplexity.compute(model_id=f"../bigdata/{model_name}-finetuned-msc/checkpoint-20800",
                             add_start_token=False,
                             predictions=input_texts)

print(results)

# pp = 0
# pred_arr = []
# ref_arr = []
# with torch.no_grad():
#     model.to(device)
#     for i,embeddings in enumerate(val_ds):
        
#         output = model.generate(torch.tensor([embeddings["input_ids"]]).to(device))[0]    
#         pred = tokenizer.decode(output.cpu(), skip_special_tokens=True)  
#         gt = tokenizer.decode(torch.tensor(embeddings["labels"]), skip_special_tokens=True)
        
#         pp += rouge.get_scores(pred,gt,avg=True)['rouge-1']['f']


Some weights of the model checkpoint at ../bigdata/bart-base-finetuned-msc/checkpoint-20800 were not used when initializing BartForCausalLM: ['model.encoder.layers.0.fc2.bias', 'model.encoder.layers.2.self_attn.v_proj.weight', 'model.encoder.layers.5.self_attn.q_proj.bias', 'model.encoder.layers.5.self_attn.out_proj.bias', 'model.encoder.layers.4.fc1.weight', 'model.encoder.layers.1.fc1.weight', 'model.encoder.layers.4.fc2.bias', 'model.encoder.layers.5.final_layer_norm.bias', 'model.encoder.layers.1.final_layer_norm.bias', 'model.encoder.layers.4.final_layer_norm.bias', 'model.encoder.layers.2.self_attn.out_proj.weight', 'model.encoder.layers.4.self_attn_layer_norm.weight', 'model.encoder.layers.1.fc2.bias', 'model.encoder.layers.1.self_attn.k_proj.weight', 'model.encoder.layers.1.self_attn.q_proj.bias', 'model.encoder.layers.0.fc1.weight', 'model.encoder.layers.1.self_attn.v_proj.bias', 'model.encoder.layers.3.fc2.weight', 'model.encoder.layers.0.self_attn.v_proj.weight', 'model.enco

AssertionError: When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings.

In [15]:
print(results)

{'perplexities': [24909.056640625], 'mean_perplexity': 24909.056640625}


In [19]:
print(type(valid_lists))
print(valid_lists[0])

<class 'list'>
speaker 2: thanks! i'll take a look. what makes it so special? speaker 1: well, it's the marinade i use. it's bourbon, soy sauce, and brown sugar. so tasty! speaker 2: i could really go for a steak tonight with a nice beer on the side. i'll probably cook mine a bit more than yours though. speaker 1: i like steak to be medium rare. what is your preference? speaker 2: lol. we just had an argument about this last week. i'm a well done kind of guy. don't want to worry about my steak running away  Speaker 1 prefers medium rare meat


In [22]:
results = perplexity.compute(model_id=f"../bigdata/{model_name}-finetuned-msc/checkpoint-20800",
                             #add_start_token=False,
                             predictions=valid_lists[:10])


Some weights of the model checkpoint at ../bigdata/bart-base-finetuned-msc/checkpoint-20800 were not used when initializing BartForCausalLM: ['model.encoder.embed_tokens.weight', 'model.encoder.layers.4.self_attn.v_proj.bias', 'model.encoder.layers.2.self_attn.q_proj.bias', 'model.encoder.layers.1.self_attn.k_proj.bias', 'model.encoder.layers.4.self_attn_layer_norm.bias', 'model.encoder.layers.1.self_attn.v_proj.weight', 'model.encoder.layers.0.self_attn.k_proj.weight', 'model.encoder.layers.2.final_layer_norm.bias', 'model.encoder.layers.3.self_attn.v_proj.bias', 'model.encoder.layers.5.final_layer_norm.weight', 'model.encoder.layers.4.final_layer_norm.weight', 'model.encoder.layers.4.self_attn.q_proj.bias', 'model.encoder.layers.3.self_attn_layer_norm.weight', 'model.encoder.layers.5.self_attn.v_proj.bias', 'model.encoder.layers.3.self_attn.q_proj.weight', 'model.encoder.layers.0.final_layer_norm.bias', 'model.encoder.layers.1.final_layer_norm.bias', 'model.encoder.layers.2.self_attn

In [23]:
print(results)

{'perplexities': [303787.4375, 341211.3125, 358140.65625, 437049.4375, 404786.71875, 358234.90625, 95590.4140625, 155967.09375, 112758.9921875, 107606.15625], 'mean_perplexity': 267513.3125}


In [31]:
results = perplexity.compute(model_id=f"gpt2",
                             #add_start_token=False,
                             predictions=["what is capital city of USA?"])

Using pad_token, but it is not set yet.
100%|██████████| 1/1 [00:00<00:00, 79.19it/s]


In [32]:
print(results)

{'perplexities': [222.55404663085938], 'mean_perplexity': 222.55404663085938}


In [22]:
import math
import statistics
losses = []
model.cuda()
for i, embeddings in enumerate(val_ds):
    with torch.no_grad():
        input_ids = torch.tensor([embeddings["input_ids"]]).to(device)
        labels = torch.tensor([embeddings["labels"]]).to(device)
        outputs = model(input_ids = input_ids, labels = labels)
        losses.append(outputs.loss.item())
ppl = math.exp(statistics.mean(losses))

In [23]:
print(ppl)

21.468618038575297
