## load pretrained summary model

In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the project
# folder under MyDrive can be reached by the following cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/WARNING_PRIVATE_FOLDER/gpt2-dialogue-generation-pytorch/
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch==1.7.1
  Downloading torch-1.7.1-cp39-cp39-manylinux1_x86_64.whl (776.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.8/776.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.12.5
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==1.16.1
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.3/298.3 KB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece==0.1.96
  Downloading sentencepiece-0.1.96-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m


In [None]:
import json
import torch

# Preferred torch device string: first CUDA GPU when available, otherwise CPU.
# NOTE(review): not referenced by the cells visible in this section; Manager
# builds its own torch.device from args.gpu.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
from transformers import pipeline

# Dialogue summarization pipeline (BART fine-tuned on SAMSum).
# Without an explicit `device` the pipeline runs on CPU; pass the GPU ordinal
# when CUDA is available (-1 keeps it on CPU) so summarizing the whole test set
# is not CPU-bound.
summarizer = pipeline(
    "summarization",
    model="philschmid/bart-large-cnn-samsum",
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
conversation = '''Jeff: Can I train a Transformers model on Amazon SageMaker? 
Philipp: Sure you can use the new Hugging Face Deep Learning Container. 
Jeff: ok.
Jeff: and how can I get started? 
Jeff: where can I find documentation? 
Philipp: ok, ok you can find everything here.                                   
'''
# Smoke test: the pipeline returns a list with one dict per input; show the
# generated summary text of the sample conversation as the cell output.
summarizer(conversation)[0]["summary_text"]

## dataset devset test - bart

In [None]:
from datasets import *
from tqdm import tqdm


# Shared by all datasets.
# `space` is the one-character prefix that process_token_list strips from /
# re-adds to tokens (presumably the GPT-2 byte-level BPE leading-space marker
# -- confirm against the tokenizer).
space = 'Ġ'
# Typographic apostrophe (U+2019); not referenced by the functions in this cell.
pre_quote = '’'
# Tokens treated as sentence/clause terminators by process_token_list.
end_marks = ['.', ',', '?', '!', '...']
# Straight double and single quote; quotes[1] is the apostrophe.
quotes = ['"', '\'']
# Contraction suffixes that may follow an apostrophe (e.g. 's, 're, 'll).
abbreviations = ['s', 'd', 't', 'm', 're', 'll', 've', 'S', 'D', 'T', 'M', 'Re', 'Ll', 'Ve']

# For empathetic dialogues (not referenced by the code in this cell).
exclude_symbol = "_conv"
comma_symbol = "_comma_"

# For persona chat (not referenced by the code in this cell).
persona_chat_url = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
silence_symbol = "__ SILENCE __"


def load_daily():
    """Return the 'dialog' column of the DailyDialog test split.

    Each element is one dialogue (a sequence of utterance strings), exactly as
    stored by the `daily_dialog` dataset; no preprocessing is applied.
    """
    return load_dataset('daily_dialog')['test']['dialog']
    
    

def process_token_list(token_list):
    """Normalize spacing and capitalization of a tokenized utterance.

    The input list is modified in place and a filtered copy is returned.
    A token that starts with the module-level `space` marker is considered to
    be preceded by a blank. The function:
      * capitalizes the first token,
      * drops the marker in front of end marks and contraction suffixes,
      * attaches an apostrophe to a following contraction suffix,
      * removes the blank inside an opening quote and before a closing quote,
      * capitalizes the token that follows an end mark,
      * removes empty / marker-only tokens and appends '.' when the result
        does not already end with an end mark.

    Args:
        token_list: non-empty list of token strings.

    Returns:
        A new list with the cleaned tokens.
    """
    token_list[0] = token_list[0].capitalize()
    last_idx = len(token_list) - 1

    open_quotes = 0
    for pos, tok in enumerate(token_list):
        # `tok` is the value seen at the start of this iteration; writes to
        # token_list[pos] below do not change it.
        tail = tok[1:]
        has_next = pos < last_idx

        if space in tok:
            if tail in end_marks or tail in abbreviations:
                token_list[pos] = tail

            if tail == quotes[1] and has_next:
                follower = token_list[pos + 1]
                if follower in abbreviations or (follower[0] == space and follower[1:] in abbreviations):
                    token_list[pos] = tail

        if tok[0] == space and tail in quotes:
            if open_quotes % 2 == 1:
                # Closing quote: glue it to the preceding token.
                token_list[pos] = tail
                open_quotes = 0
            else:
                # Opening quote: glue the next token to the quote.
                if has_next and token_list[pos + 1][0] == space:
                    token_list[pos + 1] = token_list[pos + 1][1:]
                open_quotes += 1

        if (tok in end_marks or tail in end_marks) and has_next:
            # Re-read the follower: the quote handling above may have changed it.
            follower = token_list[pos + 1]
            if follower[0] != space:
                token_list[pos + 1] = space + follower.capitalize()
            else:
                token_list[pos + 1] = space + follower[1:].capitalize()

    kept = [tok for tok in token_list if tok != space and len(tok) > 0]
    if kept[-1] not in end_marks:
        kept.append(end_marks[0])

    return kept


In [None]:
# Load the DailyDialog test dialogues once; Manager consumes `test_arr` below.
test_arr = load_daily()
# Preview two dialogues instead of calling load_daily() a second time, which
# reloaded the dataset and echoed every dialogue into the cell output.
test_arr[:2]

In [None]:
import difflib
from transformers import GPT2Tokenizer, GPT2LMHeadModel, get_polynomial_decay_schedule_with_warmup

from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.nn import functional as F
from torch.utils.tensorboard import SummaryWriter
from itertools import chain

import torch
import os, sys
import numpy as np
import argparse
import copy
import math
import random

class Arguments:
    """Plain settings holder replacing the command-line arguments of the script.

    Every setting is an instance attribute so that Manager can read it and
    attach derived values (device, token ids, ...) to the same object.
    """

    def __init__(self):
        defaults = {
            "seed": 0,                    # seed passed to Manager.fix_seed
            "mode": "test",
            "data_dir": "data",
            "model_type": "gpt2",         # name given to from_pretrained
            "bos_token": "<bos>",
            "sp1_token": "<sp1>",
            "sp2_token": "<sp2>",
            "gpu": "0",                   # CUDA device ordinal
            "max_len": 1024,
            "max_turns": 5,
            "top_p": 0.8,                 # nucleus sampling threshold
            "ckpt_dir": "saved_models",
            "ckpt_name": "best_ckpt_epoch=11_valid_loss=2.6479",
            "end_command": "Abort!",
        }
        for attr_name, attr_value in defaults.items():
            setattr(self, attr_name, attr_value)



#원래 shell로 들어가는 파라미터를 정의합니다.

class Manager():
    """Evaluation driver for the summary-conditioned GPT-2 dialogue model.

    For every dialogue the history (all but the last two utterances) is
    condensed with the module-level ``summarizer`` pipeline, the summary and
    the last user utterance are fed to GPT-2, and a reply is drawn with
    nucleus (top-p) sampling and compared with the final utterance.
    """

    def __init__(self, args, test_arr):
        """Prepare device, tokenizer and model, and load the checkpoint.

        Args:
            args: settings object (see ``Arguments``). This constructor also
                stores ``device``, ``eos_token``, ``vocab_size`` and the
                ``bos_id``/``eos_id``/``sp1_id``/``sp2_id`` attributes on it
                and may lower ``max_len`` to the model context size.
            test_arr: sequence of dialogues, each a list of utterance strings.

        Raises:
            FileNotFoundError: the checkpoint file is missing and
                ``args.mode`` is not ``'train'``.
        """
        self.args = args
        self.test_arr = test_arr

        if torch.cuda.is_available():
            self.args.device = torch.device(f"cuda:{self.args.gpu}")
        else:
            self.args.device = torch.device("cpu")

        # Tokenizer & Vocab
        print("Loading the tokenizer...")
        self.tokenizer = GPT2Tokenizer.from_pretrained(self.args.model_type)
        special_tokens = {
            'bos_token': self.args.bos_token,
            'additional_special_tokens': [self.args.sp1_token, self.args.sp2_token]
        }
        self.args.eos_token = self.tokenizer.eos_token
        self.tokenizer.add_special_tokens(special_tokens)
        vocab = self.tokenizer.get_vocab()
        self.args.vocab_size = len(vocab)
        self.args.bos_id = vocab[self.args.bos_token]
        self.args.eos_id = vocab[self.args.eos_token]
        self.args.sp1_id = vocab[self.args.sp1_token]
        self.args.sp2_id = vocab[self.args.sp2_token]

        # Load model
        print("Loading the model...")
        self.fix_seed(self.args.seed)
        self.model = GPT2LMHeadModel.from_pretrained(self.args.model_type).to(self.args.device)
        # The embedding matrix must cover the special tokens added above.
        self.model.resize_token_embeddings(self.args.vocab_size)

        self.args.max_len = min(self.args.max_len, self.model.config.n_ctx)

        if self.args.ckpt_name is not None:
            ckpt_path = f"{self.args.ckpt_dir}/{self.args.ckpt_name}.ckpt"
            if os.path.exists(ckpt_path):
                print("Loading the trained checkpoint...")
                ckpt = torch.load(ckpt_path, map_location=self.args.device)
                self.model.load_state_dict(ckpt['model_state_dict'])

                if self.args.mode == 'train':
                    # NOTE(review): self.optim / self.sched are never created in
                    # this class, so this branch only works if a caller sets
                    # them beforehand. The notebook always uses mode="test".
                    print(f"The training restarts with the specified checkpoint: {self.args.ckpt_name}.ckpt.")
                    self.optim.load_state_dict(ckpt['optim_state_dict'])
                    self.sched.load_state_dict(ckpt['sched_state_dict'])
                    self.best_loss = ckpt['loss']
                    self.last_epoch = ckpt['epoch']
                else:
                    print("The inference will start with the specified checkpoint.")
            else:
                print(f"Cannot fine the specified checkpoint {ckpt_path}.")
                if self.args.mode == 'train':
                    print("Training will start with the initialized model.")
                else:
                    print("Cannot inference.")
                    # Raise instead of exit(): inside a notebook exit() takes
                    # the whole kernel down rather than just failing this cell.
                    raise FileNotFoundError(ckpt_path)

        print("Setting finished.")

    def nucleus_sampling(self, input_ids, token_type_ids, input_len):
        """Sample a continuation token by token with top-p filtering.

        Args:
            input_ids: LongTensor of shape (1, input_len) with the prompt.
            token_type_ids: LongTensor with the same shape as ``input_ids``.
            input_len: number of prompt tokens.

        Returns:
            List of sampled token ids. Sampling stops after the end-of-sequence
            id (which is included) or when ``args.max_len`` positions are used.
        """
        output_ids = []
        for pos in range(input_len, self.args.max_len):
            # Logits of the last position predict the token at `pos`.
            output = self.model(input_ids=input_ids, token_type_ids=token_type_ids)[0][:, pos-1]  # (1, V)
            output = F.softmax(output, dim=-1)  # (1, V)

            sorted_probs, sorted_idxs = torch.sort(output, descending=True)
            cumsum_probs = torch.cumsum(sorted_probs, dim=-1)  # (1, V)
            idx_remove = cumsum_probs > self.args.top_p
            # Shift right so the token that crosses top_p is kept, and always
            # keep the most likely token.
            idx_remove[:, 1:] = idx_remove[:, :-1].clone()
            idx_remove[:, 0] = False
            sorted_probs[idx_remove] = 0.0
            sorted_probs /= torch.sum(sorted_probs, dim=-1, keepdim=True)  # (1, V)

            # Put the filtered probabilities back into vocabulary order.
            probs = torch.zeros(output.shape, device=self.args.device).scatter_(-1, sorted_idxs, sorted_probs)  # (1, V)
            idx = torch.multinomial(probs, 1)  # (1, 1)

            idx_item = idx.squeeze(-1).squeeze(-1).item()
            output_ids.append(idx_item)

            if idx_item == self.args.eos_id:
                break

            input_ids = torch.cat((input_ids, idx), dim=-1)
            next_type_id = torch.LongTensor([[self.args.sp2_id]]).to(self.args.device)
            token_type_ids = torch.cat((token_type_ids, next_type_id), dim=-1)
            assert input_ids.shape == token_type_ids.shape

        return output_ids

    def fix_seed(self, seed):
        """Seed numpy, torch (CPU and all CUDA devices) and ``random``."""
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        random.seed(seed)

    def test(self):
        """Generate a reply for each dialogue and report string similarity.

        For every dialogue the second-to-last utterance is the user turn and
        the last utterance is the reference reply. Progress is printed; once
        600 examples are done, contexts/answers and the matching references
        are written to ``test/json/`` and the loop stops. Finally the mean
        ``difflib`` similarity ratio over the processed examples is printed.
        """
        test_arr = self.test_arr
        print("Let's start!")
        self.model.eval()
        self.fix_seed(self.args.seed)

        summarized_utter = []
        user = [utters[-2] for utters in test_arr]          # user utterance
        ground_truth = [utters[-1] for utters in test_arr]  # reference reply

        context_for_json = []
        ans_for_json = []
        similarity = 0

        with torch.no_grad():
            for ex_cnt, utters in enumerate(test_arr):
                ## summarized input ##
                if len(utters) >= 3:
                    string_input = ' '.join(utters[:-2])
                    # Length bounds are derived from the character count of the history.
                    sumrz_input = summarizer(
                        string_input,
                        max_length=len(string_input) // 6,
                        min_length=len(string_input) // 12,
                    )[0]["summary_text"]
                else:
                    # No history to summarize.
                    string_input = ""
                    sumrz_input = ""

                context_for_json.append(string_input)
                summarized_utter.append(sumrz_input)

                # Two segments: <sp1> summary, <sp2> user utterance.
                # (The max_turns truncation of the original script is not used.)
                input_hists = [
                    [self.args.sp1_id] + self.tokenizer.encode(sumrz_input),
                    [self.args.sp2_id] + self.tokenizer.encode(utters[-2]),
                ]

                # <bos> segments... <sp1>: the trailing <sp1> opens the reply.
                input_ids = [self.args.bos_id] + list(chain.from_iterable(input_hists)) + [self.args.sp1_id]
                start_sp_id = self.args.sp1_id
                next_sp_id = self.args.sp2_id

                assert start_sp_id != next_sp_id
                # Segments alternate speaker ids, starting with start_sp_id.
                token_type_ids = [[start_sp_id] * len(hist) if h % 2 == 0 else [next_sp_id] * len(hist) for h, hist in enumerate(input_hists)]
                assert len(token_type_ids) == len(input_hists)
                token_type_ids = [start_sp_id] + list(chain.from_iterable(token_type_ids)) + [self.args.sp1_id]
                assert len(input_ids) == len(token_type_ids)
                input_len = len(input_ids)

                input_ids = torch.LongTensor(input_ids).unsqueeze(0).to(self.args.device)
                token_type_ids = torch.LongTensor(token_type_ids).unsqueeze(0).to(self.args.device)

                output_ids = self.nucleus_sampling(input_ids, token_type_ids, input_len)
                res = self.tokenizer.decode(output_ids, skip_special_tokens=True)

                # Inline difflib ratio: no helper for this is defined before
                # this cell, so calling one fails on a fresh kernel.
                similarity += difflib.SequenceMatcher(None, res, ground_truth[ex_cnt]).ratio()

                print(f"summarized : {summarized_utter[ex_cnt]}")
                print(f"user : {user[ex_cnt]}")
                print(f"res : {res}\n gt : {ground_truth[ex_cnt]}")

                context_for_json[-1] += user[ex_cnt]
                ans_for_json.append(res)

                assert len(context_for_json) == len(ans_for_json)

                if len(context_for_json) % 10==0:
                    print("#"*50)
                    print(len(context_for_json))
                    print("#"*50)

                if len(context_for_json) == 600:
                    # Build the export dictionaries; keep only the references
                    # that belong to the exported answers.
                    data = {"context": context_for_json, "ans": ans_for_json}
                    gt = {"ans": ground_truth[:len(ans_for_json)]}
                    # Save as JSON files (create the folder if needed).
                    os.makedirs("test/json", exist_ok=True)
                    with open("test/json/bart_11e.json", "w") as f:
                        json.dump(data, f)
                    with open("test/json/gt.json","w") as f:
                        json.dump(gt, f)

                    break

        # Average over the examples actually processed; the loop may stop
        # before the end of test_arr.
        num_processed = len(ans_for_json)
        mean_similarity = similarity / num_processed if num_processed else 0.0
        print(f"문자열 간에 유사도 : {mean_similarity}")




      

In [None]:
# Build the settings, point ckpt_dir at the per-model sub-folder
# (saved_models/<model_type>), then run the evaluation.
args = Arguments()
args.ckpt_dir = f"{args.ckpt_dir}/{args.model_type}"
assert args.ckpt_name is not None, "Please specify the trained model checkpoint."
# Manager.__init__ loads the tokenizer, the model and the checkpoint.
manager = Manager(args, test_arr)
manager.test()

Loading the tokenizer...


Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Loading the model...


Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Loading the trained checkpoint...
The inference will start with the specified checkpoint.
Setting finished.
Let's start!
summarized : Man wants to buy weed. He also has blow and he has dope and acid. He will give him one ounce of weed for free. He has connections to get all these drugs. 
user :  Yeah ? 
res : Yeah, I know. The rest is out of my control. I could blow $200 and get free pills.
 gt :  I want you to put your hands behind your head ! You are under arrest ! 
summarized : Taxi drivers
user :  They want the government to reduce the price of the gasoline . 
res : That's right.
 gt :  It is really a hot potato . 
summarized : The factory has reduced its energy consumption by 15% in the last two years thanks to a heat recovery system. The system uses the exhaust gases from the printing presses to heat the dryers.
user :  What other sources of energy do you use ? 
res : Most of the imports are of paper products.
 gt :  We don't use any fossil fuels . Most of our power comes from hy

## dataset devset test - text rank - 이거말고 textrank-test.ipynb 이용할것

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Fetch the NLTK data packages used by textrank() below -- presumably 'punkt'
# for sent_tokenize/word_tokenize, 'stopwords' for stopwords.words('english')
# and 'wordnet' for WordNetLemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
def textrank(conversation, num_sentences=None):
    """Extractive summary of ``conversation`` with TextRank sentence ranking.

    Sentences are represented as bag-of-words count vectors (lower-cased,
    alphanumeric, stop words removed, lemmatized), compared pairwise with
    cosine similarity and scored with a damped power iteration over the
    column-normalized similarity graph. The best-scoring sentences are
    returned in their original order.

    Args:
        conversation: text to summarize.
        num_sentences: number of sentences to keep. Defaults to half of the
            sentences (at least one). The previous code derived this number
            from the character count, which always selected every sentence.

    Returns:
        The selected sentences joined with single spaces; "" for empty input.
    """
    # Sentence tokenization
    sentences = sent_tokenize(conversation)
    if not sentences:
        return ""

    if num_sentences is None:
        num_sentences = len(sentences) // 2
    num_sentences = max(1, min(num_sentences, len(sentences)))

    # Stop-word removal and lemmatization
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    def preprocess(sentence):
        tokens = word_tokenize(sentence.lower())
        return [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]

    sentences_tokenized = [preprocess(sentence) for sentence in sentences]

    word_to_idx = {}
    for tokens in sentences_tokenized:
        for word in tokens:
            word_to_idx.setdefault(word, len(word_to_idx))

    if not word_to_idx:
        # No content words at all: nothing to rank, keep the leading sentences.
        return ' '.join(sentences[:num_sentences])

    sentence_vectors = np.zeros((len(sentences), len(word_to_idx)))
    for row, tokens in enumerate(sentences_tokenized):
        for word in tokens:
            sentence_vectors[row, word_to_idx[word]] += 1

    similarity_matrix = cosine_similarity(sentence_vectors)
    # Drop self-similarity and normalize each column so the iteration below
    # stays bounded instead of growing with every step.
    np.fill_diagonal(similarity_matrix, 0.0)
    col_sums = similarity_matrix.sum(axis=0, keepdims=True)
    transition = np.divide(
        similarity_matrix, col_sums,
        out=np.zeros_like(similarity_matrix), where=col_sums > 0,
    )

    damping, max_iter = 0.85, 100
    scores = np.ones(len(sentences))
    for _ in range(max_iter):
        scores = (1 - damping) + damping * transition.dot(scores)

    # Pick the top-scoring sentences, then restore conversation order.
    best = sorted(range(len(sentences)), key=lambda i: scores[i], reverse=True)[:num_sentences]
    return ' '.join(sentences[i] for i in sorted(best))

In [None]:
from datasets import *
from tqdm import tqdm


# Shared by all datasets.
# `space` is the one-character prefix that process_token_list strips from /
# re-adds to tokens (presumably the GPT-2 byte-level BPE leading-space marker
# -- confirm against the tokenizer).
space = 'Ġ'
# Typographic apostrophe (U+2019); not referenced by the functions in this cell.
pre_quote = '’'
# Tokens treated as sentence/clause terminators by process_token_list.
end_marks = ['.', ',', '?', '!', '...']
# Straight double and single quote; quotes[1] is the apostrophe.
quotes = ['"', '\'']
# Contraction suffixes that may follow an apostrophe (e.g. 's, 're, 'll).
abbreviations = ['s', 'd', 't', 'm', 're', 'll', 've', 'S', 'D', 'T', 'M', 'Re', 'Ll', 'Ve']

# For empathetic dialogues (not referenced by the code in this cell).
exclude_symbol = "_conv"
comma_symbol = "_comma_"

# For persona chat (not referenced by the code in this cell).
persona_chat_url = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
silence_symbol = "__ SILENCE __"


def load_daily():
    """Return the 'dialog' column of the DailyDialog test split.

    Each element is one dialogue (a sequence of utterance strings), exactly as
    stored by the `daily_dialog` dataset; no preprocessing is applied.
    """
    return load_dataset('daily_dialog')['test']['dialog']
    
    

def process_token_list(token_list):
    """Normalize spacing and capitalization of a tokenized utterance.

    The input list is modified in place and a filtered copy is returned.
    A token that starts with the module-level `space` marker is considered to
    be preceded by a blank. The function:
      * capitalizes the first token,
      * drops the marker in front of end marks and contraction suffixes,
      * attaches an apostrophe to a following contraction suffix,
      * removes the blank inside an opening quote and before a closing quote,
      * capitalizes the token that follows an end mark,
      * removes empty / marker-only tokens and appends '.' when the result
        does not already end with an end mark.

    Args:
        token_list: non-empty list of token strings.

    Returns:
        A new list with the cleaned tokens.
    """
    token_list[0] = token_list[0].capitalize()
    last_idx = len(token_list) - 1

    open_quotes = 0
    for pos, tok in enumerate(token_list):
        # `tok` is the value seen at the start of this iteration; writes to
        # token_list[pos] below do not change it.
        tail = tok[1:]
        has_next = pos < last_idx

        if space in tok:
            if tail in end_marks or tail in abbreviations:
                token_list[pos] = tail

            if tail == quotes[1] and has_next:
                follower = token_list[pos + 1]
                if follower in abbreviations or (follower[0] == space and follower[1:] in abbreviations):
                    token_list[pos] = tail

        if tok[0] == space and tail in quotes:
            if open_quotes % 2 == 1:
                # Closing quote: glue it to the preceding token.
                token_list[pos] = tail
                open_quotes = 0
            else:
                # Opening quote: glue the next token to the quote.
                if has_next and token_list[pos + 1][0] == space:
                    token_list[pos + 1] = token_list[pos + 1][1:]
                open_quotes += 1

        if (tok in end_marks or tail in end_marks) and has_next:
            # Re-read the follower: the quote handling above may have changed it.
            follower = token_list[pos + 1]
            if follower[0] != space:
                token_list[pos + 1] = space + follower.capitalize()
            else:
                token_list[pos + 1] = space + follower[1:].capitalize()

    kept = [tok for tok in token_list if tok != space and len(tok) > 0]
    if kept[-1] not in end_marks:
        kept.append(end_marks[0])

    return kept


In [None]:
# Load the DailyDialog test dialogues once; Manager consumes `test_arr` below.
test_arr = load_daily()
# Preview two dialogues instead of calling load_daily() a second time, which
# reloaded the dataset and echoed every dialogue into the cell output.
test_arr[:2]

In [None]:
# utters= ["hello" , "my" , "name" , "is" , "susan" , "thank" , "you" , "for" , "listening"] 

# print(' '.join(utters[:-2]))

In [None]:
import difflib
from transformers import GPT2Tokenizer, GPT2LMHeadModel, get_polynomial_decay_schedule_with_warmup

from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.nn import functional as F
from torch.utils.tensorboard import SummaryWriter
from itertools import chain

import torch
import os, sys
import numpy as np
import argparse
import copy
import math
import random
import json

class Arguments:
    """Plain settings holder replacing the command-line arguments of the script.

    Every setting is an instance attribute so that Manager can read it and
    attach derived values (device, token ids, ...) to the same object.
    """

    def __init__(self):
        defaults = {
            "seed": 0,                    # seed passed to Manager.fix_seed
            "mode": "test",
            "data_dir": "data",
            "model_type": "gpt2",         # name given to from_pretrained
            "bos_token": "<bos>",
            "sp1_token": "<sp1>",
            "sp2_token": "<sp2>",
            "gpu": "0",                   # CUDA device ordinal
            "max_len": 1024,
            "max_turns": 5,
            "top_p": 0.8,                 # nucleus sampling threshold
            "ckpt_dir": "saved_models",
            "ckpt_name": "best_ckpt_epoch=3_valid_loss=2.6631",
            "end_command": "Abort!",
        }
        for attr_name, attr_value in defaults.items():
            setattr(self, attr_name, attr_value)



#원래 shell로 들어가는 파라미터를 정의합니다.

class Manager():
    """Evaluation driver for the summary-conditioned GPT-2 dialogue model.

    For every dialogue the history (all but the last two utterances) is
    condensed with ``textrank``, the summary and the last user utterance are
    fed to GPT-2, and a reply is drawn with nucleus (top-p) sampling and
    compared with the final utterance.
    """

    def __init__(self, args, test_arr):
        """Prepare device, tokenizer and model, and load the checkpoint.

        Args:
            args: settings object (see ``Arguments``). This constructor also
                stores ``device``, ``eos_token``, ``vocab_size`` and the
                ``bos_id``/``eos_id``/``sp1_id``/``sp2_id`` attributes on it
                and may lower ``max_len`` to the model context size.
            test_arr: sequence of dialogues, each a list of utterance strings.

        Raises:
            FileNotFoundError: the checkpoint file is missing and
                ``args.mode`` is not ``'train'``.
        """
        self.args = args
        self.test_arr = test_arr

        if torch.cuda.is_available():
            self.args.device = torch.device(f"cuda:{self.args.gpu}")
        else:
            self.args.device = torch.device("cpu")

        # Tokenizer & Vocab
        print("Loading the tokenizer...")
        self.tokenizer = GPT2Tokenizer.from_pretrained(self.args.model_type)
        special_tokens = {
            'bos_token': self.args.bos_token,
            'additional_special_tokens': [self.args.sp1_token, self.args.sp2_token]
        }
        self.args.eos_token = self.tokenizer.eos_token
        self.tokenizer.add_special_tokens(special_tokens)
        vocab = self.tokenizer.get_vocab()
        self.args.vocab_size = len(vocab)
        self.args.bos_id = vocab[self.args.bos_token]
        self.args.eos_id = vocab[self.args.eos_token]
        self.args.sp1_id = vocab[self.args.sp1_token]
        self.args.sp2_id = vocab[self.args.sp2_token]

        # Load model
        print("Loading the model...")
        self.fix_seed(self.args.seed)
        self.model = GPT2LMHeadModel.from_pretrained(self.args.model_type).to(self.args.device)
        # The embedding matrix must cover the special tokens added above.
        self.model.resize_token_embeddings(self.args.vocab_size)

        self.args.max_len = min(self.args.max_len, self.model.config.n_ctx)

        if self.args.ckpt_name is not None:
            ckpt_path = f"{self.args.ckpt_dir}/{self.args.ckpt_name}.ckpt"
            if os.path.exists(ckpt_path):
                print("Loading the trained checkpoint...")
                ckpt = torch.load(ckpt_path, map_location=self.args.device)
                self.model.load_state_dict(ckpt['model_state_dict'])

                if self.args.mode == 'train':
                    # NOTE(review): self.optim / self.sched are never created in
                    # this class, so this branch only works if a caller sets
                    # them beforehand. The notebook always uses mode="test".
                    print(f"The training restarts with the specified checkpoint: {self.args.ckpt_name}.ckpt.")
                    self.optim.load_state_dict(ckpt['optim_state_dict'])
                    self.sched.load_state_dict(ckpt['sched_state_dict'])
                    self.best_loss = ckpt['loss']
                    self.last_epoch = ckpt['epoch']
                else:
                    print("The inference will start with the specified checkpoint.")
            else:
                print(f"Cannot fine the specified checkpoint {ckpt_path}.")
                if self.args.mode == 'train':
                    print("Training will start with the initialized model.")
                else:
                    print("Cannot inference.")
                    # Raise instead of exit(): inside a notebook exit() takes
                    # the whole kernel down rather than just failing this cell.
                    raise FileNotFoundError(ckpt_path)

        print("Setting finished.")

    def nucleus_sampling(self, input_ids, token_type_ids, input_len):
        """Sample a continuation token by token with top-p filtering.

        Args:
            input_ids: LongTensor of shape (1, input_len) with the prompt.
            token_type_ids: LongTensor with the same shape as ``input_ids``.
            input_len: number of prompt tokens.

        Returns:
            List of sampled token ids. Sampling stops after the end-of-sequence
            id (which is included) or when ``args.max_len`` positions are used.
        """
        output_ids = []
        for pos in range(input_len, self.args.max_len):
            # Logits of the last position predict the token at `pos`.
            output = self.model(input_ids=input_ids, token_type_ids=token_type_ids)[0][:, pos-1]  # (1, V)
            output = F.softmax(output, dim=-1)  # (1, V)

            sorted_probs, sorted_idxs = torch.sort(output, descending=True)
            cumsum_probs = torch.cumsum(sorted_probs, dim=-1)  # (1, V)
            idx_remove = cumsum_probs > self.args.top_p
            # Shift right so the token that crosses top_p is kept, and always
            # keep the most likely token.
            idx_remove[:, 1:] = idx_remove[:, :-1].clone()
            idx_remove[:, 0] = False
            sorted_probs[idx_remove] = 0.0
            sorted_probs /= torch.sum(sorted_probs, dim=-1, keepdim=True)  # (1, V)

            # Put the filtered probabilities back into vocabulary order.
            probs = torch.zeros(output.shape, device=self.args.device).scatter_(-1, sorted_idxs, sorted_probs)  # (1, V)
            idx = torch.multinomial(probs, 1)  # (1, 1)

            idx_item = idx.squeeze(-1).squeeze(-1).item()
            output_ids.append(idx_item)

            if idx_item == self.args.eos_id:
                break

            input_ids = torch.cat((input_ids, idx), dim=-1)
            next_type_id = torch.LongTensor([[self.args.sp2_id]]).to(self.args.device)
            token_type_ids = torch.cat((token_type_ids, next_type_id), dim=-1)
            assert input_ids.shape == token_type_ids.shape

        return output_ids

    def fix_seed(self, seed):
        """Seed numpy, torch (CPU and all CUDA devices) and ``random``."""
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        random.seed(seed)

    def test(self):
        """Generate a reply for each dialogue and report string similarity.

        For every dialogue the second-to-last utterance is the user turn and
        the last utterance is the reference reply. Once 600 examples are done,
        contexts and answers are written to ``test/json/textrank_3e.json`` and
        the loop stops. Finally the mean ``compute_similarity`` score over the
        processed examples is printed.
        """
        test_arr = self.test_arr
        print("Let's start!")
        self.model.eval()
        self.fix_seed(self.args.seed)

        summarized_utter = []
        user = [utters[-2] for utters in test_arr]          # user utterance
        ground_truth = [utters[-1] for utters in test_arr]  # reference reply

        context_for_json = []
        ans_for_json = []
        similarity = 0

        with torch.no_grad():
            for ex_cnt, utters in enumerate(test_arr):
                ## summarized input ##
                if len(utters) >= 3:
                    string_input = ' '.join(utters[:-2])
                    sumrz_input = textrank(string_input)
                else:
                    # No history to summarize.
                    string_input = ""
                    sumrz_input = ""
                summarized_utter.append(sumrz_input)

                # Raw history, exported for the ChatGPT-based classification.
                context_for_json.append(string_input)

                # Two segments: <sp1> summary, <sp2> user utterance.
                # (The max_turns truncation of the original script is not used.)
                input_hists = [
                    [self.args.sp1_id] + self.tokenizer.encode(sumrz_input),
                    [self.args.sp2_id] + self.tokenizer.encode(utters[-2]),
                ]

                # <bos> segments... <sp1>: the trailing <sp1> opens the reply.
                input_ids = [self.args.bos_id] + list(chain.from_iterable(input_hists)) + [self.args.sp1_id]
                start_sp_id = self.args.sp1_id
                next_sp_id = self.args.sp2_id

                assert start_sp_id != next_sp_id
                # Segments alternate speaker ids, starting with start_sp_id.
                token_type_ids = [[start_sp_id] * len(hist) if h % 2 == 0 else [next_sp_id] * len(hist) for h, hist in enumerate(input_hists)]
                assert len(token_type_ids) == len(input_hists)
                token_type_ids = [start_sp_id] + list(chain.from_iterable(token_type_ids)) + [self.args.sp1_id]
                assert len(input_ids) == len(token_type_ids)
                input_len = len(input_ids)

                input_ids = torch.LongTensor(input_ids).unsqueeze(0).to(self.args.device)
                token_type_ids = torch.LongTensor(token_type_ids).unsqueeze(0).to(self.args.device)

                output_ids = self.nucleus_sampling(input_ids, token_type_ids, input_len)
                res = self.tokenizer.decode(output_ids, skip_special_tokens=True)

                similarity += compute_similarity(res, ground_truth[ex_cnt])

                # Printed side by side so prediction and reference can be
                # compared by eye.
                print(f"summarized : {summarized_utter[ex_cnt]}")
                print(f"user : {user[ex_cnt]}")
                print(f"res : {res}\n gt : {ground_truth[ex_cnt]}")

                context_for_json[-1] += user[ex_cnt]
                ans_for_json.append(res)

                assert len(context_for_json) == len(ans_for_json)
                if len(context_for_json) == 600:
                    # Build the export dictionary.
                    data = {"context": context_for_json, "ans": ans_for_json}

                    # Save as a JSON file (create the folder if needed).
                    os.makedirs("test/json", exist_ok=True)
                    with open("test/json/textrank_3e.json", "w") as f:
                        json.dump(data, f)
                    break

        # Average over the examples actually processed; the loop may stop
        # before the end of test_arr.
        num_processed = len(ans_for_json)
        mean_similarity = similarity / num_processed if num_processed else 0.0
        print(f"문자열 간에 유사도 : {mean_similarity}")


def compute_similarity(string1, string2):
    """Return difflib's similarity ratio of the two strings.

    The value is a float between 0 and 1; identical strings give 1.0.
    """
    return difflib.SequenceMatcher(a=string1, b=string2).ratio()

# Compares two strings character-wise; the resulting score does not seem to be
# a very meaningful measure of response quality.

      

In [None]:
# Build the settings, point ckpt_dir at the per-model sub-folder
# (saved_models/<model_type>), then run the evaluation.
args = Arguments()
args.ckpt_dir = f"{args.ckpt_dir}/{args.model_type}"
assert args.ckpt_name is not None, "Please specify the trained model checkpoint."
# Manager.__init__ loads the tokenizer, the model and the checkpoint.
manager = Manager(args, test_arr)
manager.test()

Loading the tokenizer...
Loading the model...
Loading the trained checkpoint...
The inference will start with the specified checkpoint.
Setting finished.
Let's start!
summarized : Some what ? Sounds good ! Pot , Ganja , Mary Jane some chronic ! You know ? I also have blow if you prefer to do a few lines . Try some ! Where do you get them from ? Oh , umm , no thanks . Let ’ s see , I want . Just tell me what you want and I ’ ll even give you one ounce for free . I got my connections ! I even got dope and acid ! Come on man ! Weed ! No , I am ok , really . Do you really have all of these drugs ? Hey man , you wanna buy some weed ?
user :  Yeah ? 
res : Of course I have. The rest is out of my head. I only ask that you bring me one ounce of heroin.
 gt :  I want you to put your hands behind your head ! You are under arrest ! 
summarized : What for ? The taxi drivers are on strike again .
user :  They want the government to reduce the price of the gasoline . 
res : You know, These days, Eve

KeyboardInterrupt: ignored

## valid_utters로 테스트 요약모델은 bart-large...

In [None]:
from datasets import *
from tqdm import tqdm


# For all
# `space` is the marker character process_token_list() checks for at the start
# of a token (presumably GPT-2's byte-level BPE leading-space symbol — confirm).
space = 'Ġ'
# Typographic apostrophe that load_daily() replaces with quotes[1] (ASCII ').
pre_quote = '’'
# Tokens treated as punctuation marks; end_marks[0] is appended by
# process_token_list() when an utterance does not already end with one of these.
end_marks = ['.', ',', '?', '!', '...']
quotes = ['"', '\'']
# Contraction suffixes (as in 's, 'll, 've) that process_token_list() glues to
# the preceding apostrophe.
abbreviations = ['s', 'd', 't', 'm', 're', 'll', 've', 'S', 'D', 'T', 'M', 'Re', 'Ll', 'Ve']

# For empathetic dialogues
# NOTE(review): not referenced by any code visible in this part of the notebook.
exclude_symbol = "_conv"
comma_symbol = "_comma_"

# For persona chat
# NOTE(review): not referenced by any code visible in this part of the notebook.
persona_chat_url = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
silence_symbol = "__ SILENCE __"


def load_daily(tokenizer, train_frac):
    """Load DailyDialog, normalize every utterance and re-split it.

    The train, validation and test splits are concatenated (in that order),
    each utterance is tokenized, passed through ``process_token_list`` and
    turned back into a string, and the combined list is then cut at
    ``train_frac``.

    Args:
        tokenizer: Object providing ``tokenize`` and
            ``convert_tokens_to_string``.
        train_frac: Fraction of the combined dialogues used for training.

    Returns:
        ``(train_dialogues, valid_dialogues, train_utter_num, valid_utter_num)``
        where the last two are the total utterance counts of each part.
    """
    dataset = load_dataset('daily_dialog')
    total_dialogues = (
        dataset['train']['dialog']
        + dataset['validation']['dialog']
        + dataset['test']['dialog']
    )

    def normalize(utter):
        # Swap the typographic apostrophe for the ASCII one before tokenizing.
        cleaned = utter.strip().replace(pre_quote, quotes[1])
        tokens = process_token_list(tokenizer.tokenize(cleaned))
        return tokenizer.convert_tokens_to_string(tokens)

    total_dialogues = [
        [normalize(utter) for utter in dialogue]
        for dialogue in tqdm(total_dialogues)
    ]

    split_at = int(len(total_dialogues) * train_frac)
    train_dialogues = total_dialogues[:split_at]
    valid_dialogues = total_dialogues[split_at:]

    train_utter_num = sum(len(dialogue) for dialogue in train_dialogues)
    valid_utter_num = sum(len(dialogue) for dialogue in valid_dialogues)

    return train_dialogues, valid_dialogues, train_utter_num, valid_utter_num
    
    

def process_token_list(token_list):
    """Normalize spacing and capitalization of one tokenized utterance.

    Working on tokens that may carry the ``space`` marker as a prefix, this:
      * capitalizes the first token and every token that follows an end mark;
      * removes the marker in front of end marks and contraction suffixes so
        they attach to the previous token;
      * attaches an apostrophe to the previous token when a contraction suffix
        follows it;
      * for the other quotes, removes the marker after an opening quote and
        before a closing quote;
      * drops bare-marker / empty tokens and appends ``end_marks[0]`` when the
        result does not already end with an end mark.

    Empty tokens are tolerated: the function itself can create one when it
    strips a lone ``space`` token after an opening quote, and the original
    single-character indexing then raised ``IndexError``.

    Args:
        token_list: List of token strings. Modified in place while processing.

    Returns:
        A new list with the normalized tokens. Returns ``[]`` when the input is
        empty or nothing survives filtering.
    """
    # Nothing to normalize (e.g. an empty utterance tokenizes to []).
    if not token_list:
        return []

    token_list[0] = token_list[0].capitalize()

    quote_count = 0
    last_idx = len(token_list) - 1
    for i, token in enumerate(token_list):
        # `token` is the value seen at iteration start; later checks deliberately
        # keep using it even after token_list[i] has been rewritten.
        stripped = token[1:]
        has_next = i < last_idx

        if space in token:
            if stripped in end_marks or stripped in abbreviations:
                token_list[i] = stripped

            if stripped == quotes[1] and has_next:
                nxt = token_list[i+1]
                # Slicing ([:1]) instead of indexing ([0]) is safe on empty tokens.
                if nxt in abbreviations or (nxt[:1] == space and nxt[1:] in abbreviations):
                    token_list[i] = stripped

        if token[:1] == space and stripped in quotes:
            if quote_count % 2 == 1:
                # Closing quote: attach it to the preceding token.
                token_list[i] = stripped
                quote_count = 0
            else:
                # Opening quote: attach the following token to it.
                if has_next and token_list[i+1][:1] == space:
                    token_list[i+1] = token_list[i+1][1:]
                quote_count += 1

        if token in end_marks or stripped in end_marks:
            if has_next:
                nxt = token_list[i+1]
                if nxt[:1] != space:
                    token_list[i+1] = space + nxt.capitalize()
                else:
                    token_list[i+1] = space + nxt[1:].capitalize()

    new_token_list = [token for token in token_list if token != space and len(token) > 0]
    # Guard the [-1] lookup: everything may have been filtered out.
    if new_token_list and new_token_list[-1] not in end_marks:
        new_token_list.append(end_marks[0])

    return new_token_list


In [None]:
import difflib
from transformers import GPT2Tokenizer, GPT2LMHeadModel, get_polynomial_decay_schedule_with_warmup

from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.nn import functional as F
from torch.utils.tensorboard import SummaryWriter
from itertools import chain

import torch
import os, sys
import numpy as np
import argparse
import copy
import math
import random

class Arguments:
    """Hard-coded settings object used in place of command-line arguments.

    ``Manager`` reads these attributes and also writes derived values
    (device, token ids, vocabulary size) back onto the same instance.
    """

    def __init__(self):
        # Run control
        self.seed = 0
        self.mode = "test"

        # Data / model selection
        self.data_dir = "data"
        self.model_type = "gpt2"

        # Special tokens added to the tokenizer
        self.bos_token = "<bos>"
        self.sp1_token = "<sp1>"
        self.sp2_token = "<sp2>"

        # Device index used to build "cuda:<gpu>" when CUDA is available
        self.gpu = "0"

        # Generation limits and nucleus-sampling threshold
        self.max_len = 1024
        self.max_turns = 5
        self.top_p = 0.8

        # Checkpoint location (file is "<ckpt_dir>/<ckpt_name>.ckpt")
        self.ckpt_dir = "saved_models"
        self.ckpt_name = "best_ckpt_epoch=3_valid_loss=2.6631"

        # Utterance that stops the evaluation loop
        self.end_command = "Abort!"

# Defines the parameters that were originally passed in from the shell (command line).

class Manager():
    """Inference driver for the GPT-2 dialogue model.

    Sets up the tokenizer (with the extra bos/sp1/sp2 tokens), the model and an
    optional checkpoint, then ``test()`` generates one reply per record of
    ``<data_dir>/valid_utters.json`` and reports the mean string similarity to
    the ground-truth replies.
    """

    def __init__(self, args):
        """Prepare device, tokenizer, model and (optionally) a checkpoint.

        Args:
            args: Mutable settings object (see ``Arguments``). This constructor
                also writes ``device``, ``eos_token``, ``vocab_size``,
                ``bos_id``, ``eos_id``, ``sp1_id``, ``sp2_id`` and a clamped
                ``max_len`` back onto it.

        Raises:
            FileNotFoundError: A checkpoint name is configured, the file does
                not exist and ``args.mode`` is not ``'train'``.
        """
        self.args = args

        if torch.cuda.is_available():
            self.args.device = torch.device(f"cuda:{self.args.gpu}")
        else:
            self.args.device = torch.device("cpu")

        # Tokenizer & Vocab
        print("Loading the tokenizer...")
        self.tokenizer = GPT2Tokenizer.from_pretrained(self.args.model_type)
        special_tokens = {
            'bos_token': self.args.bos_token,
            'additional_special_tokens': [self.args.sp1_token, self.args.sp2_token]
        }
        self.args.eos_token = self.tokenizer.eos_token
        self.tokenizer.add_special_tokens(special_tokens)
        vocab = self.tokenizer.get_vocab()
        self.args.vocab_size = len(vocab)
        self.args.bos_id = vocab[self.args.bos_token]
        self.args.eos_id = vocab[self.args.eos_token]
        self.args.sp1_id = vocab[self.args.sp1_token]
        self.args.sp2_id = vocab[self.args.sp2_token]

        # Load model
        print("Loading the model...")
        self.fix_seed(self.args.seed)
        self.model = GPT2LMHeadModel.from_pretrained(self.args.model_type).to(self.args.device)
        # The embedding matrix must grow to cover the special tokens added above.
        self.model.resize_token_embeddings(self.args.vocab_size)

        # Never ask for more positions than the model's context size.
        self.args.max_len = min(self.args.max_len, self.model.config.n_ctx)

        if self.args.ckpt_name is not None:
            ckpt_path = f"{self.args.ckpt_dir}/{self.args.ckpt_name}.ckpt"
            if os.path.exists(ckpt_path):
                print("Loading the trained checkpoint...")
                ckpt = torch.load(ckpt_path, map_location=self.args.device)
                self.model.load_state_dict(ckpt['model_state_dict'])

                if self.args.mode == 'train':
                    # NOTE(review): self.optim / self.sched are not created anywhere
                    # in this class, so this branch would raise AttributeError; the
                    # notebook only runs mode="test".
                    print(f"The training restarts with the specified checkpoint: {self.args.ckpt_name}.ckpt.")
                    self.optim.load_state_dict(ckpt['optim_state_dict'])
                    self.sched.load_state_dict(ckpt['sched_state_dict'])
                    self.best_loss = ckpt['loss']
                    self.last_epoch = ckpt['epoch']
                else:
                    print("The inference will start with the specified checkpoint.")
            else:
                print(f"Cannot fine the specified checkpoint {ckpt_path}.")
                if self.args.mode == 'train':
                    print("Training will start with the initialized model.")
                else:
                    print("Cannot inference.")
                    # Raise instead of exit(): inside a notebook kernel exit() may
                    # only request shutdown, letting the cell carry on with a model
                    # that has no fine-tuned weights.
                    raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")

        print("Setting finished.")

    def nucleus_sampling(self, input_ids, token_type_ids, input_len):
        """Generate token ids with top-p (nucleus) sampling.

        One token is sampled per step, from position ``input_len`` up to
        ``args.max_len``; the full sequence is re-fed to the model each step.

        Args:
            input_ids: LongTensor prompt; the code assumes a batch of one
                (``.item()`` is called on the sampled index).
            token_type_ids: LongTensor with the same shape as ``input_ids``.
            input_len: Number of prompt tokens.

        Returns:
            List of sampled token ids. The end-of-sequence id is included when
            it is sampled; the list is empty if ``input_len >= args.max_len``.
        """
        output_ids = []
        for pos in range(input_len, self.args.max_len):
            output = self.model(input_ids=input_ids, token_type_ids=token_type_ids)[0][:, pos-1]  # (1, V)
            output = F.softmax(output, dim=-1)  # (1, V)

            sorted_probs, sorted_idxs = torch.sort(output, descending=True)
            cumsum_probs = torch.cumsum(sorted_probs, dim=-1)  # (1, V)
            idx_remove = cumsum_probs > self.args.top_p
            # Shift the mask right by one so the token that crosses top_p is kept,
            # and always keep the most probable token.
            idx_remove[:, 1:] = idx_remove[:, :-1].clone()
            idx_remove[:, 0] = False
            sorted_probs[idx_remove] = 0.0
            sorted_probs /= torch.sum(sorted_probs, dim=-1, keepdim=True)  # (1, V)

            # Undo the sort so probabilities line up with vocabulary ids again.
            probs = torch.zeros(output.shape, device=self.args.device).scatter_(-1, sorted_idxs, sorted_probs)  # (1, V)
            idx = torch.multinomial(probs, 1)  # (1, 1)

            idx_item = idx.squeeze(-1).squeeze(-1).item()
            output_ids.append(idx_item)

            if idx_item == self.args.eos_id:
                break

            input_ids = torch.cat((input_ids, idx), dim=-1)
            # NOTE(review): generated tokens get token type sp2 while test() ends
            # the prompt with sp1 — confirm this matches how the model was trained.
            next_type_id = torch.LongTensor([[self.args.sp2_id]]).to(self.args.device)
            token_type_ids = torch.cat((token_type_ids, next_type_id), dim=-1)
            assert input_ids.shape == token_type_ids.shape

        return output_ids

    def fix_seed(self, seed):
        """Seed NumPy, PyTorch (CPU and all CUDA devices) and ``random``."""
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        random.seed(seed)

    def _build_inputs(self, sumrz_input, utter):
        """Return ``(input_ids, token_type_ids)`` lists for one example.

        Layout: bos, sp1 + summary tokens, sp2 + user tokens, sp1. The token
        types are sp1 for bos and the summary segment, sp2 for the user segment
        and sp1 for the trailing speaker token. Turn history is not used.
        """
        sumrz_ids = [self.args.sp1_id] + self.tokenizer.encode(sumrz_input)
        user_ids = [self.args.sp2_id] + self.tokenizer.encode(utter)

        assert self.args.sp1_id != self.args.sp2_id
        input_ids = [self.args.bos_id] + sumrz_ids + user_ids + [self.args.sp1_id]
        token_type_ids = (
            [self.args.sp1_id] * (1 + len(sumrz_ids))
            + [self.args.sp2_id] * len(user_ids)
            + [self.args.sp1_id]
        )
        assert len(input_ids) == len(token_type_ids)
        return input_ids, token_type_ids

    def test(self):
        """Generate a reply for each validation record and print the mean similarity.

        Reads ``<args.data_dir>/valid_utters.json``; every record is indexed as
        ``[summarized context, user utterance, ground-truth reply]``. For each
        record the summary, utterance, generated reply and ground truth are
        printed. Iteration stops early if a user utterance equals
        ``args.end_command``. The final figure is averaged over the records
        that were actually evaluated (0.0 when there were none).

        Raises:
            FileNotFoundError: The validation file does not exist.
        """
        print("Let's start!")
        self.model.eval()
        self.fix_seed(self.args.seed)

        import json
        # Read the settings from self.args, not the notebook-global `args`, so the
        # manager works with whatever settings object it was constructed with.
        # No directory is created here: an empty data dir would only hide the fact
        # that the required input file is missing.
        with open(f"{self.args.data_dir}/valid_utters.json") as f:
            json_obj = json.load(f)

        similarity = 0
        evaluated = 0
        with torch.no_grad():
            for utters in json_obj:
                sumrz_input, utter, truth = utters[0], utters[1], utters[2]

                if utter == self.args.end_command:
                    print("Bot: Good bye.")
                    break

                input_ids, token_type_ids = self._build_inputs(sumrz_input, utter)
                input_len = len(input_ids)

                input_ids = torch.LongTensor(input_ids).unsqueeze(0).to(self.args.device)
                token_type_ids = torch.LongTensor(token_type_ids).unsqueeze(0).to(self.args.device)

                output_ids = self.nucleus_sampling(input_ids, token_type_ids, input_len)
                res = self.tokenizer.decode(output_ids, skip_special_tokens=True)

                similarity += compute_similarity(res, truth)
                evaluated += 1

                # Printed side by side so prediction and ground truth can be
                # compared by eye.
                print(f"summarized : {sumrz_input}")
                print(f"user : {utter}")
                print(f"res : {res}\n gt : {truth}")

        # Average over evaluated records only; avoids ZeroDivisionError on an empty
        # file and a diluted mean after an early end_command break.
        mean_similarity = similarity / evaluated if evaluated else 0.0
        print(f"문자열 간에 유사도 : {mean_similarity}")


def compute_similarity(string1, string2):
    """Return difflib's similarity ratio between two strings.

    Args:
        string1: First sequence handed to ``difflib.SequenceMatcher``.
        string2: Second sequence handed to ``difflib.SequenceMatcher``.

    Returns:
        The value of ``SequenceMatcher.ratio()`` for the pair (no junk
        heuristic function is supplied).
    """
    return difflib.SequenceMatcher(isjunk=None, a=string1, b=string2).ratio()

# Compares two strings character-wise...
# but the resulting score does not seem very meaningful as a quality metric.

      

In [None]:
# Build the default settings and append the model type to both directories
# (with the defaults above: "data/gpt2" and "saved_models/gpt2").
args = Arguments()
args.data_dir = f"{args.data_dir}/{args.model_type}"
args.ckpt_dir = f"{args.ckpt_dir}/{args.model_type}"
# Inference needs a trained checkpoint, so fail early if none is configured.
assert args.ckpt_name is not None, "Please specify the trained model checkpoint."
# Loads tokenizer, model and checkpoint, then evaluates on valid_utters.json.
manager = Manager(args)
manager.test()