<a href="https://colab.research.google.com/github/heimmer/AutoField/blob/master/DialoGPT_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
! pip -q install transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m100.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
! pip -q install rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [5]:
import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple

import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from pathlib import Path

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.tokenize import word_tokenize




from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

# Configs
logger = logging.getLogger(__name__)

MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

In [6]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [7]:
import os
os.chdir("/content/drive/MyDrive/Colab Notebooks/CS6493-Project-DialoGPT")

# Args

In [27]:
size = 'large'
# Args to allow for easy convertion of python script to notebook
class Args():
    def __init__(self):
        self.output_dir = 'output-{}'.format(size)
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-{}'.format(size)
        self.config_name = 'microsoft/DialoGPT-{}'.format(size)
        self.tokenizer_name = 'microsoft/DialoGPT-{}'.format(size)
        self.cache_dir = 'cached'
        self.block_size = 512
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False
        self.per_gpu_train_batch_size = 4
        self.per_gpu_eval_batch_size = 4
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 3
        self.max_steps = -1
        self.warmup_steps = 0
        self.logging_steps = 1000
        self.save_steps = 3500
        self.save_total_limit = None
        self.eval_all_checkpoints = False
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 42
        self.local_rank = -1
        self.fp16 = False
        self.fp16_opt_level = 'O1'
        self.data_dir = '/content/drive/MyDrive/Colab Notebooks/CS6493-Project-DialoGPT/dialogues_text.txt'
        self.test_size = 0.1
args = Args()

# tokenizer

In [20]:
from transformers import AutoModelWithLMHead, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name ,padding_side='right')
model = AutoModelWithLMHead.from_pretrained(args.model_name_or_path)
model.to("cuda:0" if torch.cuda.is_available() else "cpu")

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout)

# Data load and preprogress

In [10]:
def dataframe_prepare(data_dir = args.data_dir,test_size = args.test_size):
    '''Prepare the dataframe for training and validation'''
    dataset = []
    with open(data_dir, encoding='utf-8') as f:
        for line in f:
            dataset.append([i for i in line.strip('\n').split('__eou__') if i != '']) # remove empty string, and split by __eou__
    trn_df, val_df = train_test_split(dataset[:len(dataset)//10], test_size = 0.1) # run on smaller dataset for now
    return trn_df, val_df, dataset

trn_df, val_df, df = dataframe_prepare()


def construct_conv(row, tokenizer, eos = True):
    flatten = lambda l: [item for sublist in l for item in sublist]
    conv = list([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in row]) # no need to reverse, it's already in contextalized order
    conv = flatten(conv)
    return conv



class ConversationDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):

        # df = dataframe_prepare(args.data_dir)

        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size)
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:  # when cache exist and arg specify that we dont need to create new cache
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            for row in df:
                conv = construct_conv(row, tokenizer)
                self.examples.append(conv)

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)
    


In [11]:
type(val_df)
trn_df[1]
len(df)

13118

# Evaluation Function

一点背景知识  

- 使用BLEU（Bilingual Evaluation Understudy）和ROUGE（Recall-Oriented Understudy for Gisting Evaluation）等自然语言处理指标来度量生成的文本与参考文本之间的相似性。

- 计算Distinct值度量生成的回复的多样性。Distinct值的计算方法是统计生成的回复中不同短语的比例。这个指标越高，代表生成的回复越有多样性。

In [12]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
def tokenize(text):
    return word_tokenize(text.lower())

def calculate_bleu(reference, generated, n=4):
    reference_tokens = [tokenize(reference)]
    generated_tokens = tokenize(generated)
    weights = [1/n] * n
    smoothing_function = SmoothingFunction().method1
    bleu_score = sentence_bleu(reference_tokens, generated_tokens, weights=weights, smoothing_function=smoothing_function)
    return bleu_score

def calculate_distinct_n(tokens, n=1):
    ngrams = []
    for i in range(len(tokens) - n + 1):
        ngrams.append(tuple(tokens[i:i+n]))
    return len(set(ngrams)) / len(ngrams)

def calculate_distinct(tokens):
    distinct_1 = calculate_distinct_n(tokens, n=1)
    distinct_2 = calculate_distinct_n(tokens, n=2)
    return distinct_1, distinct_2


def calculate_rouge(reference, generated):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, generated)
    rouge_1 = scores['rouge1'].fmeasure
    rouge_2 = scores['rouge2'].fmeasure
    rouge_l = scores['rougeL'].fmeasure
    return rouge_1, rouge_2, rouge_l


    

def calculate_bleu(reference, generated, n=4):
    reference_tokens = [tokenizer(reference)["input_ids"]]
    # print(reference_tokens)
    generated_tokens = tokenizer(generated)["input_ids"]
    # print(generated_tokens)

    weights = [1/n] * n
    smoothing_function = SmoothingFunction().method1
    bleu_score = sentence_bleu(reference_tokens, generated_tokens, weights=weights, smoothing_function=smoothing_function)
    return bleu_score

# Evaluation

In [None]:
import time
start_time = time.time()

import os
os.environ["PYTHONWARNINGS"] = "ignore::UserWarning:transformers.tokenization_utils_base"


total_bleu_1, total_bleu_2, total_bleu_4 = 0,0,0
total_rouge_1, total_rouge_2, total_rouge_l = 0,0,0
total_distinct_1, total_distinct_2 = 0,0
df_len = len(df)
print('Model:',args.model_name_or_path)
model.eval()
for dia in tqdm(range(df_len)):
  dialogue = df[dia]
  dia_bleu_1,dia_bleu_2,dia_bleu_4 = 0,0,0
  dia_rouge_1, dia_rouge_2, dia_rouge_l = 0,0,0 # 1 and L
  dia_len = len(dialogue)-1
  for i in range(dia_len): # n sentences only contain n-1 response
    input = tokenizer.encode(dialogue[i] + tokenizer.eos_token, return_tensors='pt')
    history = torch.cat([history, input], dim=-1) if i > 0 else input
    history_pt = history.to("cuda" if torch.cuda.is_available() else "cpu")
    # print('input:',tokenizer.decode(history[0],skip_special_tokens=True))


    history_generate = model.generate(
                      history_pt
                      , max_length=1000
                      ,pad_token_id=tokenizer.eos_token_id
                      )

    # print('history+generate:',tokenizer.decode(history_generate[0],skip_special_tokens=True))
    # print("generate: {}".format(tokenizer.decode(history_generate[:, history.shape[-1]:] [0], skip_special_tokens=True)))

    #save the generated sentence and it's reference
    output = history_generate[:, history.shape[-1]:][0]
    output_text = tokenizer.decode(output, skip_special_tokens=True)
    # print('prediction:',output_text)
    reference = dialogue[i+1]
    # print('reference:',reference)
    bleu_1 = calculate_bleu(reference, output_text, n=1)
    bleu_2 = calculate_bleu(reference, output_text, n=2)
    bleu_4 = calculate_bleu(reference, output_text, n=4)
    rouge_1, rouge_2, rouge_l = calculate_rouge(reference, output_text)

    dia_bleu_1 += bleu_1/dia_len
    dia_bleu_2 += bleu_2/dia_len
    dia_bleu_4 += bleu_4/dia_len
    dia_rouge_1 += rouge_1/dia_len
    dia_rouge_2 += rouge_2/dia_len
    dia_rouge_l += rouge_l/dia_len

    output_accum = (output_accum + output_text) if i>0 else output_text

  total_bleu_1 += dia_bleu_1/df_len
  total_bleu_2 += dia_bleu_2/df_len
  total_bleu_4 += dia_bleu_4/df_len
  total_rouge_1 += dia_rouge_1/df_len
  total_rouge_2 += dia_rouge_2/df_len
  total_rouge_l += dia_rouge_l/df_len
  dia_distinct_1, dia_distinct_2 = calculate_distinct(tokenize(output_accum))
  total_distinct_1 += dia_distinct_1/df_len
  total_distinct_2 += dia_distinct_2/df_len


  if dia%10==0 and dia>0: 
    end_time = time.time()
    print('{} dialogue finished, used {:.1f} min, current BLEU-1: {:.2f}%, BLEU-2: {:.2f}%, BLEU-4: {:.2f}%'.format(dia, (end_time-start_time)/60, total_bleu_1*df_len/dia*100, total_bleu_2*df_len/dia*100, total_bleu_4*df_len/dia*100) )
    print('ROUNGE-1: {:.2f}%, ROUNGE-2: {:.2f}%, ROUNGE-L: {:.2f}%, Distinct-1: {:.2f}%, Distinct-2: {:.2f}%'.format(total_rouge_1*df_len/dia*100, total_rouge_2*df_len/dia*100, total_rouge_l*df_len/dia*100, total_distinct_1*df_len/dia*100, total_distinct_2*df_len/dia*100) )
print('Model:',args.model_name_or_path)
print('{} dialogue finished, used {:.1f} min, current BLEU-1: {:.2f}%, BLEU-2: {:.2f}%, BLEU-4: {:.2f}%'.format(dia, (end_time-start_time)/60, total_bleu_1*df_len/dia*100, total_bleu_2*df_len/dia*100, total_bleu_4*df_len/dia*100) )
print('ROUNGE-1: {:.2f}%, ROUNGE-2: {:.2f}%, ROUNGE-L: {:.2f}%, Distinct-1: {:.2f}%, Distinct-2: {:.2f}%'.format(total_rouge_1*df_len/dia*100, total_rouge_2*df_len/dia*100, total_rouge_l*df_len/dia*100, total_distinct_1*df_len/dia*100, total_distinct_2*df_len/dia*100) )


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

3030 dialogue finished, used 78.2 min, current BLEU-1: 4.19%, BLEU-2: 1.68%, BLEU-4: 0.81%
ROUNGE-1: 10.98%, ROUNGE-2: 2.33%, ROUNGE-L: 10.38%, Distinct-1: 63.26%, Distinct-2: 82.34%


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

3040 dialogue finished, used 78.5 min, current BLEU-1: 4.19%, BLEU-2: 1.68%, BLEU-4: 0.81%
ROUNGE-1: 10.98%, ROUNGE-2: 2.33%, ROUNGE-L: 10.37%, Distinct-1: 63.23%, Distinct-2: 82.32%


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

3050 dialogue finished, used 78.8 min, current BLEU-1: 4.18%, BLEU-2: 1.68%, BLEU-4: 0.81%
ROUNGE-1: 10.97%, ROUNGE-2: 2.32%, ROUNGE-L: 10.36%, Distinct-1: 63.24%, Distinct-2: 82.33%


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

3060 dialogue finished, used 79.1 min, current BLEU-1: 4.18%, BLEU-2: 1.68%, BLEU-4: 0.81%
ROUNGE-1: 10.97%, ROUNGE-2: 2.33%, ROUNGE-L: 10.36%, Distinct-1: 63.22%, Distinct-2: 82.32%


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

3070 dialogue finished, used 79.4 min, current BLEU-1: 4.17%, BLEU-2: 1.68%, BLEU-4: 0.81%
ROUNGE-1: 10.96%, ROUNGE-2: 2.32%, ROUNGE-L: 10.35%, Distinct-1: 63.24%, Distinct-2: 82.34%


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

3080 dialogue finished, used 79.7 min, current BLEU-1: 4.17%, BLEU-2: 1.67%, BLEU-4: 0.81%
ROUNGE-1: 10.95%, ROUNGE-2: 2.32%, ROUNGE-L: 10.34%, Distinct-1: 63.24%, Distinct-2: 82.36%


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

3090 dialogue finished, used 80.0 min, current BLEU-1: 4.16%, BLEU-2: 1.67%, BLEU-4: 0.80%
ROUNGE-1: 10.94%, ROUNGE-2: 2.32%, ROUNGE-L: 10.33%, Distinct-1: 63.24%, Distinct-2: 82.35%


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

3100 dialogue finished, used 80.2 min, current BLEU-1: 4.16%, BLEU-2: 1.67%, BLEU-4: 0.80%
ROUNGE-1: 10.92%, ROUNGE-2: 2.31%, ROUNGE-L: 10.31%, Distinct-1: 63.24%, Distinct-2: 82.36%


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

In [23]:
print('{} dialogue finished, used {:.1f} min, current BLEU-1: {:.2f}%, BLEU-2: {:.2f}%, BLEU-4: {:.2f}%'.format(dia, (end_time-start_time)/60, total_bleu_1*df_len/dia*100, total_bleu_2*df_len/dia*100, total_bleu_4*df_len/dia*100) )
print('ROUNGE-1: {:.2f}%, ROUNGE-2: {:.2f}%, ROUNGE-L: {:.2f}%, Distinct-1: {:.2f}%, Distinct-2: {:.2f}%'.format(total_rouge_1*df_len/dia*100, total_rouge_2*df_len/dia*100, total_rouge_l*df_len/dia*100, total_distinct_1*df_len/dia*100, total_distinct_2*df_len/dia*100) )
print('Model:',args.model_name_or_path)

99 dialogue finished, used 1.5 min, current BLEU-1: 4.30%, BLEU-2: 1.79%, BLEU-4: 0.88%
ROUNGE-1: 10.33%, ROUNGE-2: 2.82%, ROUNGE-L: 9.84%, Distinct-1: 75.85%, Distinct-2: 90.14%
Model: microsoft/DialoGPT-medium
