<a href="https://colab.research.google.com/github/heimmer/AutoField/blob/master/DialoGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import warnings
warnings.filterwarnings('ignore')

In [15]:
! pip -q install transformers

In [16]:
import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple

import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from pathlib import Path

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

# Configs
logger = logging.getLogger(__name__)

MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

# Load data

In [17]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [18]:
import os
os.chdir("/content/drive/MyDrive/Colab Notebooks/CS6493-Project-DialoGPT")

# Args

In [19]:
# Args to allow for easy convertion of python script to notebook
class Args():
    def __init__(self):
        self.output_dir = 'output-small'
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-small'
        self.config_name = 'microsoft/DialoGPT-small'
        self.tokenizer_name = 'microsoft/DialoGPT-small'
        self.cache_dir = 'cached'
        self.block_size = 512
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False
        self.per_gpu_train_batch_size = 4
        self.per_gpu_eval_batch_size = 4
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 3
        self.max_steps = -1
        self.warmup_steps = 0
        self.logging_steps = 1000
        self.save_steps = 3500
        self.save_total_limit = None
        self.eval_all_checkpoints = False
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 42
        self.local_rank = -1
        self.fp16 = False
        self.fp16_opt_level = 'O1'
        self.data_dir = '/content/drive/MyDrive/Colab Notebooks/CS6493-Project-DialoGPT/dialogues_text.txt'
        self.test_size = 0.1
args = Args()

# tokenizer

In [20]:
from transformers import AutoModelWithLMHead, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name ,padding_side='right')
model = AutoModelWithLMHead.from_pretrained(args.model_name_or_path)

# Data preprogress

In [21]:
def dataframe_prepare(data_dir = args.data_dir,test_size = args.test_size):
    '''Prepare the dataframe for training and validation'''
    dataset = []
    with open(data_dir, encoding='utf-8') as f:
        for line in f:
            dataset.append([i for i in line.strip('\n').split('__eou__') if i != '']) # remove empty string, and split by __eou__
    trn_df, val_df = train_test_split(dataset[:len(dataset)//10], test_size = 0.1) # run on smaller dataset for now
    return trn_df, val_df, dataset

trn_df, val_df, df = dataframe_prepare()


def construct_conv(row, tokenizer, eos = True):
    flatten = lambda l: [item for sublist in l for item in sublist]
    conv = list([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in row]) # no need to reverse, it's already in contextalized order
    conv = flatten(conv)
    return conv



class ConversationDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):

        # df = dataframe_prepare(args.data_dir)

        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size)
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:  # when cache exist and arg specify that we dont need to create new cache
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            for row in df:
                conv = construct_conv(row, tokenizer)
                self.examples.append(conv)

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)
    


In [22]:
type(val_df)
trn_df[1]
len(trn_df)

1179

# generation

In [32]:
def calculate_bleu(reference, generated, n=4):
    reference_tokens = [tokenizer(reference)]
    print(reference_tokens)
    generated_tokens = tokenizer(generated)
    print(generated_tokens)

    weights = [1/n] * n
    smoothing_function = SmoothingFunction().method1
    bleu_score = sentence_bleu(reference_tokens, generated_tokens, weights=weights, smoothing_function=smoothing_function)
    return bleu_score

In [33]:
reference = "I'll throw out the garbage"
output_text = "I'm not sure what you're talking about."
b1 = calculate_bleu(reference, output_text, n=1)
b1

[{'input_ids': [40, 1183, 3714, 503, 262, 15413], 'attention_mask': [1, 1, 1, 1, 1, 1]}]
{'input_ids': [40, 1101, 407, 1654, 644, 345, 821, 3375, 546, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


1.0

In [25]:
ref = [[]]
predict = [[]]
for dia in range(len(df)):
  dialogue = df[dia]
  for i in range(len(dialogue)-1): # n sentences only contain n-1 response
    input = tokenizer.encode(dialogue[i] + tokenizer.eos_token, return_tensors='pt')
    # print('input:',tokenizer.decode(input[0],skip_special_tokens=True))
    history = torch.cat([history, input], dim=-1) if i > 0 else input
    print('input:',tokenizer.decode(history[0],skip_special_tokens=True))
    history_generate = model.generate(
                      history, max_length=1000,
                      pad_token_id=tokenizer.eos_token_id
                      )
    # print('history+generate:',tokenizer.decode(history_generate[0],skip_special_tokens=True))
    # print("generate: {}".format(tokenizer.decode(history_generate[:, history.shape[-1]:] [0], skip_special_tokens=True)))

    #save the generated sentence and it's reference
    output = history_generate[:, history.shape[-1]:][0]
    output_text = tokenizer.decode(output, skip_special_tokens=True)
    print('prediction:',output_text)
    reference = dialogue[i+1]
    print('reference:',reference)
    # ref[dia].append(input.tolist())
    bleu_1 = calculate_bleu(reference, output_text, n=1)
    bleu_2 = calculate_bleu(reference, output_text, n=2)
    print('bleu_1:',bleu_1)
    print('bleu_2:',bleu_2)
    print('----------------------------------------')


        


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


input: The kitchen stinks. 


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


prediction: I'm not sure what you're talking about.
reference:  I'll throw out the garbage . 
bleu_1: 1.0
bleu_2: 1.0
----------------------------------------
input: So Dick, how about getting some coffee for tonight? 


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


prediction: I'm not sure if you're being sarcastic or not.
reference:  Coffee ? I don ’ t honestly like that kind of stuff . 
bleu_1: 1.0
bleu_2: 1.0
----------------------------------------
input: So Dick, how about getting some coffee for tonight?  Coffee? I don ’ t honestly like that kind of stuff. 


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


prediction: I'm going to go with coffee.
reference:  Come on , you can at least try a little , besides your cigarette . 
bleu_1: 1.0
bleu_2: 1.0
----------------------------------------
input: So Dick, how about getting some coffee for tonight?  Coffee? I don ’ t honestly like that kind of stuff.  Come on, you can at least try a little, besides your cigarette. 


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


prediction: I'm not sure what to do with my hands.
reference:  What ’ s wrong with that ? Cigarette is the thing I go crazy for . 
bleu_1: 1.0
bleu_2: 1.0
----------------------------------------
input: So Dick, how about getting some coffee for tonight?  Coffee? I don ’ t honestly like that kind of stuff.  Come on, you can at least try a little, besides your cigarette.  What ’ s wrong with that? Cigarette is the thing I go crazy for. 


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


prediction: I'm not sure what you're talking about.
reference:  Not for me , Dick . 
bleu_1: 1.0
bleu_2: 1.0
----------------------------------------
input: Are things still going badly with your houseguest? 


KeyboardInterrupt: ignored

# Evaluation

一点背景知识  

- 使用BLEU（Bilingual Evaluation Understudy）和ROUGE（Recall-Oriented Understudy for Gisting Evaluation）等自然语言处理指标来度量生成的文本与参考文本之间的相似性。

- 计算Distinct值度量生成的回复的多样性。Distinct值的计算方法是统计生成的回复中不同短语的比例。这个指标越高，代表生成的回复越有多样性。

因为是多轮对话，做评估的时候，要把eg五轮产生的回答放在一起

In [84]:
data = {
  "dialogue_id": "1",
  "turns": [
    {
      "turn_id": "1",
      "ref": "Hello! How are you doing today?",
      "translation": "你好！你今天过得如何？"
    },
    {
      "turn_id": "2",
      "ref": "I'm doing pretty well, thanks for asking. How about you?",
      "translation": "我过得还挺好的，谢谢你。你呢？"
    },
    {
      "turn_id": "3",
      "ref": "I'm doing okay. What are you up to today?",
      "translation": "我还好，你今天有什么安排？"
    }
  ]
}

from nltk.translate.bleu_score import sentence_bleu

# 获取所有参考翻译
refs = [[turn['ref'].split()] for turn in data['turns']]
# 获取所有机器翻译结果
hypotheses = [turn['translation'].split() for turn in data['turns']]

# 计算 BLEU 分数
score = corpus_bleu(refs, hypotheses)
print(refs)
print(hypotheses)


[[['Hello!', 'How', 'are', 'you', 'doing', 'today?']], [["I'm", 'doing', 'pretty', 'well,', 'thanks', 'for', 'asking.', 'How', 'about', 'you?']], [["I'm", 'doing', 'okay.', 'What', 'are', 'you', 'up', 'to', 'today?']]]
[['你好！你今天过得如何？'], ['我过得还挺好的，谢谢你。你呢？'], ['我还好，你今天有什么安排？']]


In [91]:
# 测试多轮中，后产生的答案，能不能参考之前的context
answer = [[['Hello!', 'How', 'are','hahaha']], 
      [["I'm", 'doing', 'pretty', 'well,', 'thanks', 'for', 'asking.', 'How', 'about', 'you?']], 
      [["I'm", 'doing', 'okay.', 'What', 'are', 'you', 'up', 'to', 'today?']]
      ]
pre = [['How'], ['doing'], ['doing']]
a = corpus_bleu(answer, pre, weights=(1, 0,0,0))
a

0.0012726338013398079

In [97]:
# 测试多轮中，后产生的答案，能不能参考之前的context
answer = [[['Hello!', 'How', 'are','hahaha']] 
      ]
pre = [['How','are','hahaha']]
a = corpus_bleu(answer, pre, weights=(1, 0,0,0))
a

0.7165313105737893

In [14]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [31]:
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [61]:
!pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 KB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.11.4


In [32]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

In [46]:
reference = ['I', 'love', 'to', 'play', 'soccer']
generated = ['I', 'like', 'playing', 'soccer']


In [59]:
from nltk.translate.bleu_score import sentence_bleu
reference = [['this', 'is', 'small', 'test']]
candidate = ['this', 'is', 'a', 'test']
score1 = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
score2 = sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0))
score3 = sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0))
score4 = sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25))
print(score1)
print(score2)
print(score3)
print(score4)

0.75
0.49999999999999994
1.8877473323743118e-102
1.0547686614863434e-154


In [74]:
from torchmetrics.functional import bleu_score
preds = ['cat is on the mat']
target = [['there is a cat on the mat', 'a cat is on the mat']]
bleu_score(preds, target,weights=[1,0,0,0])


tensor(0.8187)

In [81]:
######################################
import nltk
from nltk.translate.bleu_score import corpus_bleu

# 模型翻译结果
hypotheses = [['The', 'cat', 'is', 'on', 'the', 'mat']]
# 参考翻译结果
references = [[['The', 'cat', 'is', 'sitting', 'on', 'the', 'mat'],
               ['The', 'cat', 'is', 'on', 'the', 'rug']]]

# 计算BLEU分数
score = corpus_bleu(references, hypotheses,weights=(0.5, 0.5,0,0))
print(score)


1.0


In [37]:
rouge = Rouge()
rouge_score = rouge.get_scores(' '.join(generated), ' '.join(reference), avg=True)
print("ROUGE score:", rouge_score)


TypeError: ignored

In [19]:
unique_phrases = set(generated)
distinct_score = len(unique_phrases) / len(generated)
print("Distinct score:", distinct_score)


Distinct score: 1.0
