## Imports

In [1]:
import math
import os
import pprint
import logging

import nltk
import numpy as np
import torch
from tqdm.auto import tqdm

import transformers
from accelerate import Accelerator
from filelock import FileLock
from transformers import AdamW, get_scheduler, set_seed

from transformers.file_utils import is_offline_mode
from transformers.utils.versions import require_version

from args import parse_args
from data_loader import raw_data_loader, data_processor
from model_loader import model_loader
from rouge_s import py_rouge_scores
from scoring import bleu_scores, meteor_scores
from utils import label_smoothed_nll_loss, postprocess_text

  from .autonotebook import tqdm as notebook_tqdm


## Prep Model

In [2]:
class Args:
    """Static run configuration used in place of parsed command-line arguments.

    The single instance ``args`` created below is read by the cells of this
    notebook and is handed as a whole to the project helpers
    ``raw_data_loader``, ``model_loader`` and ``data_processor``.
    """

    # Paths and dataset columns.
    output_dir = "./output/run_mask_finetune_dialogsum500-1k_bart_base"
    train_file = "./data/samsum/train_small.csv"
    validation_file = "./data/samsum/val_small.csv"
    test_file = "./data/samsum/test_small.csv"
    text_column = "dialogue"
    summary_column = "summary"

    # Checkpoint to load; this is the "best" sub-directory of output_dir.
    model_name_or_path = "./output/run_mask_finetune_dialogsum500-1k_bart_base/best"
    model_type = "bart"
    source_prefix = ""

    # Sequence-length limits.
    max_source_length = 1024
    min_target_length = 1
    max_target_length = 128

    # Optimisation settings.
    learning_rate = 5e-5
    weight_decay = 1e-3
    label_smoothing = 0.1
    length_penalty = 1.0
    num_train_epochs = 4
    per_device_train_batch_size = 1
    gradient_accumulation_steps = 16
    per_device_eval_batch_size = 1
    per_device_test_batch_size = 1
    num_warmup_steps = 0
    cache_dir = "./output/cache"
    seed = 12345

    ignore_pad_token_for_loss = True
    preprocessing_num_workers = None
    # Assigned exactly once. An earlier duplicate assignment (True) was dead
    # code because this later one overrode it; the effective value is kept.
    overwrite_cache = None
    num_beams = None
    pad_to_max_length = True
    config_name = None
    # NOTE(review): model_type is "bart" while the tokenizer name is "t5-base"
    # -- confirm this combination is intended.
    tokenizer_name = "t5-base"
    use_slow_tokenizer = True
    max_train_steps = None
    lr_scheduler_type = "linear"
    shuffle = False


args = Args()

In [3]:
# ---------- Logging setup ----------
# Configure the root logger: timestamped records, INFO level.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
LOG_DATE_FORMAT = "%m/%d/%Y %H:%M:%S"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)

# Module-level logger used by the cells below.
logger = logging.getLogger(__name__)

In [4]:
# ---------- Pre-check package info ----------
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")


def _ensure_punkt_available():
    """Look up the NLTK ``punkt`` tokenizer data and download it when the lookup fails.

    Raises:
        LookupError: if the data is missing and ``is_offline_mode()`` is true.
    """
    try:
        nltk.data.find("tokenizers/punkt")
    except (LookupError, OSError):
        if is_offline_mode():
            raise LookupError(
                "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
            )
        # The download runs while holding the ".lock" file lock.
        with FileLock(".lock"):
            nltk.download("punkt", quiet=True)


_ensure_punkt_available()

In [None]:
# = = = = = = = = = = = = = Main Process = = = = = = = = = = = = = = = = = =
# This cell creates the accelerator, configures per-process logging, seeds the
# run, loads the raw data / model / dataloaders through the project helpers and
# finally reloads config, tokenizer and model from the "<output_dir>/best" directory.

# Initialize the accelerator. The accelerator will handle device placement for us.
accelerator = Accelerator()
logger.info(accelerator.state)

# Setup logging, we only want one process per machine to log things on the screen.
# accelerator.is_local_main_process is only True for one process per machine.
logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
if accelerator.is_local_main_process:
    #datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()
else:
    #datasets.utils.logging.set_verbosity_error()
    transformers.utils.logging.set_verbosity_error()

# If passed along, set the training seed now.
if args.seed is not None:
    set_seed(args.seed)
    # cuDNN is switched off altogether, its benchmark mode is disabled and the
    # deterministic flag is set.
    torch.backends.cudnn.enabled = False 
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Only the main process creates the output directory; every process then
# calls accelerator.wait_for_everyone().
if accelerator.is_main_process:
    if args.output_dir is not None:
        os.makedirs(args.output_dir, exist_ok=True)
accelerator.wait_for_everyone()

# load raw dataset
raw_datasets = raw_data_loader(args)

# load model (config, tokenizer, s2s model)
config, tokenizer, model = model_loader(accelerator, logger, args)

# data processor (for DataLoader)
dataloader, processed_dataset = data_processor(logger, args, accelerator, raw_datasets, tokenizer, model)
train_dataloader, eval_dataloader, test_dataloader = dataloader
# Only the first entry of processed_dataset is kept; the other two are discarded
# (presumably the eval/test counterparts -- confirm against data_processor).
train_dataset, _, _ = processed_dataset

# Reload config, tokenizer and model from "<output_dir>/best" and pass the
# reloaded model through accelerator.prepare().
# NOTE(review): Args.model_name_or_path is the same directory string, so this
# reload may duplicate what model_loader already did -- confirm.
unwrapped_model = accelerator.unwrap_model(model)
config          = config.from_pretrained(args.output_dir+'/best')
tokenizer       = tokenizer.from_pretrained(args.output_dir+'/best', config=config)
unwrapped_model = unwrapped_model.from_pretrained(args.output_dir+'/best', config=config)
model           = accelerator.prepare(unwrapped_model)

## Define Functions

In [7]:
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  = EVAL =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
def generate_new(input_text):
    """Generate text for a single prompt with the notebook-level model.

    Relies on the notebook-level ``model``, ``tokenizer``, ``accelerator``
    and ``args`` objects.

    Args:
        input_text: Prompt string; it is truncated / padded to
            ``args.max_source_length`` tokens.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    model.eval()

    # Use the accelerator's device rather than a hard-coded "cuda:0", so the
    # cell also runs on CPU or on a different GPU index.
    device = accelerator.device

    with torch.no_grad():
        tokenized = tokenizer([input_text], max_length=args.max_source_length, padding='max_length', truncation=True)
        inputs = torch.tensor(tokenized["input_ids"], device=device)
        mask = torch.tensor(tokenized["attention_mask"], device=device)

        generated_tokens = accelerator.unwrap_model(model).generate(inputs, attention_mask=mask)
        generated_tokens = accelerator.pad_across_processes(generated_tokens, dim=1, pad_index=tokenizer.pad_token_id)
        generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
        # Single-prompt batch: only the first decoded string is returned.
        dialogue_output = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    return dialogue_output

In [8]:
def string_overlap(summary_list, utterance_list):
    """Count the entries of ``utterance_list`` that also occur in ``summary_list``.

    An entry repeated in ``utterance_list`` is counted once per occurrence.
    """
    return sum(1 for token in utterance_list if token in summary_list)

In [9]:
def length_bucket(utterance_list):
    """Label an utterance by its number of items: "S" (<= 4), "M" (5 to 10) or "L" (> 10)."""
    size = len(utterance_list)

    if size > 10:
        return "L"
    return "S" if size <= 4 else "M"

In [10]:
def generate_input(summary, dialogue, i):
    """Build the model prompt asking for a replacement of line ``i`` of ``dialogue``.

    The prompt contains the summary, the dialogue with line ``i`` replaced by
    ``<mask>``, the speaker of the masked line, the word overlap between the
    summary and the masked utterance, and a length label for that utterance.

    Args:
        summary: Summary text of the dialogue.
        dialogue: Dialogue text, one ``speaker: utterance`` line per turn,
            separated by newlines.
        i: Index of the line to mask.

    Returns:
        The prompt string.

    Raises:
        IndexError: if ``i`` is outside the dialogue's lines.
    """
    dialogue_sep = dialogue.split('\n')

    # Split on the FIRST colon only. Utterances can themselves contain colons
    # (e.g. "at 5:30"); splitting on every colon would truncate the target
    # text and skew the overlap / length information. A line without any
    # colon yields an empty target instead of raising.
    speaker, _, target = dialogue_sep[i].partition(':')

    summary_words = summary.split()
    target_words = target.split()

    overlap = string_overlap(summary_words, target_words)
    total = len(summary_words)

    masked_lines = dialogue_sep.copy()
    masked_lines[i] = "<mask>"

    input_str = "Summary - " + summary + "\n" + "Dialogue - \n"
    speaker_info = "Speaker - " + speaker + "\n"
    add_info = "Overlap - " + str(overlap) + ", Total - " + str(total) + "\n"
    length_info = "Length - " + length_bucket(target_words)

    return input_str + '\n'.join(masked_lines) + '\n\n' + speaker_info + add_info + length_info

In [11]:
def generate_new_dialogue(dialogue, i, new_utterance):
    """Return ``dialogue`` with the utterance on line ``i`` replaced.

    The speaker prefix of line ``i`` (the text before its first colon) is kept
    and ``new_utterance`` is appended after a colon.

    Args:
        dialogue: Dialogue text, one turn per newline-separated line.
        i: Index of the line to replace.
        new_utterance: Replacement text; it is appended verbatim after the
            colon, except that newlines are turned into spaces.

    Returns:
        The updated dialogue as a single newline-joined string.

    Raises:
        IndexError: if ``i`` is outside the dialogue's lines.
    """
    dialogue_sep = dialogue.split('\n')

    # Only the speaker is needed; the old utterance is not parsed, so a line
    # without a colon no longer raises.
    speaker = dialogue_sep[i].partition(':')[0]

    # One line per turn: an embedded newline would add a spurious turn the
    # next time the dialogue is split on '\n'.
    single_line = new_utterance.replace('\n', ' ')

    dialogue_sep[i] = speaker + ":" + single_line
    return '\n'.join(dialogue_sep)

## Generate New Dialogues

In [12]:
import random
import json

In [13]:
file_path = "data/dialogsum/dialogsum.train.jsonl"

In [14]:
# Read the JSON-lines file: one JSON object per non-blank line.
# The encoding is given explicitly so the result does not depend on the
# platform default, and blank lines (e.g. a trailing newline) are skipped
# instead of crashing json.loads.
with open(file_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Parallel lists, index-aligned with `data`.
id_list       = [sample['fname'] for sample in data]
dialogue_list = [sample['dialogue'] for sample in data]
summary_list  = [sample['summary'] for sample in data]

In [15]:
dialogue = dialogue_list[123]
print(dialogue)

#Person1#: Do you have any experience working with a computer?
#Person2#: Yes. I have been a data entry operator for three years.
#Person1#: What kind of software can you use?
#Person2#: I have working knowledge of Windows and Dos. Actually, I'm quite familiar with both Java and C Programming Languages.
#Person1#: Do you have any other computer qualifications?
#Person2#: I have an ACRE certificate, GRADE 2.
#Person1#: Do you know how to use a PC to process the management information?
#Person2#: I'm sorry to say I'm not familiar with processing management information, but I'm sure I could learn quite quickly. It can't be too difficult, and I've got a quick mind. I can handle any problem you give me.


In [16]:
summary = summary_list[123]
print(summary)

#Person1# interviews #Person2# who has been a data entry operator for three years. #Person2# knows how to use the software, has computer qualifications, and can learn quite quickly.


In [17]:
i = 2

In [18]:
model_input = generate_input(summary, dialogue, i)
print(model_input)

Summary - #Person1# interviews #Person2# who has been a data entry operator for three years. #Person2# knows how to use the software, has computer qualifications, and can learn quite quickly.
Dialogue - 
#Person1#: Do you have any experience working with a computer?
#Person2#: Yes. I have been a data entry operator for three years.
<mask>
#Person2#: I have working knowledge of Windows and Dos. Actually, I'm quite familiar with both Java and C Programming Languages.
#Person1#: Do you have any other computer qualifications?
#Person2#: I have an ACRE certificate, GRADE 2.
#Person1#: Do you know how to use a PC to process the management information?
#Person2#: I'm sorry to say I'm not familiar with processing management information, but I'm sure I could learn quite quickly. It can't be too difficult, and I've got a quick mind. I can handle any problem you give me.

Speaker - #Person1#
Overlap - 1, Total - 28
Length - M


In [19]:
output = generate_new(model_input)
print(output)

 Do you have any experience in using the software?


In [20]:
new_dialogue = generate_new_dialogue(dialogue, i, output)
print(new_dialogue)

#Person1#: Do you have any experience working with a computer?
#Person2#: Yes. I have been a data entry operator for three years.
#Person1#: Do you have any experience in using the software?
#Person2#: I have working knowledge of Windows and Dos. Actually, I'm quite familiar with both Java and C Programming Languages.
#Person1#: Do you have any other computer qualifications?
#Person2#: I have an ACRE certificate, GRADE 2.
#Person1#: Do you know how to use a PC to process the management information?
#Person2#: I'm sorry to say I'm not familiar with processing management information, but I'm sure I could learn quite quickly. It can't be too difficult, and I've got a quick mind. I can handle any problem you give me.


## Replace ONE utterance

In [77]:
list_len = len(dialogue_list)
list_len

12460

In [None]:
# For every dialogue, regenerate ONE randomly chosen utterance and collect the
# result as a new sample that keeps the original summary.
new_list = []

for i in tqdm(range(list_len), desc="Replace one utterance"):
    dialogue = dialogue_list[i]
    summary = summary_list[i]

    # Pick one line of the dialogue uniformly at random.
    length = len(dialogue.split('\n'))
    j = random.randint(0, length-1)

    model_input = generate_input(summary, dialogue, j)
    # Keep the generated utterance on a single line: dialogues are split on
    # '\n', so an embedded newline would create a spurious extra turn.
    output = generate_new(model_input).replace('\n', " ")
    new_dialogue = generate_new_dialogue(dialogue, j, output)

    new_list.append({
        'fname': 'extra_' + str(i),
        'dialogue': new_dialogue,
        'summary': summary,
    })

In [79]:
x = 12345

In [80]:
print(new_list[x]['dialogue'])

#Person1#: Do you think Mr. Becket would be qualified for this job?
#Person2#: Mr. Becket? I'm not sure. He is a nice fellow, of course, and easy to get along with. But I doubt his professional expertise. I want someone who can get the job done.


In [81]:
print(dialogue_list[x])

#Person1#: Who do you think should get the job? How about Mr. Becket?
#Person2#: Mr. Becket? I'm not sure. He is a nice fellow, of course, and easy to get along with. But I doubt his professional expertise. I want someone who can get the job done.


In [82]:
print(new_list[x]['summary'])

#Person2# doesn't think Mr. Becket is qualified for the job


In [83]:
print(summary_list[x])

#Person2# doesn't think Mr. Becket is qualified for the job


In [84]:
data[0]

{'fname': 'train_0',
 'dialogue': "#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?\n#Person2#: I found it would be a good idea to get a check-up.\n#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.\n#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?\n#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.\n#Person2#: Ok.\n#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?\n#Person2#: Yes.\n#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.\n#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.\n#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.\n#Person2#: Ok, thanks doctor.",
 'summary': "Mr. Smi

In [85]:
new_list[x]

{'fname': 'extra_12345',
 'dialogue': "#Person1#: Do you think Mr. Becket would be qualified for this job?\n#Person2#: Mr. Becket? I'm not sure. He is a nice fellow, of course, and easy to get along with. But I doubt his professional expertise. I want someone who can get the job done.",
 'summary': "#Person2# doesn't think Mr. Becket is qualified for the job"}

## Generate Fixed Utterance Lists

In [69]:
def utterance_list_from_length(length, mask_perc):
    """Draw a random, ascending list of indices from ``range(length)``.

    ``int(length * mask_perc)`` indices are requested, with a minimum of one;
    the result can never hold more than ``length`` indices.
    """
    sample_size = max(int(length * mask_perc), 1)

    indices = np.arange(length)
    np.random.shuffle(indices)

    return sorted(indices[:sample_size].tolist())

In [71]:
utterance_list_from_length(15, 0.4)

[2, 3, 6, 10, 11, 14]

In [72]:
list_len = len(dialogue_list)
list_len

12460

In [73]:
# One index list per dialogue: the positions of the utterances to replace,
# drawn with a mask ratio of 0.4 over the dialogue's number of lines.
utterance_list_arr = []

for i in range(list_len):
    turn_count = len(dialogue_list[i].split('\n'))
    utterance_list_arr.append(utterance_list_from_length(turn_count, 0.4))

In [78]:
# with open("fixed_utterance_list.json", "w") as f:
#     json.dump(utterance_list_arr, f)

## Replace 30-40%

In [22]:
# Load the stored index lists (presumably the output of the commented-out
# json.dump of utterance_list_arr -- confirm the file's provenance).
with open("fixed_utterance_list.json") as utterance_file:
    fixed_utterance_list = json.load(utterance_file)

In [None]:
# For each of the first NUM_DIALOGUES dialogues, regenerate every utterance
# listed in fixed_utterance_list and collect the result as a new sample.
NUM_DIALOGUES = 1000

new_list = []

for i in tqdm(range(NUM_DIALOGUES), desc="Replace 30-40%"):
    dialogue = dialogue_list[i]
    summary = summary_list[i]

    # Utterances are replaced one at a time; each prompt is built from the
    # dialogue as modified so far, so later replacements see earlier ones.
    for j in fixed_utterance_list[i]:
        model_input = generate_input(summary, dialogue, j)
        output = generate_new(model_input)
        # Keep one line per turn.
        output = output.replace('\n'," ")
        dialogue = generate_new_dialogue(dialogue, j, output)

    new_list.append({
        'fname': 'extra_' + str(i),
        'dialogue': dialogue,
        'summary': summary,
    })

In [25]:
x = 69

In [26]:
print(new_list[x]['dialogue'])

#Person1#: Hello. I would like to ask you about the customer service.
#Person2#: Yes, I have a problem with the vacuum. It is broken.
#Person1#: Is it under warranty?
#Person2#: I think so. I bought it four months ago.
#Person1#: Yes, it is still covered by our warranty. Tell me the mode number of your vacuum, please.
#Person2#: Okay. The model number is 6594 - c.
#Person1#: What is the name of your office?
#Person2#: 906 Ottawa street. My name is David Yang. My phone number is 713-786-0234.
#Person1#: Okay. There are two Customer Service Offices in your area. The nearest one is Chadwick and Hacks Appliances.
#Person2#: Could you tell me where the office is located?
#Person1#: Sure. 878 Fennel South.
#Person2#: Okay. I will call them right away.
#Person1#: Please let me know when you have the time.
#Person2#: Okay. Thank you for your help.
#Person1#: My pleasure.


In [27]:
print(dialogue_list[x])

#Person1#: Hi. This is the Customer Service. How can I help you?
#Person2#: Hi. I bought one of your vacuums from spend-wart. It's broken now.
#Person1#: Is it under warranty?
#Person2#: I think so. I bought it four months ago.
#Person1#: Yes, it is still covered by our warranty. Tell me the mode number of your vacuum, please.
#Person2#: Okay. The model number is 6594 - c.
#Person1#: What's your address, your name and your phone number?
#Person2#: 906 Ottawa street. My name is David Yang. My phone number is 713-786-0234.
#Person1#: Okay. There are two Customer Service Offices in your area. The nearest one is Chadwick and Hacks Appliances.
#Person2#: Could you tell me where the office is located?
#Person1#: Sure. 878 Fennel South.
#Person2#: Oh, I know that place. It's only two minutes drive.
#Person1#: You have to call the office first.
#Person2#: All right. Thank you very much for your help.
#Person1#: My pleasure.


In [28]:
print(new_list[x]['summary'])

#Person2# phones the Customer Service because #Person2#'s vacuum's broken. #Person1# answers the phone, asks for more details, and tells #Person1# the location of the nearest Customer Service Office.


In [29]:
print(summary_list[x])

#Person2# phones the Customer Service because #Person2#'s vacuum's broken. #Person1# answers the phone, asks for more details, and tells #Person1# the location of the nearest Customer Service Office.


In [30]:
data[0]

{'fname': 'train_0',
 'dialogue': "#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?\n#Person2#: I found it would be a good idea to get a check-up.\n#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.\n#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?\n#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.\n#Person2#: Ok.\n#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?\n#Person2#: Yes.\n#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.\n#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.\n#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.\n#Person2#: Ok, thanks doctor.",
 'summary': "Mr. Smi

In [31]:
new_list[x]

{'fname': 'extra_69',
 'dialogue': '#Person1#: Hello. I would like to ask you about the customer service.\n#Person2#: Yes, I have a problem with the vacuum. It is broken.\n#Person1#: Is it under warranty?\n#Person2#: I think so. I bought it four months ago.\n#Person1#: Yes, it is still covered by our warranty. Tell me the mode number of your vacuum, please.\n#Person2#: Okay. The model number is 6594 - c.\n#Person1#: What is the name of your office?\n#Person2#: 906 Ottawa street. My name is David Yang. My phone number is 713-786-0234.\n#Person1#: Okay. There are two Customer Service Offices in your area. The nearest one is Chadwick and Hacks Appliances.\n#Person2#: Could you tell me where the office is located?\n#Person1#: Sure. 878 Fennel South.\n#Person2#: Okay. I will call them right away.\n#Person1#: Please let me know when you have the time.\n#Person2#: Okay. Thank you for your help.\n#Person1#: My pleasure.',
 'summary': "#Person2# phones the Customer Service because #Person2#'s v

In [32]:
# Save the first 500 generated samples, one JSON object per line.
with open('data/dialogsum/dialogsum.finetune0-500.jsonl', 'w') as outfile:
    outfile.writelines(json.dumps(sample) + '\n' for sample in new_list[:500])

In [33]:
file_path = 'data/dialogsum/dialogsum.finetune0-500.jsonl'

# Read the JSON-lines file back, one object per line.
# Note: this rebinds data / id_list / dialogue_list / summary_list, which the
# earlier cells filled from the original training file.
with open(file_path, 'r') as f:
    data = list(map(json.loads, f))

id_list       = [record['fname'] for record in data]
dialogue_list = [record['dialogue'] for record in data]
summary_list  = [record['summary'] for record in data]

In [34]:
print(dialogue_list[x])

#Person1#: Hello. I would like to ask you about the customer service.
#Person2#: Yes, I have a problem with the vacuum. It is broken.
#Person1#: Is it under warranty?
#Person2#: I think so. I bought it four months ago.
#Person1#: Yes, it is still covered by our warranty. Tell me the mode number of your vacuum, please.
#Person2#: Okay. The model number is 6594 - c.
#Person1#: What is the name of your office?
#Person2#: 906 Ottawa street. My name is David Yang. My phone number is 713-786-0234.
#Person1#: Okay. There are two Customer Service Offices in your area. The nearest one is Chadwick and Hacks Appliances.
#Person2#: Could you tell me where the office is located?
#Person1#: Sure. 878 Fennel South.
#Person2#: Okay. I will call them right away.
#Person1#: Please let me know when you have the time.
#Person2#: Okay. Thank you for your help.
#Person1#: My pleasure.


In [35]:
print(summary_list[x])

#Person2# phones the Customer Service because #Person2#'s vacuum's broken. #Person1# answers the phone, asks for more details, and tells #Person1# the location of the nearest Customer Service Office.
