In [3]:
import torch
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig

In [4]:
hf_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(hf_dataset_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 12460/12460 [00:00<00:00, 50514.39 examples/s]
Generating validation split: 100%|██████████| 500/500 [00:00<00:00, 32716.88 examples/s]
Generating test split: 100%|██████████| 1500/1500 [00:00<00:00, 52566.35 examples/s]


In [14]:
example_indices = [40, 200]

dash_line = '-'.join('' for x in range(100))
print(dataset['test'][0].keys())

for i, index in enumerate(example_indices):

    print(dash_line)
    print('Example ', i+1)
    print(dash_line)
    print('Input Dialog: ')
    print(dataset['test'][index]['dialogue'])
    print(dash_line)
    print('Human Summary: ', end="")
    print(dataset['test'][index]['summary'])
    print(dash_line)
    print()

dict_keys(['id', 'dialogue', 'summary', 'topic'])
---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
Input Dialog: 
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
---------------------------------------------------------------------------------------------------
Human Summary: #Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
---------------------------------------------------------------------------------------------------

---------------------------------------------------------------

In [21]:
model_name = 'google/flan-t5-base'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [20]:
sentence = "What time is it, Tom?"

sentece_encoded = tokenizer.encode(sentence, return_tensors='pt')

sentence_decoded = tokenizer.decode(sentece_encoded[0], skip_special_tokens=True)

print("Encoded sentence: ", sentece_encoded)

print("Decoded token: ", sentence_decoded)

Encoded sentence:  tensor([[ 363,   97,   19,   34,    6, 3059,   58,    1]])
Decoded token:  What time is it, Tom?


In [44]:
# Define a printing helper function
def print_output(dialogue, summary, model_output):
    print("Dialoge: ")
    print(dialogue)
    print(dash_line)
    print("Summary: ", end="")
    print(summary)
    print("Model Output: ", end="")
    print(model_output)
    print(dash_line)
    print()


In [45]:
# See Model output without any prompt

for i, index in enumerate(example_indices):
    dialogue = dataset['test'][index]['dialogue']
    summary = dataset['test'][index]['summary']
    tokenized_dialogue = tokenizer(dialogue, return_tensors='pt')

    model_output = tokenizer.decode(model.generate(tokenized_dialogue['input_ids'], max_new_tokens=50)[0], skip_special_tokens=True)
    print_output(dialogue, summary, model_output)
    


Dialoge: 
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
---------------------------------------------------------------------------------------------------
Summary: #Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
Model Output: Person1: It's ten to nine.
---------------------------------------------------------------------------------------------------

Dialoge: 
#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would

In [48]:
# See model output with prompt to summarize the given input

for i, index in enumerate(example_indices):
    dialogue = dataset['test'][index]['dialogue']
    summary = dataset['test'][index]['summary']
    # input_prompt = prompt + dialogue
    input_prompt = f"""
    Summarize the following conversation:
    {dialogue}
    Summary: 
    """
    tokenized_dialogue = tokenizer(input_prompt , return_tensors='pt')

    model_output = tokenizer.decode(model.generate(tokenized_dialogue['input_ids'], max_new_tokens=50)[0], skip_special_tokens=True)
    print_output(input_prompt, summary, model_output)

Dialoge: 

    Summarize the following conversation:
    #Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
    Summary: 
    
---------------------------------------------------------------------------------------------------
Summary: #Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
Model Output: The train is about to leave.
---------------------------------------------------------------------------------------------------

Dialoge: 

    Summarize the following conversation:
    #Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program

In [60]:
def make_prompt(list_idx, dataset, idx_to_sum):
    output_prompt = ""
    for i, index in enumerate(list_idx):
        dialogue = dataset[index]['dialogue']
        summary = dataset[index]['summary']
        prompt = f"""Dialogue: 
                        {dialogue}
                     Summary: 
                        {summary}
                    """  
        output_prompt += prompt

    output_prompt += f"Dialoge: {dataset[idx_to_sum]['dialogue']} \n Summary: " 
    return output_prompt

print(make_prompt([30, 32], dataset['test'], 31))


Dialogue: 
                        #Person1#: Where are you going for your trip?
#Person2#: I think Hebei is a good place.
#Person1#: But I heard the north of China are experiencing severe sandstorms!
#Person2#: Really?
#Person1#: Yes, it's said that Hebes was experiencing six degree strong winds.
#Person2#: How do these storms affect the people who live in these areas?
#Person1#: The report said the number of people with respiratory tract infections tended to rise after sandstorms. The sand gets into people's noses and throats and creates irritation.
#Person2#: It sounds that sandstorms are trouble for everybody!
#Person1#: You are quite right.
                     Summary: 
                        #Person2# plans to have a trip in Hebei but #Person1# says there are sandstorms in there.
                    Dialogue: 
                        #Person1#: Where are you going for your trip?
#Person2#: I think Hebei is a good place.
#Person1#: But I heard the north of China are experiencing

In [61]:
input_prompt = make_prompt([20], dataset['test'], 21)
tokenized_input = tokenizer(input_prompt, return_tensors='pt')
model_output = tokenizer.decode(model.generate(tokenized_input['input_ids'], max_new_tokens=50)[0], skip_special_tokens=True)

print(input_prompt)
print(model_output)


Dialogue: 
                        #Person1#: What's wrong with you? Why are you scratching so much?
#Person2#: I feel itchy! I can't stand it anymore! I think I may be coming down with something. I feel lightheaded and weak.
#Person1#: Let me have a look. Whoa! Get away from me!
#Person2#: What's wrong?
#Person1#: I think you have chicken pox! You are contagious! Get away! Don't breathe on me!
#Person2#: Maybe it's just a rash or an allergy! We can't be sure until I see a doctor.
#Person1#: Well in the meantime you are a biohazard! I didn't get it when I was a kid and I've heard that you can even die if you get it as an adult!
#Person2#: Are you serious? You always blow things out of proportion. In any case, I think I'll go take an oatmeal bath.
                     Summary: 
                        #Person1# thinks #Person2# has chicken pox and warns #Person2# about the possible hazards but #Person2# thinks it will be fine.
                    Dialoge: #Person1#: Good coming. What ca