In [4]:
# Upgrade pip itself before installing the pinned packages below.
%pip install --upgrade pip

# Install torch and torchdata at pinned versions.
# --disable-pip-version-check suppresses pip's "new version available" notice;
# --quiet reduces the install log written to the cell output.
%pip install --disable-pip-version-check torch==1.13.1 torchdata==0.5.1 --quiet

# Install transformers and datasets at pinned versions (quiet output).
# %pip (rather than !pip) targets the environment of the running kernel.
%pip install transformers==4.27.2 datasets==2.11.0 --quiet


[0m

In [5]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [6]:
# Name of the dataset passed to `load_dataset` below.
huggingface_dataset_name = "knkarthick/dialogsum"

# Load the dataset. Later cells read its 'test' split, whose rows are accessed
# through the 'dialogue' and 'summary' fields.
dataset = load_dataset(huggingface_dataset_name)



  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
def print_example(index, dash_length=80):
    """Print one example from the test split of the notebook-level ``dataset``.

    The output is a separator-framed header (``Example <index + 1>``), the
    input dialogue, the human-written baseline summary, and a trailing blank
    line.

    Args:
        index: Position of the row inside ``dataset['test']``.
        dash_length: Number of ``-`` characters in each separator line.
    """
    separator = dash_length * '-'

    # The header is emitted before the dataset is touched.
    print(separator)
    print(f'Example {index + 1}', separator, sep='\n')

    dialogue_text = dataset['test'][index]['dialogue']
    reference_summary = dataset['test'][index]['summary']

    # One print call; the empty last item yields the trailing blank line.
    print(
        'INPUT DIALOGUE:',
        dialogue_text,
        separator,
        'BASELINE HUMAN SUMMARY:',
        reference_summary,
        separator,
        '',
        sep='\n',
    )



# Test-split rows to display, and the width of the separator lines.
example_indices = [30, 150]
custom_dash_length = 120

# Iterate over the indices directly: the position within the list is not
# needed, because print_example labels each example from the dataset index.
for index in example_indices:
    print_example(index, custom_dash_length)


------------------------------------------------------------------------------------------------------------------------
Example 31
------------------------------------------------------------------------------------------------------------------------
INPUT DIALOGUE:
#Person1#: Where are you going for your trip?
#Person2#: I think Hebei is a good place.
#Person1#: But I heard the north of China are experiencing severe sandstorms!
#Person2#: Really?
#Person1#: Yes, it's said that Hebes was experiencing six degree strong winds.
#Person2#: How do these storms affect the people who live in these areas?
#Person1#: The report said the number of people with respiratory tract infections tended to rise after sandstorms. The sand gets into people's noses and throats and creates irritation.
#Person2#: It sounds that sandstorms are trouble for everybody!
#Person1#: You are quite right.
----------------------------------------------------------------------------------------------------------------

In [8]:
# Checkpoint name shared by the tokenizer and the model loaded below.
model_name = 'google/flan-t5-base'

# Load the pre-trained tokenizer and sequence-to-sequence model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [9]:
def encode_decode_sentence(sentence, tokenizer):
    """Tokenize ``sentence`` and decode the resulting ids back into text.

    Args:
        sentence: Text handed to ``tokenizer``.
        tokenizer: Object that is callable as ``tokenizer(text, return_tensors='pt')``
            and exposes ``decode(ids, skip_special_tokens=...)``.

    Returns:
        A ``(encoding, text)`` tuple: the tokenizer's output for ``sentence`` and
        the text decoded from the first entry of its ``"input_ids"``, with
        special tokens skipped.
    """
    encoding = tokenizer(sentence, return_tensors='pt')

    # Only the first sequence is decoded.
    first_sequence_ids = encoding["input_ids"][0]
    round_trip_text = tokenizer.decode(first_sequence_ids, skip_special_tokens=True)

    return encoding, round_trip_text

# Sentence to round-trip through the tokenizer; change it to inspect other inputs.
new_sentence = "How are you doing today?"

# Reuse the tokenizer that was loaded together with the model from `model_name`.
# Loading it again from a hard-coded checkpoint string was redundant and could
# drift out of sync with `model_name`.
sentence_encoded, sentence_decoded = encode_decode_sentence(new_sentence, tokenizer)

# Show the token ids of the first (only) sequence, then the decoded text.
print('ENCODED SENTENCE:')
print(sentence_encoded["input_ids"][0])
print('\nDECODED SENTENCE:')
print(sentence_decoded)

ENCODED SENTENCE:
tensor([571,  33,  25, 692, 469,  58,   1])

DECODED SENTENCE:
How are you doing today?


In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def generate_summary(dialogue, model, tokenizer, max_tokens=50):
    """Generate a zero-shot summary of ``dialogue``.

    The dialogue is wrapped in a fixed instruction prompt, tokenized, passed to
    ``model.generate``, and the first returned sequence is decoded to text.

    Args:
        dialogue: Conversation text to summarize.
        model: Object exposing ``generate(input_ids, max_new_tokens=...)``.
        tokenizer: Object callable as ``tokenizer(text, return_tensors='pt')``
            and exposing ``decode(ids, skip_special_tokens=...)``.
        max_tokens: Value forwarded as ``max_new_tokens`` to ``model.generate``.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    instruction_prompt = (
        "\nSummarize the following conversation.\n\n"
        f"{dialogue}"
        "\n\nSummary:\n"
    )

    # The model receives the full instruction prompt, not the bare dialogue.
    encoded_prompt = tokenizer(instruction_prompt, return_tensors='pt')
    generated_sequences = model.generate(
        encoded_prompt["input_ids"],
        max_new_tokens=max_tokens,
    )

    return tokenizer.decode(generated_sequences[0], skip_special_tokens=True)

# Test-split rows to summarize with the zero-shot prompt.
example_indices = [30, 150]

# Separator line. Defined once at notebook level (not inside the loop) because
# later cells print it as well.
dash_line = '-' * 80

# The tokenizer and model loaded earlier from `model_name` are reused here;
# reloading both checkpoints from hard-coded strings only repeated that work.
for i, index in enumerate(example_indices):
    # Fetch the row once and read both fields from it.
    example = dataset['test'][index]
    dialogue = example['dialogue']
    summary = example['summary']

    generated_summary = generate_summary(dialogue, model, tokenizer)

    # Report the example in reading order: input, human reference, model output.
    # The generated summary is printed once, after the reference it is compared to.
    print(dash_line)
    print(f'Example {i + 1}')
    print(dash_line)
    print(f'INPUT DIALOGUE:\n{dialogue}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{summary}')
    print(dash_line)
    print(f'MODEL GENERATION - ZERO SHOT:\n{generated_summary}\n')




MODEL GENERATION - ZERO SHOT:
The sandstorms in China are causing a lot of health problems for people in the north of China.

--------------------------------------------------------------------------------
Example 1
--------------------------------------------------------------------------------
INPUT DIALOGUE:
#Person1#: Where are you going for your trip?
#Person2#: I think Hebei is a good place.
#Person1#: But I heard the north of China are experiencing severe sandstorms!
#Person2#: Really?
#Person1#: Yes, it's said that Hebes was experiencing six degree strong winds.
#Person2#: How do these storms affect the people who live in these areas?
#Person1#: The report said the number of people with respiratory tract infections tended to rise after sandstorms. The sand gets into people's noses and throats and creates irritation.
#Person2#: It sounds that sandstorms are trouble for everybody!
#Person1#: You are quite right.
-------------------------------------------------------------------

Summary

In [16]:
def make_prompt(example_indices_full, example_index_to_summarize):
    """Build a few-shot prompt from rows of the notebook-level ``dataset`` test split.

    Each index in ``example_indices_full`` contributes a solved example
    (dialogue, the question "What was going on?", and its summary). The dialogue
    at ``example_index_to_summarize`` is appended last with the same question
    and no summary.

    Args:
        example_indices_full: Test-split indices used as solved examples.
        example_index_to_summarize: Test-split index of the dialogue left open.

    Returns:
        The assembled prompt string.
    """
    test_split = dataset['test']
    dialogue_header = "\nDialogue:\n\n"
    question = "\n\nWhat was going on?\n"

    prompt_parts = []
    for shot_index in example_indices_full:
        shot = test_split[shot_index]
        # The stop sequence '{summary}\n\n\n' is important for FLAN-T5. Other
        # models may have their own preferred stop sequence.
        prompt_parts.append(
            f"{dialogue_header}{shot['dialogue']}{question}{shot['summary']}\n\n\n"
        )

    # The final dialogue ends right after the question.
    open_dialogue = test_split[example_index_to_summarize]['dialogue']
    prompt_parts.append(f"{dialogue_header}{open_dialogue}{question}")

    return ''.join(prompt_parts)

In [19]:
# Test-split rows used as solved examples (dialogue plus summary) in the prompt.
example_indices_full = [10, 50, 90]
# Test-split row whose dialogue is appended last, without a summary.
example_index_to_summarize = 160

few_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)

# Print the assembled prompt so its layout can be inspected before generation.
print(few_shot_prompt)



Dialogue:

#Person1#: Happy Birthday, this is for you, Brian.
#Person2#: I'm so happy you remember, please come in and enjoy the party. Everyone's here, I'm sure you have a good time.
#Person1#: Brian, may I have a pleasure to have a dance with you?
#Person2#: Ok.
#Person1#: This is really wonderful party.
#Person2#: Yes, you are always popular with everyone. and you look very pretty today.
#Person1#: Thanks, that's very kind of you to say. I hope my necklace goes with my dress, and they both make me look good I feel.
#Person2#: You look great, you are absolutely glowing.
#Person1#: Thanks, this is a fine party. We should have a drink together to celebrate your birthday

What was going on?
#Person1# attends Brian's birthday party. Brian thinks #Person1# looks great and charming.



Dialogue:

#Person1#: Yeah. Just pull on this strip. Then peel off the back.
#Person2#: You might make a few enemies this way.
#Person1#: If they don't think this is fun, they're not meant to be our friends

In [21]:
# Human-written reference summary for the dialogue left open in the prompt.
summary = dataset['test'][example_index_to_summarize]['summary']

# Separator line, defined here so this cell does not depend on a variable that
# only exists if the body of an earlier loop happened to run.
dash_line = '-' * 80

# Tokenize the few-shot prompt and decode the first generated sequence.
# NOTE(review): no truncation is applied to the prompt; confirm it fits the
# model's input length limit.
inputs = tokenizer(few_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens=50,
    )[0],
    skip_special_tokens=True
)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - FEW SHOT:\n{output}')

--------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# and #Person2# are talking about Lulu and Vic's breakup.

--------------------------------------------------------------------------------
MODEL GENERATION - FEW SHOT:
Lulu and Vic broke up and now they are dating.


In [22]:
from transformers import GenerationConfig
# Active generation settings: at most 50 new tokens.
generation_config = GenerationConfig(max_new_tokens=50)
# Alternative settings to swap in when exploring the effect of output length
# and sampling temperature (uncomment exactly one):
# generation_config = GenerationConfig(max_new_tokens=10)
# generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=0.1)
# generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=0.5)
# generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=1.0)

# Separator line, defined here so this cell does not depend on a variable that
# only exists if the body of an earlier loop happened to run.
dash_line = '-' * 80

# Tokenize the few-shot prompt and decode the first sequence generated under
# the selected configuration.
inputs = tokenizer(few_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        generation_config=generation_config,
    )[0],
    skip_special_tokens=True
)

print(dash_line)
print(f'MODEL GENERATION - FEW SHOT:\n{output}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')

--------------------------------------------------------------------------------
MODEL GENERATION - FEW SHOT:
Lulu and Vic broke up and now they are dating.
--------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# and #Person2# are talking about Lulu and Vic's breakup.

