In [None]:
!pip install bitsandbytes
!pip install transformers
!pip install peft
!pip install accelerate
!pip install datasets
!pip install scipy
!pip install einops
!pip install evaluate
!pip install trl
!pip install rouge_score
!pip install transformers[sentencepiece]
!pip install scikit-learn



In [20]:
import time
import torch
import pandas as pd
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel, LoraConfig
from trl import AutoModelForSeq2SeqLMWithValueHead

from assets.config import config

In [21]:
huggingface_dataset_name = config["dataset_name"]
print(huggingface_dataset_name)

dataset = load_dataset(huggingface_dataset_name)
dataset["train"][0]

knkarthick/dialogsum


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

{'id': 'train_0',
 'dialogue': "#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?\n#Person2#: I found it would be a good idea to get a check-up.\n#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.\n#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?\n#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.\n#Person2#: Ok.\n#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?\n#Person2#: Yes.\n#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.\n#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.\n#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.\n#Person2#: Ok, thanks doctor.",
 'summary': "Mr. Smith'

In [22]:
# Load the model and tokenizer
model_name = config['model_name']
print(model_name)


original_model = T5ForConditionalGeneration.from_pretrained(model_name)

tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False ,trust_remote_code=True, padding_side="left",add_eos_token=True,add_bos_token=True,use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

google-t5/t5-small




config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [23]:
from utils import ZeroShotSummarizer

summarizer = ZeroShotSummarizer(model=original_model)

idx = 1
prompt = dataset['test'][idx]['dialogue']
summary = dataset['test'][idx]['summary']

# Format the prompt for the model
formatted_prompt = f"Instruct: Summarize the following conversation.\n{prompt}\nOutput:\n"

output_summary = summarizer.summarize(tokenizer=tokenizer,prompt=formatted_prompt)

# Printing results
hash_line = '#' * 100  # Simplified line generation
print(hash_line)
print(f'INPUT PROMPT:\n{formatted_prompt}')
print(hash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(hash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output_summary}')


####################################################################################################
INPUT PROMPT:
Instruct: Summarize the following conversation.
#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to

In [24]:
def create_prompt_formats(sample):
    """
    Format various fields of the sample ('instruction','output')
    Then concatenate them using two newline characters
    :param sample: Sample dictionnary
    """
    INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    INSTRUCTION_KEY = "### Instruct: Summarize the below conversation."
    RESPONSE_KEY = "### Output:"
    END_KEY = "### End"

    blurb = f"\n{INTRO_BLURB}"
    instruction = f"{INSTRUCTION_KEY}"
    input_context = f"{sample['dialogue']}" if sample["dialogue"] else None
    response = f"{RESPONSE_KEY}\n{sample['summary']}"
    end = f"{END_KEY}"

    parts = [part for part in [blurb, instruction, input_context, response, end] if part]

    formatted_prompt = "\n\n".join(parts)
    sample["text"] = formatted_prompt

    return sample

In [25]:
from functools import partial

def get_max_length(model):
    conf = model.config
    max_length = None
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )

def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int,seed, dataset):
    """Format & tokenize it so it is ready for training
    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    """

    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)

    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=['id', 'topic', 'dialogue', 'summary'],
    )

    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    dataset = dataset.shuffle(seed=seed)

    return dataset

In [26]:

seed = 42
max_length = get_max_length(original_model)
print(max_length)

train_dataset = preprocess_dataset(tokenizer, max_length,seed, dataset['train'])
eval_dataset = preprocess_dataset(tokenizer, max_length,seed, dataset['validation'])

Found max lenth: 512
512
Preprocessing dataset...


Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Preprocessing dataset...


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

In [27]:
from model import PEFTFineTuner

output_dir = f'assets/model/peft-dialogue-summary-training-{str(int(time.time()))}'

config["peft_training_args"]['max_steps'] = 10000
config["peft_training_args"]['learning_rate'] = 2e-4
config["peft_training_args"]['logging_steps'] = 100
config["peft_training_args"]['save_steps'] = 500
config["peft_training_args"]['eval_steps'] = 100

peft_trainer = PEFTFineTuner(opt = config , model = original_model, tokenizer = tokenizer)
peft_model_bf, trainer = peft_trainer.fine_tune(output_dir, train_dataset, eval_dataset)

max_steps is given, it will override any value given in num_train_epochs


In [28]:
from utils import ModelSummary

summary = ModelSummary(peft_model_bf)
summary.print_summary()


Trainable model parameters: 3145728
All model parameters: 63652352
Percentage of trainable model parameters: 4.94%


In [None]:
trainer.train()

Step,Training Loss,Validation Loss
100,2.2378,0.374582
200,0.3807,0.217209
300,0.2548,0.135599
400,0.1704,0.094913
500,0.1386,0.068837
600,0.1085,0.058068
700,0.0924,0.049612
800,0.0796,0.047202
900,0.0706,0.043998
1000,0.0662,0.040856


TrainOutput(global_step=10000, training_loss=0.05189571918845177, metrics={'train_runtime': 5331.8108, 'train_samples_per_second': 7.502, 'train_steps_per_second': 1.876, 'total_flos': 3239601650368512.0, 'train_loss': 0.05189571918845177, 'epoch': 3.400493071495367})

In [29]:
from utils import ZeroShotSummarizer


peft_model_at = PeftModel.from_pretrained(peft_model_bf, "assets/model/peft_model",torch_dtype=torch.float16,is_trainable=False)

summarizer = ZeroShotSummarizer(model=peft_model_at)

idx = 1
prompt = dataset['test'][idx]['dialogue']
summary = dataset['test'][idx]['summary']

formatted_prompt = f"Instruct: Summarize the following conversation.\n{prompt}\nOutput:\n"
output_summary = summarizer.summarize(tokenizer=tokenizer,prompt=formatted_prompt)

hash_line = '#' * 100  
print(hash_line)
print(f'INPUT PROMPT:\n{formatted_prompt}')
print(hash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(hash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output_summary}')

####################################################################################################
INPUT PROMPT:
Instruct: Summarize the following conversation.
#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to

In [30]:
base_model_id = config['model_name']
print(base_model_id)

base_model = T5ForConditionalGeneration.from_pretrained(base_model_id,
                                                      device_map='auto')

google-t5/t5-small




In [31]:
eval_tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True, trust_remote_code=True, use_fast=False)
eval_tokenizer.pad_token = eval_tokenizer.eos_token

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [32]:
lora_config = LoraConfig(
            r=config["LoraConfig"]["rank"],
            lora_alpha=config["LoraConfig"]["lora_alpha"],
            target_modules=["q", "v"],
            bias=config["LoraConfig"]["bias"],
            lora_dropout=config["LoraConfig"]["lora_dropout"],
            task_type=config["LoraConfig"]["task_type"],
        )

base_model_id = config["model_name"]
base_model = T5ForConditionalGeneration.from_pretrained(base_model_id, torch_dtype=torch.float16)


peft_model = PeftModel.from_pretrained(base_model, "assets/model/peft_model", lora_config=lora_config, torch_dtype=torch.float16,is_trainable=False)

In [33]:
from utils import ModelSummary


ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(peft_model,
                                                               torch_dtype=torch.bfloat16,
                                                               is_trainable=True)

summary = ModelSummary(ppo_model)
summary.print_summary()
print(ppo_model.v_head)


Trainable model parameters: 513
All model parameters: 63652865
Percentage of trainable model parameters: 0.00%
ValueHead(
  (dropout): Dropout(p=0.1, inplace=False)
  (summary): Linear(in_features=512, out_features=1, bias=True)
  (flatten): Flatten(start_dim=1, end_dim=-1)
)


In [34]:
from trl import create_reference_model

ref_model = create_reference_model(ppo_model)


summary = ModelSummary(ref_model)
summary.print_summary()

print(ppo_model.v_head)


Trainable model parameters: 0
All model parameters: 63652865
Percentage of trainable model parameters: 0.00%
ValueHead(
  (dropout): Dropout(p=0.1, inplace=False)
  (summary): Linear(in_features=512, out_features=1, bias=True)
  (flatten): Flatten(start_dim=1, end_dim=-1)
)


In [35]:
toxicity_model_name = config["toxic_model_name"]
print(toxicity_model_name)

toxicity_tokenizer = AutoTokenizer.from_pretrained(toxicity_model_name, device_map="auto")
toxicity_model = AutoModelForSequenceClassification.from_pretrained(toxicity_model_name)
print(toxicity_model.config.id2label)


DaNLP/da-electra-hatespeech-detection


tokenizer_config.json:   0%|          | 0.00/388 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/885 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/239k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/55.0M [00:00<?, ?B/s]

{0: 'not offensive', 1: 'offensive'}


In [36]:
from model import ToxicityEvaluator

device = "cuda" if torch.cuda.is_available() else "cpu"

toxicity_eval = ToxicityEvaluator(tokenizer = toxicity_tokenizer, toxicity_model_name = toxicity_model_name, toxicity_model=toxicity_model,  device = device)

toxicity_evaluator = toxicity_eval.get_evaluator()

mean_toxicity, std_toxicity = toxicity_eval.evaluate_toxicity(model = ref_model, dataset= dataset['test'], num_samples=100)

print(f'Toxicity [mean, std]: [{mean_toxicity}, {std_toxicity}]')


Downloading builder script:   0%|          | 0.00/6.08k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

100%|██████████| 100/100 [13:54<00:00,  8.35s/it]

Toxicity [mean, std]: [0.0011666345334379003, 0.002248152037386937]





In [38]:
from data import DatasetPreprocessor
# from assets.config import config
from config import config

preprocessor = DatasetPreprocessor(
        model_name=config["model_name"],
        dataset_name=config["dataset_name"],
        input_min_length=200,
        input_max_length=1000
    )

# Explicitly call preprocessing before splitting
preprocessor.preprocess_dataset()

dataset = preprocessor.get_dataset_splits()
print(dataset)

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/10022 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 8017
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 2005
    })
})


In [39]:
def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

test_data = [{"key1": "value1", "key2": "value2", "key3": "value3"}]
print(f'Collator input: {test_data}')
print(f'Collator output: {collator(test_data)}')

Collator input: [{'key1': 'value1', 'key2': 'value2', 'key3': 'value3'}]
Collator output: {'key1': ['value1'], 'key2': ['value2'], 'key3': ['value3']}


In [None]:
from assets.config import config
from ppotrainer import PPO_DialogueTrainer

model_name=config["model_name"]
toxicity_model_name = config["toxic_model_name"]


learning_rate =  1.41e-5
max_ppo_epochs = config["ppo_config"]["max_ppo_epochs"]
mini_batch_size = config["ppo_config"]["mini_batch_size"]
batch_size = config["ppo_config"]["batch_size"]

# Create a trainer object
trainer = PPO_DialogueTrainer(model_name = model_name, model=ppo_model, ref_model = ref_model, tokenizer = tokenizer, dataset = dataset, data_collator = collator, learning_rate = learning_rate, max_ppo_epochs =max_ppo_epochs, mini_batch_size = mini_batch_size, batch_size = batch_size, toxicity_model_name=toxicity_model_name)

sentiment_pipe = trainer.get_sentiment_pipeline()

# Train the model
output_min_length = 100
output_max_length = 200
max_ppo_steps = 10
trainer.train(output_min_length, output_max_length, max_ppo_steps)

# Save the trained model (optional)
trainer.save_model("assets/model/ppo_model/ppo_model")


  0%|          | 0/10 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (186 > 128). Running this sequence through the model will result in indexing errors
 10%|█         | 1/10 [00:27<04:03, 27.07s/it]

objective/kl: 123.11963653564453
ppo/returns/mean: -4.029711723327637
ppo/policy/advantages_mean: 0.002601243555545807
----------------------------------------------------------------------------------------------------


 20%|██        | 2/10 [00:56<03:49, 28.74s/it]

objective/kl: 132.1934051513672
ppo/returns/mean: -4.975330352783203
ppo/policy/advantages_mean: 0.03139080852270126
----------------------------------------------------------------------------------------------------


 30%|███       | 3/10 [01:24<03:16, 28.13s/it]

objective/kl: 117.03385162353516
ppo/returns/mean: -4.891247272491455
ppo/policy/advantages_mean: -0.005236591212451458
----------------------------------------------------------------------------------------------------


 40%|████      | 4/10 [01:54<02:52, 28.78s/it]

objective/kl: 123.17713928222656
ppo/returns/mean: -4.182886123657227
ppo/policy/advantages_mean: 0.02797350287437439
----------------------------------------------------------------------------------------------------


 50%|█████     | 5/10 [02:22<02:22, 28.55s/it]

objective/kl: 118.92764282226562
ppo/returns/mean: -5.024173736572266
ppo/policy/advantages_mean: -0.05027779936790466
----------------------------------------------------------------------------------------------------


 60%|██████    | 6/10 [02:50<01:53, 28.32s/it]

objective/kl: 117.42315673828125
ppo/returns/mean: -4.6825852394104
ppo/policy/advantages_mean: 0.04437951743602753
----------------------------------------------------------------------------------------------------


 70%|███████   | 7/10 [03:18<01:25, 28.46s/it]

objective/kl: 145.5492706298828
ppo/returns/mean: -5.9684929847717285
ppo/policy/advantages_mean: 0.026434138417243958
----------------------------------------------------------------------------------------------------


 80%|████████  | 8/10 [03:48<00:57, 28.95s/it]

objective/kl: 163.52511596679688
ppo/returns/mean: -6.198254585266113
ppo/policy/advantages_mean: -0.04985509067773819
----------------------------------------------------------------------------------------------------


 90%|█████████ | 9/10 [04:17<00:28, 28.86s/it]

objective/kl: 119.75888061523438
ppo/returns/mean: -4.41843318939209
ppo/policy/advantages_mean: 0.04436187446117401
----------------------------------------------------------------------------------------------------


100%|██████████| 10/10 [04:44<00:00, 28.49s/it]

objective/kl: 90.54510498046875
ppo/returns/mean: -3.8956713676452637
ppo/policy/advantages_mean: -0.00296001136302948
----------------------------------------------------------------------------------------------------





In [None]:
from model import ToxicityEvaluator
from transformers import GenerationConfig

device = "cuda" if torch.cuda.is_available() else "cpu"

toxicity_eval = ToxicityEvaluator(tokenizer = tokenizer, toxicity_model_name = toxicity_model_name, toxicity_model=toxicity_model,  device = device)

toxicity_evaluator = toxicity_eval.get_evaluator()

mean_after_detoxification, std_after_detoxification = toxicity_eval.evaluate_toxicity(model = ppo_model, dataset= dataset['test'], num_samples=10,GenerationConfig=GenerationConfig)

print(f'Toxicity [mean, std]: [{mean_after_detoxification}, {std_after_detoxification}]')

  0%|          | 0/10 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 10/10 [00:14<00:00,  1.49s/it]

Toxicity [mean, std]: [0.10986241679638624, 0.0435850673147603]





In [50]:

mean_improvement = (mean_toxicity - mean_after_detoxification) / mean_toxicity
std_improvement = (std_toxicity - std_after_detoxification) / std_toxicity

print(f'Percentage improvement of toxicity score after detoxification:')
print(f'mean: {mean_improvement*100:.2f}%')
print(f'std: {std_improvement*100:.2f}%')

Percentage improvement of toxicity score after detoxification:
mean: -90.03%
std: -18.91%


In [78]:
from utils import ModelComparison
model_comparison = ModelComparison(tokenizer, ref_model, ppo_model, sentiment_pipe)

# Compare the models on a test dataset
comparison_results = model_comparison.compare_models(dataset, batch_size=90)

100%|██████████| 10/10 [00:55<00:00,  5.59s/it]


In [79]:
pd.set_option('display.max_colwidth', 500)
df_compare_results = pd.DataFrame(comparison_results)
df_compare_results["reward_diff"] = df_compare_results['reward_after'] - df_compare_results['reward_before']
df_compare_results_sorted = df_compare_results.sort_values(by=['reward_diff'], ascending=False).reset_index(drop=True)
df_compare_results_sorted

Unnamed: 0,query,response_before,response_after,reward_before,reward_after,reward_diff
0,"Summarize the following conversation: #Person1#: Oh, it's a fine day, isn't it? And the food smells nice. It's a perfect day for a picnic. #Person2#: Yes, it is. I'm glad it doesn't rain. My name's Mike Gates, by the way. #Person1#: Oh, hi! I'm Alice. Nice to meet you. #Person2#: Nice to meet you too. So Alice...what do you do? #Person1#: I'm studying medicine. #Person2#: Really? Where? #Person1#: At Harvard. What about you? #Person2#: I'm working for IBM. #Person1#: Oh, are you? That sounds...","Below conversation: #Person1#: Oh, it's a fine day, isn't it? And the food smells nice. It's a perfect day for a picnic. #Person2#: Yes, it is. I'm glad it doesn't rain. My name's Mike Gates, by the way. #Person1#: Oh, hi! I'm Alice. Nice to meet you. #Person2#: Nice to meet you too. So Alice...","Below the following conversation: #Person1#: Oh, it's a fine day, isn't it? And the food smells nice. It's a perfect day for a picnic. #Person2#: Yes, it is. I'm glad it doesn't rain. My name's Mike Gates, by the way. #Person1#: Oh, hi! I'm Alice. Nice to meet you. #Person2#: Nice to meet you too. So",2.155203,2.35486,0.199657
1,"Summarize the following conversation: #Person1#: Hello, Jane. #Person2#: Hi, Harry. Did you have a good summer holiday? #Person1#: Sure. I went for my holiday on my uncle's farm. #Person2#: Really? What interesting things did you do there? #Person1#: I helped get in some rice, take care of the fruit garden and drive the tractor. #Person2#: Drive a tractor? #Person1#: Yes. It was easy to learn. Did you go away for your holiday, Jane? #Person2#: Oh, no. I just stayed at home. My mother has bee...","Below conversation: #Person1#: Hello, Jane. #Person2#: Hi, Harry. Did you have a good summer holiday? #Person1#: Sure. I went for my holiday on my uncle's farm. #Person2#: Really? What interesting things did you do there? #Person1#: I helped get in some rice, take care of the fruit garden and drive the tractor. #Person2#: Drive a tractor? #Person1#: Yes. It was easy to learn. Did you go away for your holiday, Jane? #Person2#: Oh, no. I just stayed at home. My mother has been in hospital. I h...","Below is the following conversation: #Person1#: Hello, Jane. #Person2#: Hi, Harry. Did you have a good summer holiday? #Person1#: Sure. I went for my holiday on my uncle's farm. #Person2#: Really? What interesting things did you do there? #Person1#: I helped get in some rice, take care of the fruit garden and drive the tractor. #Person2#: Drive a tractor? #Person1#: Yes. It was easy to learn. Did you go away for your holiday, Jane? #Person2#: Oh, no. I just stayed at home. My mother has been...",0.944414,1.072222,0.127808
2,"Summarize the following conversation: #Person1#: Good morning, Paul! #Person2#: Hi, good morning, you are so early today. #Person1#: Yes, traffic wasn't much today, so I made it to the office earlier. #Person2#: I see. How long does it take you to reach the office every day? #Person1#: Well, about an hour or so. #Person2#: Wow, that's quite a long journey. Do you drive a car? #Person1#: No. I take public transportation. It's safer. #Person2#: You are right! But I get up a little late. That g...","Below conversation: #Person1#: Good morning, Paul! #Person2#: Hi, good morning, you are so early today. #Person1#: Yes, traffic wasn't much today, so I made it to the office earlier. #Person2#: I see. How long does it take you to reach the office every day? #Person1#: Well, about an hour or so. #Person2#: Wow, that'","Below: Good morning, Paul! #Person2#: Hi, good morning, you are so early today. #Person1#: Yes, traffic wasn't much today, so I made it to the office earlier. #Person2#: I see. How long does it take you to reach the office every day? #Person1#: Well, about an hour or so. #Person2#: Wow, that's quite a long journey.",0.272206,0.317771,0.045565
3,"Summarize the following conversation: #Person1#: Happy birthday, Ms. Lin. Here is a present for you. #Person2#: Oh, thank you! It's very kind of you. Ah, it is a porcelain vase. #Person1#: I hope you like it. #Person2#: Yes, it is lovely. I have been expecting something like this for a long time. Thanks ever so much. #Person1#: I am glad you like it. Summary:</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><...","Below is a present for you. #Person2#: Oh, thank you! It's very kind of you. Ah, it is a porcelain vase. #Person1#: I hope you like it. #Person2#: Yes, it is lovely. I have been expecting something like this for a long time. Thanks ever so much. #Person1#: I am glad you like it. Summary: Summary: Summary: Summary: Summary: I am glad you like it. Summary: I am glad you like it. Summary:: a present for you. #Person2#: Oh, thank you! It's very kind of you. Ah, it is a porcelain vase. #Person1#:...","Below is a present for you. #Person1#: Happy birthday, Ms. Lin. Here is a present for you. #Person2#: Oh, thank you! It's very kind of you. Ah, it is a porcelain vase. #Person1#: I hope you like it. #Person2#: Yes, it is lovely. I have been expecting something like this for a long time. Thanks ever so much. #Person1#: I am glad you like it. Summary: I am glad you like it. Summary:: Good you. Summary:: Happy birthday, Ms. Lin. Here is a present for you. #Person1#: Happy birthday, Ms. Ah, than...",0.229263,0.274037,0.044774
4,"Summarize the following conversation: #Person1#: Wanna chill out and watch cable? #Person2#: Wow! Besides The Cosby Show, I've never seen a black sitcom. What's this program? #Person1#: It's a drama called Soul Food. This channel is BET-Black Entertainment Television. It has black shows, news and commercials. #Person2#: This show is like a black Friends. #Person1#: BET has black shows that are overlooked by mainstream white TV. #Person2#: That's cool. I've heard LA has an all-Asian channel o...","Below The Cosby Show, I've never seen a black sitcom. What's this program? #Person1#: It's a drama called Soul Food. This channel is BET-Black Entertainment Television. It has black shows, news and commercials. #Person2#: This show is like a black Friends. #Person1#: BET has black shows that are overlooked by mainstream white TV. #Person2#: That's cool. I've heard LA has an all-Asian channel on","Below The Cosby Show, I've never seen a black sitcom. What's this program? #Person1#: Wanna chill out and watch cable? #Person2#: Wow! Besides The Cosby Show, I've never seen a black sitcom. What's this program? #Person1#: It's a drama called Soul Food. This channel is BET-Black Entertainment Television. It has black shows, news and commercials. #Person2#: This show is like",1.527877,1.530099,0.002222
5,Summarize the following conversation: #Person1#: I was going to make a cake for the party tonight. But I just realized we have run out of eggs. Could you go to the supermarket and get some? #Person2#: I don't think we have enough time to make one. Let me just buy a cake from a shop. Summary:</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>...,Below conversation: #Person1#: I was going to make a cake for the party tonight. But I just realized we have run out of eggs. Could you go to the supermarket and get some? #Person2#: I don't think we have enough time to make one. Let me just buy a cake from a shop. Summary:: I was going to make a cake for the party tonight. But I just realized we have run out of eggs. Could you,Below conversation: #Person1#: I was going to make a cake for the party tonight. But I just realized we have run out of eggs. Could you go to the supermarket and get some? #Person2#: I don't think we have enough time to make one. Let me just buy a cake from a shop. Summary: I was going to make a cake for the party tonight. But I just realized we have run out of eggs. Could you go,1.478188,1.394555,-0.083633
6,"Summarize the following conversation: #Person1#: Henry, I want to ask you something. #Person2#: Say what you want to say. #Person1#: You see. The wage is not paid yet, I am wondering if you could possibly lend me some money. #Person2#: I am sorry. But I do not have any money to lend to you. Summary:</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><p...","Below conversation: #Person1#: Henry, I want to ask you something. #Person2#: Say what you want to say. #Person1#: You see. The wage is not paid yet, I am wondering if you could possibly lend me some money. #Person2#: I am sorry. But I do not have any money to lend to you. Summary: Summary: Summary: I: I am sorry. But I do not have any","Below: #Person1#: Henry, I want to ask you something. #Person2#: Say what you want to say. #Person1#: You see. The wage is not paid yet, I am wondering if you could possibly lend me some money. #Person2#: I am sorry. But I do not have any money to you. Summary:: Summary:: I have never any money to lend to you. Summary:: I would",0.992075,0.860892,-0.131182
7,"Summarize the following conversation: #Person1#: Hey Jane. How are you doing these days? #Person2#: Hi Martha. Everything is perfect. #Person1#: Are you enjoying married life? #Person2#: It couldn't be better. My husband is so supportive with my work and he helps around the house so much. #Person1#: You're so lucky. I hope to get married to a guy like that. #Person2#: I'm so happy. Even though we are both working, we get to spend a lot of time with each other in the evening and on the weeken...","Below conversation: #Person1#: Hey Jane. How are you doing these days? #Person2#: Hi Martha. Everything is perfect. #Person1#: Are you enjoying married life? #Person2#: It couldn't be better. My husband is so supportive with my work and he helps around the house so much. #Person1#: You're so lucky. I hope to get married to a guy like that. #Person2#: I'm so happy. Even though we are both working, we get to spend a lot of time with each other in the evening and on the weekends. #Person1#: Do ...","Below the following conversation: #Person1#: Hey Jane. How are you doing these days? #Person2#: Hi Martha. Everything is perfect. #Person1#: Are you enjoying married life? #Person2#: It couldn't be better. My husband is so supportive with my work and he helps around the house so much. #Person1#: You're so lucky. I hope to get married to a guy like that. #Person2#: I'm so happy. Even though we are both working, we get to spend a lot of time with each other in the evening and on the weekends. ...",1.013311,0.851555,-0.161756
8,"Summarize the following conversation: #Person1#: We do a lot of camping in the mountains. What would you recommend for two people? #Person2#: You'd probably be better off with the four reel drive vehicle. We have several off-road trucks in stock, both new and used. Summary:</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pa...","Below conversation: #Person1#: We do a lot of camping in the mountains. What would you recommend for two people? #Person2#: You'd probably be better off with the four reel drive vehicle. We have several off-road trucks in stock, both new and used. Summary:: We do a lot of camping in the mountains. What would you recommend for two people? #Person2#: You'd probably be better off with the four reel drive vehicle. Summary: We have several off-road trucks in stock, both new and used. Summary:: Th...","<pad> Below conversation: #Person1#: We do a lot of camping in the mountains. What would you recommend for two people? #Person2#: You'd probably be better off with the four reel drive vehicle. We have several off-road trucks in stock, both new and used. Summary:: The complete: #Person1#: We do a lot of camping in the mountains. What would you recommend for two people? #Person2#: You'd probably be better off with the four reel drive vehicle. We have several off-road trucks in stock, both new ...",1.76117,1.549376,-0.211794
9,"Summarize the following conversation: #Person1#: We want to order some of these products. #Person2#: Mr. Thomas, this is a standard size. #Person1#: Can you supply us right away? #Person2#: Yes, we have plenty on hand right now. #Person1#: Good, we'll take 500 suits. #Person2#: OK, thanks for the order. Summary:</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad...","Below conversation: #Person1#: We want to order some of these products. #Person2#: Mr. Thomas, this is a standard size. #Person1#: Can you supply us right away? #Person2#: Yes, we have plenty on hand right now. #Person1#: Good, we'll take 500 suits. #Person2#: OK, thanks for the order. Summary: Summary: We want to order some of these products. #Person2#: Mr. Thomas, this is a standard size. #Person1#: We want to order some of these products. #Person2#: Mr. Thomas, this is a standard size. #P...","Below: We want to order some of these products. #Person2#: Mr. Thomas, this is a standard size. #Person1#: Can you supply us right away? #Person2#: Yes, we have plenty on hand right now. #Person1#: Good, we'll take 500 suits. #Person2#: OK, thanks for the order. Summary: We want to order some of these products. #Person2#: Mr. Thomas, this is a standard size. #Person1#: Can you supply us right away? #Person2#: Yes, We have lots on hand right now. #Person1#: Good, we'll take 500 suits. #Person...",1.290689,1.052881,-0.237808
