In [1]:
import pandas as pd 
import numpy as np
import torch
import torchdata
import transformers
import datasets 

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Report the installed version of each core library so the run can be reproduced.
for label, module in (
    ("torch versio", torch),
    ("torchdata version", torchdata),
    ("transformers version", transformers),
    ("datasets version", datasets),
):
    print(label, module.__version__)

torch versio 2.3.1+cpu
torchdata version 0.7.1
transformers version 4.42.4
datasets version 2.20.0


In [4]:
# Load the dialogue-summarization dataset that the baseline evaluation below runs on.
huggingface_dataset_name = "knkarthick/dialogsum"
dialogue_dataset = datasets.load_dataset(path=huggingface_dataset_name)

In [5]:
# checking data
from utilities import print_line, save_pickle, load_pickle
# Print a couple of test-split examples (dialogue + reference summary) as a sanity check.
indexes = [7, 17]
for sample_index in indexes:
    sample = dialogue_dataset['test'][sample_index]
    dialogue_text = sample['dialogue']
    summary_text = sample['summary']

    # Two long separator lines mark the start of each example.
    print_line(100)
    print_line(100)
    print("Dialogue: \n", dialogue_text)
    # A shorter separator splits the dialogue from its summary.
    print_line(20)
    print("Summary : \n", summary_text)


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Dialogue: 
 #Person1#: Kate, you never believe what's happened.
#Person2#: What do you mean?
#Person1#: Masha and Hero are getting divorced.
#Person2#: You are kidding. What happened?
#Person1#: Well, I don't really know, but I heard that they are having a separation for 2 months, and filed for divorce.
#Person2#: That's really surprising. I always thought they are well matched. What about the kids? Who get custody?
#Person1#: Masha, it seems quiet and makable, no quarrelling about who get the house and stock and then contesting the divorce with other details worked out.
#Person2#: That's the change from all the back stepping we usually hear about. Well, I still can't believe it, Masha and Hero, the perfect couple. When would they divorce be final?
#Person1#: Early in the New Year I gues

In [6]:
## Creating benchmarks for in-context learning and LoRA, IA3

# Checkpoint under evaluation. A single assignment is kept so it is obvious which
# model actually runs (previously 'google/flan-t5-base' was assigned and then
# immediately overwritten). Set this to 'google/flan-t5-base' to benchmark the
# smaller checkpoint instead.
model_name = "google/flan-t5-large"

# Load the seq2seq model and its matching (fast) tokenizer for the chosen checkpoint.
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, use_fast=True)

### Zero-shot inference

In [7]:


def make_prompt_template(dialogue):
    """Build the zero-shot summarization prompt for a single dialogue.

    Args:
        dialogue: Text of the dialogue to summarize; it is interpolated as-is.

    Returns:
        A prompt string with a "Dialogue :" section containing ``dialogue``
        (indented by four spaces) followed by an empty "Summary: " section.
    """
    prompt_lines = [
        "",                  # the prompt starts with a blank line
        "Dialogue :",
        f"    {dialogue}",
        "Summary: ",         # trailing space is part of the prompt
        "",
        "",                  # the prompt ends with one blank line
    ]
    return "\n".join(prompt_lines)

# Results keyed by test-split index; each value collects the human reference
# summary plus the summary generated under each prompting strategy.
dict_results_holder = {}

# Draw 10 test-split indices in [100, 250) with a fixed seed. Indices may repeat,
# in which case they collapse onto a single key in the results dict.
np.random.seed(7)
samples_ids = np.random.randint(100, 250, 10).tolist()

for sample_id in samples_ids:
    sample_results = {}
    dict_results_holder[sample_id] = sample_results

    example = dialogue_dataset['test'][sample_id]
    sample_results['Human_summary'] = example['summary']

    # Tokenize the zero-shot prompt and generate up to 100 new tokens.
    prompt = make_prompt_template(example['dialogue'])
    tokenized_sentence = tokenizer(prompt, return_tensors='pt')  # type: ignore
    generated_ids = model.generate(tokenized_sentence['input_ids'], max_new_tokens=100)
    sentence_decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    sample_results['zeroshot'] = sentence_decoded

### One-shot inference

In [9]:


def make_icl_prompt_template(dialogue):
    """Build a one-shot summarization prompt for a single dialogue.

    The first test-split example (index 0) of ``dialogue_dataset`` is used as the
    worked demonstration, followed by ``dialogue`` with an empty summary slot.

    Args:
        dialogue: Text of the dialogue to summarize; it is interpolated as-is.

    Returns:
        The prompt string: one solved Dialogue/Summary pair, then the query dialogue.
    """
    demonstration = dialogue_dataset['test'][0]
    prompt_lines = [
        "",                                    # the prompt starts with a blank line
        "Dialogue :",
        f"    {demonstration['dialogue']}",
        "Summary: ",                           # trailing space is part of the prompt
        f"    {demonstration['summary']}",
        "    ",                                # whitespace-only separator line
        "Dialogue :",
        f"    {dialogue}",
        "Summary: ",
        "",
        "",                                    # the prompt ends with one blank line
    ]
    return "\n".join(prompt_lines)

# Generate a one-shot summary for every sampled test example.
for sample_id in samples_ids:
    # Reuse the entry created by an earlier strategy, or start a fresh one.
    sample_results = dict_results_holder.setdefault(sample_id, {})

    example = dialogue_dataset['test'][sample_id]
    sample_results['Human_summary'] = example['summary']

    # Tokenize the one-shot prompt and generate up to 100 new tokens.
    prompt = make_icl_prompt_template(example['dialogue'])
    tokenized_sentence = tokenizer(prompt, return_tensors='pt')  # type: ignore
    generated_ids = model.generate(tokenized_sentence['input_ids'], max_new_tokens=100)
    sentence_decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    sample_results['one_shot'] = sentence_decoded




Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


### Few-shot inference

In [10]:


def make_icl_few_shot_prompt_template(dialogue, num_shots=4, low=300, high=500, rng=None):
    """Build a few-shot summarization prompt for a single dialogue.

    ``num_shots`` demonstrations are drawn at random (with replacement) from the
    test split of ``dialogue_dataset``, using indices in ``[low, high)``. Each one
    is rendered as a solved Dialogue/Summary pair, and ``dialogue`` is appended
    last with an empty summary slot.

    Args:
        dialogue: Text of the dialogue to summarize; it is interpolated as-is.
        num_shots: Number of solved demonstrations to prepend (default 4).
        low: Inclusive lower bound of the demonstration index range (default 300).
        high: Exclusive upper bound of the demonstration index range (default 500).
        rng: Object exposing ``randint(low, high, size)``, e.g. a
            ``np.random.RandomState``. Defaults to the global ``np.random`` state,
            so the demonstrations then depend on whatever has consumed that state
            before; pass a seeded ``RandomState`` for draws that are reproducible
            independently of cell execution order.

    Returns:
        The assembled prompt string.
    """
    if rng is None:
        rng = np.random

    fewshot_ids = [int(item) for item in rng.randint(low, high, num_shots)]

    prompt_parts = []
    for shot_id in fewshot_ids:
        sample = dialogue_dataset['test'][shot_id]
        # Solved demonstration, followed by a blank line.
        prompt_parts.append(
            f"\nDialogue :\n    {sample['dialogue']}\nSummary: \n    {sample['summary']}\n\n"
        )

    # The query dialogue goes last, with the summary left for the model to fill in.
    prompt_parts.append(f"\nDialogue :\n    {dialogue}\nSummary: \n")
    return "".join(prompt_parts)
        


# Generate a few-shot summary for every sampled test example.
# The loop variable is named `sample_id` so the builtin `id` is not shadowed.
for sample_id in samples_ids:
    # Reuse the entry created by an earlier strategy, or start a fresh one.
    sample_results = dict_results_holder.setdefault(sample_id, {})

    example = dialogue_dataset['test'][sample_id]
    sample_results['Human_summary'] = example['summary']

    # Tokenize the few-shot prompt and generate up to 100 new tokens.
    prompt = make_icl_few_shot_prompt_template(example['dialogue'])
    tokenized_sentence = tokenizer(prompt, return_tensors='pt')  # type: ignore
    generated_ids = model.generate(tokenized_sentence['input_ids'], max_new_tokens=100)
    sentence_decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    sample_results['few_shot'] = sentence_decoded

# Save under a checkpoint-specific name. The ROUGE cell loads
# 't5_base_ICL_dict_results_holder.pickle' and 't5_large_ICL_dict_results_holder.pickle',
# so a fixed 'dict_results_holder.pickle' would never be picked up on a fresh run and
# would be overwritten when switching checkpoints.
model_size = model_name.rsplit('-', 1)[-1]  # e.g. 'google/flan-t5-large' -> 'large'
save_pickle(dict_results_holder, f't5_{model_size}_ICL_dict_results_holder.pickle')


In [20]:
def get_rouge_scores(dict_results_holder):
    """Compute and print aggregated ROUGE scores for each prompting strategy.

    Args:
        dict_results_holder: Mapping whose values are dicts containing the keys
            'Human_summary', 'zeroshot', 'one_shot' and 'few_shot'.

    Returns:
        None. The zero-shot, one-shot and few-shot scores are printed on one
        formatted block.

    Raises:
        KeyError: If any entry lacks one of the four expected keys.
    """
    entries = list(dict_results_holder.values())

    # Collect every column up front so a missing key fails before any scoring runs.
    fields = ('Human_summary', 'zeroshot', 'one_shot', 'few_shot')
    columns = {field: [entry[field] for entry in entries] for field in fields}
    human_references = columns['Human_summary']

    import evaluate
    rouge = evaluate.load('rouge')

    # Score each strategy against the same human references, in a fixed order.
    scores = [
        rouge.compute(
            predictions=columns[strategy],
            references=human_references,
            use_aggregator=True,
            use_stemmer=True,
        )
        for strategy in ('zeroshot', 'one_shot', 'few_shot')
    ]
    zeroshot_rouge, oneshot_rouge, fewshot_rouge = scores

    print( "zero_short : {} \n one_short : {} \n  few_short : {} \n".format(zeroshot_rouge, oneshot_rouge, fewshot_rouge))



#rouge.compute(predictions)

### ROUGE score calculation

In [21]:
# Score the saved in-context-learning results of each checkpoint in turn.
for heading, results_path in (
    ("T5 BASE Rouge scores", 't5_base_ICL_dict_results_holder.pickle'),
    ("T5 Large Rouge scores", 't5_large_ICL_dict_results_holder.pickle'),
):
    print(heading)
    dict_results_holder = load_pickle(results_path)
    get_rouge_scores(dict_results_holder)
    print_line(100)


T5 BASE Rouge scores
zero_short : {'rouge1': 0.2714693191931299, 'rouge2': 0.08819247759237706, 'rougeL': 0.23776439735098698, 'rougeLsum': 0.23957339051104468} 
 one_short : {'rouge1': 0.21037399988098748, 'rouge2': 0.05469353119567932, 'rougeL': 0.18559400317231328, 'rougeLsum': 0.18686672342728244} 
  few_short : {'rouge1': 0.25296282344297, 'rouge2': 0.07823249760548293, 'rougeL': 0.2276731230434858, 'rougeLsum': 0.22920094244492306} 

----------------------------------------------------------------------------------------------------
T5 Large Rouge scores
zero_short : {'rouge1': 0.3066088576230827, 'rouge2': 0.10692799150952427, 'rougeL': 0.25387967561041735, 'rougeLsum': 0.2528241077739247} 
 one_short : {'rouge1': 0.30304847492450127, 'rouge2': 0.10420114942528737, 'rougeL': 0.25381356374884556, 'rougeLsum': 0.2504161782905666} 
  few_short : {'rouge1': 0.29064035082369977, 'rouge2': 0.11957729468599035, 'rougeL': 0.2546831696546925, 'rougeLsum': 0.25404069809116836} 

---------

## Observations
* The results for the t5-base model [220M] make some sense: smaller models aren't exactly great at ICL. Ideally the scores should improve with one-shot and few-shot ICL, but across all the scores zero-shot is the winner.
* With the t5-large model [770M], as expected, there is a slight improvement with few-shot ICL in the scores beyond ROUGE-1 (ROUGE-2, ROUGE-L and ROUGE-Lsum), because it is a relatively large model.

#