### Specify the dataset:

In [1]:
# Dataset file to evaluate. Supported values:
#   "hotpotqa_valid_original_split.csv"
#   "gooaq_valid_original_split.csv"
#   "msmarco_valid_original_split.csv"
dataset_name = "msmarco_valid_original_split.csv"

# CSV file that is read as the evaluation set.
path = f"../../dataset/test/{dataset_name}"

# Directory that receives the generated answers and the metrics report.
out_dir = "../../output/Flan-XXL-results/COTs"

# Checkpoint name handed to `from_pretrained`. The original note also lists
# "msmarco-original-split/", "hotpotqa-original-split/" and
# "gooaq-original-split/" as model names (presumably the dataset-specific
# fine-tuned checkpoints -- TODO confirm).
model_name = "google/flan-t5-xxl"


In [2]:
# Chain-of-thought instruction; it is inserted between the question and the
# context when a prompt is assembled.
COTs_prompt = "Find the answer of the question from the context: \n"

### Configuration of the FLAN-XXL model:

In [3]:
# Requires the optional packages: pip install bitsandbytes accelerate
from transformers import BitsAndBytesConfig, T5Tokenizer, T5ForConditionalGeneration, set_seed

# Seed the random number generators before anything is loaded or generated.
set_seed(42)

tokenizer = T5Tokenizer.from_pretrained(model_name)

# 8-bit loading is requested through `quantization_config`; passing
# `load_in_8bit=True` directly to `from_pretrained` is deprecated.
# `device_map="auto"` leaves the placement of the weights to the library.
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Loading checkpoint shards: 100%|██████████| 5/5 [00:03<00:00,  1.30it/s]


In [4]:
def run_model(input_string, **generator_args):
    """Generate text for ``input_string`` with the notebook's ``model`` and ``tokenizer``.

    Args:
        input_string: Prompt text. It is tokenised with truncation to at most
            1000 tokens.
        **generator_args: Extra keyword arguments forwarded to
            ``model.generate``. ``max_new_tokens`` defaults to 200 and
            ``attention_mask`` defaults to the tokenizer's mask; both can be
            overridden here.

    Returns:
        The list produced by ``tokenizer.batch_decode`` for the generated
        sequences, with special tokens removed.
    """
    encoded = tokenizer(input_string, max_length=1000, truncation=True, return_tensors="pt")

    # The model is placed by `device_map="auto"`, so follow the model's device
    # instead of hard-coding "cuda".
    encoded = encoded.to(model.device)

    # Defaults only: a caller-supplied value wins instead of triggering a
    # "multiple values for keyword argument" TypeError.
    generator_args.setdefault("max_new_tokens", 200)
    generator_args.setdefault("attention_mask", encoded.attention_mask)

    res = model.generate(encoded.input_ids, **generator_args)

    return tokenizer.batch_decode(res, skip_special_tokens=True)

### Load the dataset and generate answers:

In [5]:
import pandas as pd
import os
import json


In [6]:
dataset_df = pd.read_csv(path)

# Build one prompt per row and keep the first decoded sequence as the answer.
generated_answers = []
for row in dataset_df.itertuples():
    context = row.context_processed

    # pd.read_csv turns empty CSV fields into NaN (a float). Comparing with ''
    # alone therefore never detects a missing context, and concatenating NaN
    # with a string raises TypeError -- so check for NaN explicitly as well.
    has_context = (not pd.isna(context)) and context != ''

    if has_context:
        input_query = row.question_processed + '\n' + COTs_prompt + context
    else:
        input_query = "Let's think step by step to answer the question: " + row.question_processed

    generated_answers.append(run_model(input_query)[0])

# Assign the whole column once instead of writing cell by cell with .loc.
dataset_df['generated_answer'] = generated_answers

# Make sure the output directory exists before writing into it.
os.makedirs(out_dir, exist_ok=True)
dataset_df.to_csv(out_dir + '/' + dataset_name)

### Evaluation using EM and F1 metrics: 

In [7]:
import sys

# Make the project's `src` directory importable so that `measures` resolves.
# The membership check keeps sys.path free of duplicate entries when the cell
# is executed more than once.
src_dir = '../../src'
if src_dir not in sys.path:
    sys.path.append(src_dir)

import measures

In [8]:
 os.chdir(out_dir)

def evaluate_unifiedqa(predictions, answers):
    
    result = {}
    result['checkpoint'] = model_name
    result['metrics'] = measures.all_metrics(predictions, answers)
    result['predictions'] = predictions

   
    filename=dataset_name.split('.')[0]
    filename=filename+".json"
    
    with open(filename, 'w+') as file:
        json.dump(result, file, indent=4)
        
    print ('results saved at ', out_dir+"/"+filename)

In [9]:
# Model outputs, in row order.
predictions = dataset_df['generated_answer'].tolist()

# Each reference answer is wrapped in its own single-element list before it is
# handed to the evaluation function.
answers = [[reference] for reference in dataset_df['answer_processed'].tolist()]

evaluate_unifiedqa(predictions, answers)

results saved at  ../../output/Flan-XXL-results/COTs/msmarco_valid_original_split.json


In [None]:
# Marker showing that every cell above has finished.
print('done')