### Specify the dataset:

In [1]:
# Evaluation split to run. Supported file names:
#   "hotpotqa_valid_original_split.csv", "gooaq_valid_original_split.csv",
#   "msmarco_valid_original_split.csv"
dataset_name = "msmarco_valid_original_split.csv"

# Relative location of the selected split.
path = f"../../dataset/test/{dataset_name}"

# Directory that receives the generated answers (CSV) and the metrics report (JSON).
out_dir = "../../output/Flan-T5-results/pretrained"

# Checkpoint handed to `from_pretrained`. "google/flan-t5-large" is the value used here;
# other names mentioned for this setting are "msmarco-original-split/",
# "hotpotqa-original-split/" and "gooaq-original-split/"
# (presumably fine-tuned checkpoint directories -- TODO confirm).
model_name = "google/flan-t5-large"

### Configuration of the Flan-T5 model:

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, set_seed
# Seed the random number generators through transformers' helper.
set_seed(42)

# Tokenizer and weights are both loaded from the checkpoint named by `model_name`.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
    load_in_8bit=True,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def run_model(input_string, **generator_args):
    """Generate text with the loaded model for the given prompt.

    Parameters
    ----------
    input_string : str
        Prompt passed to the tokenizer. Inputs longer than 1000 tokens are
        truncated.
    **generator_args
        Extra keyword arguments forwarded to ``model.generate``.
        ``max_new_tokens`` defaults to 200 and can be overridden here.

    Returns
    -------
    list of str
        The decoded generations with special tokens removed.
    """
    # Apply the default only when the caller did not choose a value; passing
    # max_new_tokens both explicitly and via **generator_args would raise a
    # duplicate-keyword TypeError.
    generator_args.setdefault("max_new_tokens", 200)

    encoded = tokenizer(
        input_string,
        max_length=1000,
        truncation=True,
        return_tensors="pt",
    )
    # The model is loaded with device_map="auto", so follow the device it was
    # actually placed on instead of assuming "cuda".
    input_ids = encoded.input_ids.to(model.device)

    res = model.generate(input_ids, **generator_args)

    return tokenizer.batch_decode(res, skip_special_tokens=True)

### Load the dataset and generate answers:

In [4]:
import pandas as pd
import os
import json

In [None]:
# Create the output directory up front so a missing folder is reported before
# the generation loop runs rather than at the final save.
os.makedirs(out_dir, exist_ok=True)

dataset_df = pd.read_csv(path)

# Collect one generated answer per row, then attach them as a single column.
generated_answers = []

for row in dataset_df.itertuples():
    context = row.context_processed

    # pandas parses empty CSV fields as NaN by default, so an empty context
    # shows up as a float NaN rather than ''. Treat both as "no context";
    # concatenating NaN to the question would raise a TypeError.
    if pd.notna(context) and context != '':
        input_query = row.question_processed + '\n' + context
    else:
        input_query = row.question_processed

    # run_model returns a list of decoded strings; keep the first one.
    generated_answers.append(run_model(input_query)[0])

dataset_df['generated_answer'] = generated_answers

dataset_df.to_csv(out_dir+'/'+dataset_name)

### Evaluation using EM and F1 metrics:

In [None]:
import sys
# Add the relative directory '../../src' to the module search path before
# importing `measures` (presumably the project's metrics module -- confirm).
sys.path.append('../../src')

import measures

In [None]:
 # The report is written directly into `out_dir`; the working directory is left untouched so the notebook's relative paths stay valid when cells are re-run.

def evaluate_unifiedqa(predictions, answers):
    """Score the predictions and save the report as JSON inside ``out_dir``.

    Parameters
    ----------
    predictions : list
        Generated answers; stored in the report and passed to
        ``measures.all_metrics``.
    answers : list
        Reference answers passed to ``measures.all_metrics`` alongside the
        predictions.

    The report contains the checkpoint name (``model_name``), the metrics and
    the predictions. It is written to ``<out_dir>/<stem>.json``, where
    ``<stem>`` is ``dataset_name`` up to its first dot.
    """
    result = {
        'checkpoint': model_name,
        'metrics': measures.all_metrics(predictions, answers),
        'predictions': predictions,
    }

    filename = dataset_name.split('.')[0] + ".json"

    # Build the destination explicitly instead of relying on the current
    # working directory, and make sure the folder exists.
    os.makedirs(out_dir, exist_ok=True)
    output_path = os.path.join(out_dir, filename)

    with open(output_path, 'w') as file:
        json.dump(result, file, indent=4)

    print ('results saved at ', out_dir+"/"+filename)

In [None]:
# Model outputs, in dataset order.
predictions = dataset_df['generated_answer'].tolist()

# Each reference answer is wrapped in its own single-element list before scoring.
answers = [[reference] for reference in dataset_df['answer_processed'].tolist()]

evaluate_unifiedqa(predictions, answers)


In [9]:
# Signal that every cell above has finished.
print('Done')

Done
