In [1]:
# Evaluation split to run. Supported file names:
#   "hotpotqa_valid_original_split.csv", "gooaq_valid_original_split.csv",
#   or "msmarco_valid_original_split.csv".
dataset_name = "msmarco_valid_original_split.csv"

# CSV file holding the questions (and contexts) fed to the model.
path = "../../dataset/test/{}".format(dataset_name)

# Directory that receives the generated answers and the metrics file.
out_dir = "../../output/Llama2-7B-results/pretrained"

# Model identifier stored as the 'checkpoint' field of the results JSON.
# NOTE(review): the original note also lists "msmarco-original-split/",
# "hotpotqa-original-split/" and "gooaq-original-split/" as model names --
# presumably the fine-tuned checkpoints; confirm before switching.
model_name = "TheBloke/Llama-2-7b-Chat-GPTQ"

### Configuration of Llama2-7b Model: 

In [2]:
from transformers import AutoTokenizer, pipeline, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import time

# Reuse the identifier from the configuration cell so the model that is loaded
# is always the one recorded as 'checkpoint' in the results file.
model_name_or_path = model_name
model_basename = "model"

# Triton kernels are disabled for this run.
use_triton = False

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

# Load the GPTQ-quantized weights (safetensors) onto the first GPU.
# quantize_config=None: no explicit quantization config is supplied here.
# NOTE(review): `skip_special_tokens` looks like a decoding option rather than
# a loader option; it is kept to preserve the current call -- confirm whether
# `from_quantized` actually uses it.
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename=model_basename,
    use_safetensors=True,
    trust_remote_code=True,
    device="cuda:0",
    skip_special_tokens=True,
    use_triton=use_triton,
    quantize_config=None,
)

  from .autonotebook import tqdm as notebook_tqdm
skip module injection for FusedLlamaMLPForQuantizedModel not support integrate without triton yet.


In [3]:
# Prevent printing spurious transformers errors when using pipeline with AutoGPTQ.
logging.set_verbosity(logging.CRITICAL)

# Text-generation pipeline wrapping the quantized model and its tokenizer.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    # Sampling with a low temperature and a small top-k candidate set.
    do_sample=True,
    top_k=10,
    temperature=0.1,
    # Only `max_new_tokens` bounds the output length. The former
    # `max_length=1000` was redundant: when both are given, `max_new_tokens`
    # takes precedence, and passing both triggers a warning (an error on older
    # transformers releases).
    max_new_tokens=500,
    repetition_penalty=1.1,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    # The EOS token id is reused as the padding token id.
    pad_token_id=tokenizer.eos_token_id,
)

In [4]:
def run_model(input_string):
    """Generate the model's answer for a single prompt.

    Args:
        input_string: Prompt text passed to the text-generation pipeline.

    Returns:
        The text of the first generated sequence, without the echoed prompt.
    """
    # Ask the pipeline for the continuation only. Stripping the prompt with
    # str.replace() removed *every* occurrence of the prompt (including
    # legitimate repeats inside the answer) and silently left the prompt in
    # place whenever the decoded text did not match the input exactly.
    response = pipe(input_string, return_full_text=False)

    return response[0]["generated_text"]

### Load the dataset: 

In [5]:
import pandas as pd
import os
import json
import math

In [6]:
# In the HotpotQA and MS MARCO datasets, the context is appended to the question: input_query = row.question_processed + '\n' + row.context_processed
# In the GooAQ dataset, input_query contains only the question, since no context is provided.


In [None]:
# Load the evaluation split; each row provides a question and, for some
# datasets, a supporting context.
dataset_df = pd.read_csv(path)

# Generate one answer per row, in row order.
generated_answers = []
for row in dataset_df.itertuples():

    # getattr() tolerates a dataset that has no context column at all.
    context = getattr(row, 'context_processed', None)

    # pd.read_csv loads empty CSV fields as NaN, not as '', so a plain
    # `== ''` check never matches and `str + NaN` raises TypeError.
    if pd.isna(context) or context == '':
        input_query = row.question_processed
    else:
        input_query = row.question_processed + '\n' + str(context)

    generated_answers.append(run_model(input_query))

dataset_df['generated_answer'] = generated_answers

# Make sure the output directory exists before writing the predictions.
os.makedirs(out_dir, exist_ok=True)
dataset_df.to_csv(out_dir + '/' + dataset_name)



### Evaluation using EM and F1 metrics: 

In [None]:
import sys
sys.path.append('../../src')

import measures

In [None]:
 # Results are written straight into out_dir; the working directory is left untouched.
def evaluate_unifiedqa(predictions, answers):
    """Score the predictions and save the results as JSON in ``out_dir``.

    The output file is named after ``dataset_name`` with a ``.json``
    extension and contains the checkpoint name (``model_name``), the metrics
    returned by ``measures.all_metrics`` and the raw predictions.

    Args:
        predictions: Generated answers, one per example.
        answers: Reference answers passed to ``measures.all_metrics``
            together with ``predictions``.
    """
    result = {
        'checkpoint': model_name,
        'metrics': measures.all_metrics(predictions, answers),
        'predictions': predictions,
    }

    # splitext() only drops the final extension, so dataset names that
    # contain extra dots keep their full stem.
    filename = os.path.splitext(dataset_name)[0] + ".json"

    # Write with an explicit path instead of os.chdir(out_dir): changing the
    # working directory to a relative path made this cell fail on re-run and
    # broke every other relative path used in the notebook.
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, filename), 'w') as file:
        json.dump(result, file, indent=4)

    print('results saved at ', out_dir + "/" + filename)

In [None]:
# Collect the model outputs and the reference answers from the results frame.
gold_answers = dataset_df['answer_processed'].tolist()
predictions = dataset_df['generated_answer'].tolist()

# Each reference answer is wrapped in its own single-element list before scoring.
answers = [[gold] for gold in gold_answers]

evaluate_unifiedqa(predictions, answers)

In [None]:
# Marker showing that every cell above has finished running.
print('done')