In [6]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda" # the device to load the model onto
llm = "NousResearch/Nous-Hermes-2-Mistral-7B-DPO"
llm_name = llm.replace('/','_').replace('-','_')
model = AutoModelForCausalLM.from_pretrained(
    llm,
    torch_dtype=torch.float16,
  
)
model =model.to('cuda')

tokenizer = AutoTokenizer.from_pretrained(llm,use_fast=False)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# clone this repo to have data
!git clone https://github.com/DAMO-NLP-SG/LLM_summeval

fatal: destination path 'LLM_summeval' already exists and is not an empty directory.


In [8]:
#!pip install transformers tqdm pandas numpy
#!pip install sentencepiece

In [9]:
#!ls LLM_summeval/

In [10]:
key_mapping={0:'relevance',1:'consistency',2:'fluency',3:'coherence'}

In [11]:

# prompts from LLM_summeval repository https://github.com/DAMO-NLP-SG/LLM_summeval
def prepare_mcq_prompt2(aspect_id, summary, article=None, order = [0,1,2,3,4]):
    if aspect_id == 0:
        scores = 'A: The Summary is totally irrelevant to the Article. Score: One.\nB: The majority of the Summary is irrelevant to the Article. Score: Two.\nC: Some information in the Summary is relevant to the Article whereas some are not. Score: Three.\nD: The majority of the Summary is relevant to the Article. Score: Four.\nE: All information included in the Summary is relevant to the Article. Score: Five.\n\nYour Answer (enter 1 letter from A to E):'
        prompt = f'Choose an option from A to E in order to score the following Summary given the corresponding Article with respect to relevance from one to five, where one indicates "irrelevance", and five indicates "perfect relevance". Note that relevance measures the Summary\'s selection of important content from the Article, whether the Summary grasps the main message of the Article without being overwhelmed by unnecessary or less significant details.\n\nArticle: {article}\n\nSummary: {summary}\n\n'
        

    if aspect_id == 1:
        scores = 'A: The Summary is totally inconsistent with the Article. Score: One.\nB: The majority of the Summary is inconsistent with the Article. Score: Two.\nC: Some information in the Summary is consistent with the Article whereas some are not. Score: Three.\nD: The majority of the Summary is consistent with the Article. Score: Four.\nE: All information included in the Summary is consistent with the Article. Score: Five.\n\nYour Answer (enter 1 letter from A to E):'
        prompt = f'Choose an option from A to E in order to score the following Summary given the corresponding Article with respect to consistency from one to five, where one indicates "inconsistency" and five indicates "perfect consistency". Note that consistency measures the factual alignment between the Summary and the Article, whether the Summary is faithful to the Article without introducing contradictions or misleading representations.\n\nArticle: {article}\n\nSummary: {summary}\n\n'


    if aspect_id == 2: 
        scores = 'A: The Summary is totally disfluent. Score: One.\nB: The majority of the Summary is disfluent. Score: Two.\nC: Some sentences in the Summary are fluent whereas some are not. Score: Three.\nD: The majority of the Summary is fluent. Score: Four\nE: All sentences in the Summary are fluent. Score: Five.\n\nYour Answer (enter 1 letter from A to E):'
        prompt = f'Choose an option from A to E in order to score the following Summary given the corresponding Article with respect to fluency from one to five, where one indicates "disfluency" and five indicates "perfect fluency". Note that fluency measures the quality of individual sentences in the Summary, whether the Summary is well-written, grammatically correct, and readable on the sentence level.\n\nArticle: {article}\n\nSummary: {summary}\n\nA: The Summary is totally disfluent. Score: One.\nB: The majority of the Summary is disfluent. Score: Two.\nC: Some sentences in the Summary are fluent whereas some are not. Score: Three.\nD: The majority of the Summary is fluent. Score: Four\nE: All sentences in the Summary are fluent. Score: Five.\n\nYour Answer (enter 1 letter from A to E):'
    
    if aspect_id == 3:
        scores = 'A: The Summary is completely incoherent. Score: One.\nB: The Summary is mostly incoherent. Score: Two.\nC: The Summary is somewhat coherent. Score: Three.\nD: The Summary is mostly coherent. Score: Four.\nE: The Summary is completely coherent. Score: Five.\n\nYour Answer (enter 1 letter from A to E):'
        prompt = f'Choose an option from A to E in order to score the following Summary given the corresponding Article with respect to coherence from one to five, where one indicates "incoherence" and five indicates "perfect coherence". Note that coherence measures the collective quality of the Summary, whether the Summary presents information that flows smoothly and avoids abrupt transitions or disjoint statements.\n\nArticle: {article}\n\nSummary: {summary}\n\nA: The Summary is completely incoherent. Score: One.\nB: The Summary is mostly incoherent. Score: Two.\nC: The Summary is somewhat coherent. Score: Three.\nD: The Summary is mostly coherent. Score: Four.\nE: The Summary is completely coherent. Score: Five.\n\nYour Answer (enter 1 letter from A to E):'

    scores = scores.split('\n')
    scores = [scores[j] for j in order]
    return prompt + '\n'.join(scores)

In [12]:
import json
import pathlib

data=json.loads(
pathlib.Path('LLM_summeval/summeval.json').read_text())

In [13]:

keys = list(data.keys())

In [14]:
def denormalize_json(data):
    keys2 = ['src', 'ref_summs', 'sys_summs', 'ref_summ']
    article = data[keys[0]]['src']
    
    records = []
    for k,d in data.items():
        summaries = d['sys_summs']
        article = d['src']
    
        for k1, item in list(summaries.items()):
            record = {
            }
            record['article'] = article
            record['summary'] = item['sys_summ']
            record['key'] = k
            record['key2'] = k1
            for k in ['relevance', 'consistency', 'fluency', 'coherence']:
                record[k] = item['scores'][k]
                record['chatgpt_'+k] = item['scores']['chatgpt_'+k]
            records.append(record)
    

In [1]:
import torch
import re, numpy as np

value_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5}
TOP_N_TOKENS = 20
def process_prompt(text):
    messages = [      
        {"role": "user", "content": text}
    ]   

    text = tokenizer.apply_chat_template(
        messages, 
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)
    return model_inputs
def filter_token(tok):
    # remove punctuation
    tok = re.sub(r'[^\w\s]','',tok)
    return tok
def process_logits(model_out, T=10):

    logits = model_out.logits[:,-1,:].cpu().numpy()
    logits = logits/T
    top_args = logits.argsort()[0,-TOP_N_TOKENS:]

    labels = tokenizer.convert_ids_to_tokens(top_args)
    logits_chosen = np.exp(logits[0,top_args] - logits[0,top_args].max())
    logits_chosen = logits_chosen / logits_chosen.sum()

    #print(logits_chosen, labels)

    logits_chosen2 = [(filter_token(l),p) for (l,p) in list(zip(labels,logits_chosen)) if filter_token(l) in 'ABCDE' and filter_token(l) != '']
    #print(logits_chosen2)
    sum_logits = sum([p for l,p in logits_chosen2])
    logits_scaled = [(value_map.get(l),p/sum_logits) for l,p in logits_chosen2]

    argmax = max(logits_scaled, key=lambda x: x[1])[0]
    expected_value = sum([l*p for l,p in logits_scaled])
    return argmax, expected_value

In [None]:
import tqdm
import copy
import numpy as np
torch.no_grad()
order = [0,1,2,3,4]

for idx in [0,1,2,3]:
    idx_metric=idx
    for record in tqdm.tqdm(records[0:1600]):
        for ord_ in ['inorder','reverse','random']:
            ord_arr = {'inorder':order, 'reverse':order[::-1],'random':copy.copy(order)}[ord_]
            if ord_ == 'random':
                np.random.shuffle(ord_arr)
            txt = prepare_mcq_prompt2(idx, record['summary'], record['article'], ord_arr)
            model_inputs = process_prompt(txt)
            
            with torch.no_grad():
                model_out = model(model_inputs.input_ids)
                argmax, expected_value = process_logits(model_out, T=10)
                argmax2, expected_value2 = process_logits(model_out, T=1)
                record['argmax_'+ord_+'_' + key_mapping[idx_metric]] = argmax
                record['e(s)_10_' +ord_+'_' + key_mapping[idx_metric]] = expected_value
                record['e(s)_1_' +ord_+'_' + key_mapping[idx_metric]] = expected_value2
                del model_out

    import json
    with open(f'processed_data_{llm_name}_{idx}.json','w') as fp:
        json.dump(records,fp,indent=4)

100%|██████████| 1600/1600 [17:42<00:00,  1.51it/s]
100%|██████████| 1600/1600 [17:35<00:00,  1.52it/s]
 18%|█▊        | 288/1600 [03:37<15:34,  1.40it/s]

In [35]:
import json
with open(f'processed_data_{llm_name}.json','w') as fp:
    json.dump(records,fp)

expected_10_relevance
1.0    1600
Name: count, dtype: int64