In [1]:
#filt hypothesis documents
import json
import torch 
from selfcheckgpt.modeling_selfcheck import SelfCheckNLI
import spacy
from transformers import AutoTokenizer, AutoModelForCausalLM
from numpy import *
from scipy.stats import entropy


nlp = spacy.load("autodl-tmp/en_core_web_sm")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
selfcheck_nli = SelfCheckNLI(device=device, nli_model='autodl-tmp/deberta-v3-large-mnli') # set device to 'cuda' if GPU is available

llm_model="autodl-tmp/LLM-Research/Meta-Llama-3-8B-Instruct"
tokenizer= AutoTokenizer.from_pretrained(llm_model)
model = AutoModelForCausalLM.from_pretrained(llm_model)
model = model.eval()
model = model.to(device)
 
with open('dl19/hypothesis_documents_dl19_8', 'r') as file:
    query_hypothesisDocuments_dl19 = json.load(file)
    
hypothesis_documents_only=[doc[2:] for doc in query_hypothesisDocuments_dl19]

NLI_score=0.85
fact_score=0.85

for g in range(len(hypothesis_documents_only)):
    filted_passages=[]
    hypothesis_documents=hypothesis_documents_only[g]
    for l in range(len(hypothesis_documents)):
        sents_probs={}
        passage=hypothesis_documents[l]
        passage =  '"' * 3 + passage + '"' * 3 
        passage=passage.replace("\n", " ").replace("\t", " ").strip()
        sentences = [sent for sent in nlp(passage).sents] # List[spacy.tokens.span.Span]
        sentences = [sent.text.strip() for sent in sentences if len(sent) > 3]

        hypothesis_documents_left=[x for i, x in enumerate(hypothesis_documents) if i != l]
                
        sent_scores_nli = selfcheck_nli.predict(
            sentences = sentences,                          # list of sentences
            sampled_passages = hypothesis_documents_left, # list of sampled passages
        )
        filted_sentences_NLI=[sentence for sentence,sent_score in zip(sentences, sent_scores_nli) if sent_score<NLI_score]
        filted_sentences_fact=[]
        for i in range(len(sentences)):
            input = tokenizer(sentences[i], return_tensors="pt").to(device)
            output1=model(input.input_ids,output_attentions=True)
            logits = output1.logits
            prob = torch.softmax(logits, dim=-1)[0]
            probcpu=prob.cpu().detach().numpy()
            entropies=entropy(prob.cpu().detach().numpy(), base=2,axis=-1)
            attentions=output1.attentions
            attentions = attentions[-1][0]
            mean_atten = torch.sum(attentions, dim=1)
            mean_atten = torch.mean(mean_atten, dim=0)
            for k in range(mean_atten.shape[0]):
                mean_atten[k] /= (mean_atten.shape[0] - k)
            mean_atten=mean_atten.cpu().detach().numpy()
            sent_entropyAtten=entropies[1:]@mean_atten[1:]/len(mean_atten[1:])
            sent_probs=[]
            for k in range(input.input_ids.size()[1]-1):
                sent_probs.append(probcpu[k+1,input.input_ids[0][k+1]].astype(float))
            sents_probs[sentences[i]]=sent_probs
            if sent_entropyAtten<fact_score:
                filted_sentences_fact.append(sentences[i])
        filted_sentences=list(set(filted_sentences_NLI) & set(filted_sentences_fact))
        num_tokens=0
        tokens_probs=0
        for p in range(len(filted_sentences)):
            num_tokens+=len(sents_probs[filted_sentences[p]])
            for q in range(len(sents_probs[filted_sentences[p]])):
                tokens_probs+=sents_probs[filted_sentences[p]][q]
        if num_tokens!=0:
            passage_prob=tokens_probs/num_tokens
        else:
            passage_prob=0
        filted_passage = ''.join(filted_sentences)
        filted_passages.append([filted_passage,passage_prob])
        query_hypothesisDocuments_dl19[g][2:]=filted_passages

print('Filted.')

  return self.fget.__get__(instance, owner)()


SelfCheck-NLI initialized to device cuda


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Filted.


In [2]:
#run BM25

from pyserini.search import FaissSearcher, LuceneSearcher
from pyserini.search.faiss import AutoQueryEncoder,AnceQueryEncoder
import numpy as np
from pyserini.search import get_topics, get_qrels
from tqdm import tqdm
import random

searcher = LuceneSearcher.from_prebuilt_index('msmarco-v1-passage')

with open('dl19-lucene-top1000-trec', 'w')  as f:
    for i in range(len(query_hypothesisDocuments_dl19)):
        qid=query_hypothesisDocuments_dl19[i][0]
        question=query_hypothesisDocuments_dl19[i][1]
        hits = searcher.search(question, k=1000)
        rank = 0
        for hit in hits:
            rank += 1
            f.write(f'{qid} Q0 {hit.docid} {rank} {hit.score} rank\n')

!python -m pyserini.eval.trec_eval -c -l 2 -m map dl19-passage dl19-lucene-top1000-trec
!python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 dl19-passage dl19-lucene-top1000-trec
!python -m pyserini.eval.trec_eval -c -l 2 -m recall.1000 dl19-passage dl19-lucene-top1000-trec

Oct 09, 2024 9:48:50 AM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-l', '2', '-m', 'map', '/root/.cache/pyserini/topics-and-qrels/qrels.dl19-passage.txt', 'dl19-lucene-top1000-trec']
Results:
map                   	all	0.3013


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-m', 'ndcg_cut.10', '/root/.cache/pyserini/topics-and-qrels/qrels.dl19-passage.txt', 'dl19-lucene-top1000-trec']
Results:
ndcg_cut_10           	all	0.5058


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-l', '2', '-m', 'recall.1000', '/root/.cache/pyserini/topics-and-qrels/qrels.dl19-passage.txt', 'dl19-lucene-top1000-trec']
Results:
recall_1000           	all	0.7501


In [3]:
#run BM25+GOLFer

ratio=0.75

searcher = LuceneSearcher.from_prebuilt_index('msmarco-v1-passage')

with open('dl19-lucene-top1000-trec_GOLFer', 'w')  as f:
    for i in range(len(query_hypothesisDocuments_dl19)):
        qid=query_hypothesisDocuments_dl19[i][0]
        query=query_hypothesisDocuments_dl19[i][1]+'.'
        coe=int(ratio*5*8)
        query=query*coe
        hypothesis_documents=[x[0] for x in query_hypothesisDocuments_dl19[i][2:]]
        hypothesis_documents=''.join(hypothesis_documents)
        hits = searcher.search(query+hypothesis_documents, k=1000)
        rank = 0
        for hit in hits:
            rank += 1
            f.write(f'{qid} Q0 {hit.docid} {rank} {hit.score} rank\n')

!python -m pyserini.eval.trec_eval -c -l 2 -m map dl19-passage dl19-lucene-top1000-trec_GOLFer
!python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 dl19-passage dl19-lucene-top1000-trec_GOLFer
!python -m pyserini.eval.trec_eval -c -l 2 -m recall.1000 dl19-passage dl19-lucene-top1000-trec_GOLFer

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-l', '2', '-m', 'map', '/root/.cache/pyserini/topics-and-qrels/qrels.dl19-passage.txt', 'dl19-lucene-top1000-trec_GOLFer']
Results:
map                   	all	0.4037


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-m', 'ndcg_cut.10', '/root/.cache/pyserini/topics-and-qrels/qrels.dl19-passage.txt', 'dl19-lucene-top1000-trec_GOLFer']
Results:
ndcg_cut_10           	all	0.6063


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-l', '2', '-m', 'recall.1000', '/root/.cache/pyserini/topics-and-qrels/qrels.dl19-passage.txt', 'dl19-lucene-top1000-trec_GOLFer']
Results:
recall_1000           	all	0.8241


In [4]:
#run ANCE
encoder = AnceQueryEncoder(encoder_dir='autodl-tmp/ance-msmarco-passage', pooling='mean')
searcher = FaissSearcher('autodl-tmp/msmarco-v1-passage.ance/', encoder)

topics = get_topics('dl19-passage')
qrels = get_qrels('dl19-passage')

with open('dl19-ance-top1000-trec', 'w')  as f:
    for qid in tqdm(topics):
        if qid in qrels:
            query = topics[qid]['title']
            hits = searcher.search(query, k=1000)
            rank = 0
            for hit in hits:
                rank += 1
                f.write(f'{qid} Q0 {hit.docid} {rank} {hit.score} rank\n')

!python -m pyserini.eval.trec_eval -c -l 2 -m map dl19-passage dl19-ance-top1000-trec
!python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 dl19-passage dl19-ance-top1000-trec
!python -m pyserini.eval.trec_eval -c -l 2 -m recall.1000 dl19-passage dl19-ance-top1000-trec

100%|██████████| 43/43 [01:33<00:00,  2.18s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-l', '2', '-m', 'map', '/root/.cache/pyserini/topics-and-qrels/qrels.dl19-passage.txt', 'dl19-ance-top1000-trec']
Results:
map                   	all	0.3710


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-m', 'ndcg_cut.10', '/root/.cache/pyserini/topics-and-qrels/qrels.dl19-passage.txt', 'dl19-ance-top1000-trec']
Results:
ndcg_cut_10           	all	0.6452


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-l', '2', '-m', 'recall.1000', '/root/.cache/pyserini/topics-and-qrels/qrels.dl19-passage.txt', 'dl19-ance-top1000-trec']
Results:
recall_1000           	all	0.7554


In [6]:
#run ANCE+GOLFer

ratio1=0.075
ratio2=0.25
max_tokens=128
coe=ratio1/128*max_tokens+ratio2


def encode_weight(query, hypothesis_documents_withweight,coe):
    coe_passage=(1-coe)/np.sum([[row[1]] for row in hypothesis_documents_withweight])
    prob=[[coe]]+[[row[1]*coe_passage] for row in hypothesis_documents_withweight]

    hypothesis_documents=[row[0] for row in hypothesis_documents_withweight]

    all_emb_c = []
    for hypothesis_document in [query]+hypothesis_documents:
        c=hypothesis_document
        c_emb = encoder.encode(c)
        all_emb_c.append(np.array(c_emb))
    all_emb_c = np.array(all_emb_c)
    weighted_emb_c = np.sum(prob*all_emb_c, axis=0)
    GOLFer_vector = weighted_emb_c.reshape((1, len(weighted_emb_c)))
    return GOLFer_vector
    
with open('dl19-ance-top1000-trec_GOLFer', 'w')  as f:
    for i in range(len(query_hypothesisDocuments_dl19)):
        qid=query_hypothesisDocuments_dl19[i][0]
        encodedByWeight=encode_weight(query_hypothesisDocuments_dl19[i][1],query_hypothesisDocuments_dl19[i][2:],coe)
        hits = searcher.search(encodedByWeight, k=1000)
        rank = 0
        for hit in hits:
            rank += 1
            f.write(f'{qid} Q0 {hit.docid} {rank} {hit.score} rank\n')

!python -m pyserini.eval.trec_eval -c -l 2 -m map dl19-passage dl19-ance-top1000-trec_GOLFer
!python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 dl19-passage dl19-ance-top1000-trec_GOLFer
!python -m pyserini.eval.trec_eval -c -l 2 -m recall.1000 dl19-passage dl19-ance-top1000-trec_GOLFer

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-l', '2', '-m', 'map', '/root/.cache/pyserini/topics-and-qrels/qrels.dl19-passage.txt', 'dl19-ance-top1000-trec_GOLFer']
Results:
map                   	all	0.4730


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-m', 'ndcg_cut.10', '/root/.cache/pyserini/topics-and-qrels/qrels.dl19-passage.txt', 'dl19-ance-top1000-trec_GOLFer']
Results:
ndcg_cut_10           	all	0.7120


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-l', '2', '-m', 'recall.1000', '/root/.cache/pyserini/topics-and-qrels/qrels.dl19-passage.txt', 'dl19-ance-top1000-trec_GOLFer']
Results:
recall_1000           	all	0.8049
