In [356]:
import json
import lucene
import numpy as np
import chinese_converter
import ir_measures
from ir_measures import *
from java.nio.file import Paths
from org.apache.lucene.analysis.fa import PersianAnalyzer
from org.apache.lucene.analysis.en import EnglishAnalyzer
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search.similarities import BM25Similarity
from sentence_transformers import CrossEncoder
from sklearn.metrics import mean_squared_error 

In [357]:
# Initialize PyLucene
try:
    lucene.initVM(lucene.CLASSPATH, maxheap='3g')
except Exception as e:
    print(e)

JVM is already running, options are ineffective


In [358]:
with open('dict_docid_lang.txt') as f: 
    data = f.read() 
dict_docid_lang = json.loads(data)

In [359]:
Num_of_Retrieved_Docs = 1000
dict_clir_id_text = {}
dict_topic_lang_score_normal_rel = {}
dict_qrels_topic_id_doc_id = {}
dict_topic_id_topic_description_eng = {}
dict_user_query = {}
dict_id_english_text = {}

In [360]:
# Calculate Discounted Cumulative Gain (DCG)
def calculate_dcg(relevance_scores):
    return np.sum([((2 ** rel) - 1) / np.log2(rank + 2) for rank, rel in enumerate(relevance_scores)])

def min_max_normalization(scores):
    min_score = min(scores)
    max_score = max(scores)
    normalized_scores = [(score - min_score) / (max_score - min_score) for score in scores]
    return normalized_scores

In [361]:
def get_ratios_doclist(dict_docid_lang, doclist):
    count_fas = 0
    count_rus = 0
    count_zho = 0
    for docid in doclist: 
        if dict_docid_lang[docid] == 'fas': 
            count_fas = count_fas + 1
        elif dict_docid_lang[docid] == 'rus': 
            count_rus = count_rus + 1
        elif dict_docid_lang[docid] == 'zho': 
            count_zho = count_zho + 1
    total = count_fas + count_rus + count_zho
    return count_fas/total, count_rus/total, count_zho/total

In [362]:
def get_ratios_run_file(dict_docid_lang, run_file, k=0):
    ratios_fas = []
    ratios_rus = []
    ratios_zho = []
    dict_qid_doclist = {}
    with open(run_file) as f: 
        for line in f:
            data = line.split()
            qid = data[0]
            if qid not in dict_qid_doclist:
                dict_qid_doclist[qid] = []
            dict_qid_doclist[qid].append(data[2])
    for qid in dict_qid_doclist:
        doclist = dict_qid_doclist[qid]
        if k == 0:
            k = len(doclist)
        doclist = doclist[:k]
        r_fas, r_rus, r_zho = get_ratios_doclist(dict_docid_lang, doclist)
        ratios_fas.append(r_fas)
        ratios_rus.append(r_rus)
        ratios_zho.append(r_zho)
    return sum(ratios_fas)/len(ratios_fas), sum(ratios_rus)/len(ratios_rus), sum(ratios_zho)/len(ratios_zho)        

In [363]:
def alpha_dcg(relevance, ranking, alpha=0.5):
    dcg = 0.0
    topic_coverage = {}
    for rank, doc_id in enumerate(ranking):
        if doc_id not in relevance:
            continue        
        scores = relevance[doc_id]
        gain_sum = 0.0
        for topic_idx, score in enumerate(scores):
            if score > 0:
                if topic_idx not in topic_coverage:
                    topic_coverage[topic_idx] = 0
                gain = score * (1 - alpha) ** topic_coverage[topic_idx]
                gain_sum += gain
                topic_coverage[topic_idx] += 1
        dcg += gain_sum / np.log2(rank + 2) 
    return dcg

def ideal_alpha_dcg(relevance, alpha=0.5):
    sorted_docs = sorted(relevance.keys(), key=lambda doc_id: sum(relevance[doc_id]), reverse=True)
    return alpha_dcg(relevance, sorted_docs, alpha)

def alpha_ndcg(relevance, ranking, alpha=0.5):
    dcg = alpha_dcg(relevance, ranking, alpha)
    idcg = ideal_alpha_dcg(relevance, alpha)
    return dcg / idcg if idcg > 0 else 0.0

In [364]:
def get_doc_rel_lang(dict_docid_lang, dict_qrels_topic_id_doc_id, qid, docid):
    rel_fas_rus_zho = [0, 0, 0]
    rel = 0
    if docid in dict_qrels_topic_id_doc_id[qid]: 
        rel = dict_qrels_topic_id_doc_id[qid][docid]
    if dict_docid_lang[docid] == 'fas':
        rel_fas_rus_zho[0] = rel
    elif dict_docid_lang[docid] == 'rus':
        rel_fas_rus_zho[1] = rel
    elif dict_docid_lang[docid] == 'zho':
        rel_fas_rus_zho[2] = rel
    return rel_fas_rus_zho    

In [365]:
def get_alpha_ndcg_run_file(dict_docid_lang, dict_qrels_topic_id_doc_id, run_file, alpha, k=0):
    alpha_ndcgs_list = []
    dict_qid_doclist = {}
    with open(run_file) as f: 
        for line in f:
            data = line.split()
            qid = data[0]
            if qid not in dict_qid_doclist:
                dict_qid_doclist[qid] = []
            dict_qid_doclist[qid].append(data[2])
    for qid in dict_qid_doclist:
        doclist = dict_qid_doclist[qid]
        if k == 0:
            k = len(doclist)
        doclist = doclist[:k]
        relevance = {}
        for docid in doclist:
            relevance[docid] = get_doc_rel_lang(dict_docid_lang, dict_qrels_topic_id_doc_id, qid, docid)
        alpha_ndcgs_list.append(alpha_ndcg(relevance, doclist, alpha))
    return sum(alpha_ndcgs_list)/len(alpha_ndcgs_list)

In [366]:
topics = [t for t in map(json.loads, open("neuclir-2023-topics.0605.jsonl"))]
for t in topics:
    for topic in t['topics']: 
        if topic['lang'] == 'eng':
            dict_topic_id_topic_description_eng[t['topic_id']] = topic['topic_description']

In [367]:
with open("qrels.final", "r") as fp:
    lines = fp.readlines()
    for line in lines: 
        cols = line.strip().split()
        topic_id = cols[0]
        doc_id = cols[2]
        qrels = int(cols[3])
        if topic_id in dict_qrels_topic_id_doc_id: 
            dict_qrels_topic_id_doc_id[topic_id][doc_id] = qrels
        else: 
            dict_qrels_topic_id_doc_id[topic_id] = {}
            dict_qrels_topic_id_doc_id[topic_id][doc_id] = qrels                

In [368]:
dict_user_query = {}
for t in topics:
    user_query = {}
    for topic in t['topics']: 
        if topic['lang'] == 'eng': 
            user_query[topic['lang']] = topic['topic_title']
        else: 
            if topic['source'] == 'google translation':
                if topic['lang'] == 'zho':
                    user_query[topic['lang']] = chinese_converter.to_simplified(topic['topic_title'])
                    #user_query[topic['lang']] = topic['topic_title']
                else:
                    user_query[topic['lang']] = topic['topic_title']
    dict_user_query[t['topic_id']] = user_query

In [369]:
persian_analyzer = PersianAnalyzer()
english_analyzer = EnglishAnalyzer()
whitespace_analyzer = WhitespaceAnalyzer()

# Open the Lucene indexes
index_directory_fas = FSDirectory.open(Paths.get("/path/neuclir1/index_fas"))
index_directory_eng = FSDirectory.open(Paths.get("/path/neuclir1/index_eng"))
# Use IndexSearcher to search the index
index_searcher_fas = IndexSearcher(DirectoryReader.open(index_directory_fas))
index_searcher_fas.setSimilarity(BM25Similarity())
index_searcher_eng = IndexSearcher(DirectoryReader.open(index_directory_eng))
index_searcher_eng.setSimilarity(BM25Similarity())
# Create a QueryParser with the same analyzer used during indexing
query_parser_fas = QueryParser("titletext", persian_analyzer)
query_parser_eng = QueryParser("titletext", english_analyzer)
query_parser_id = QueryParser("id", whitespace_analyzer)

In [372]:
for key in dict_user_query: 
    query_topic_id = key
    user_query = dict_user_query[key]
    # Parse the user's query
    if 'fas' not in user_query:
        continue
    #print(key, user_query['fas'])
    parsed_query_fas = query_parser_fas.parse(user_query['fas'])    
    search_results_fas = index_searcher_fas.search(parsed_query_fas, Num_of_Retrieved_Docs)
    results_scores_fas = []
    results_scores_fas_id = []
    dict_id_score = {}
    # Process search results
    for score_doc in search_results_fas.scoreDocs:
        results_scores_fas.append(score_doc.score)
        doc_id = score_doc.doc
        #print(doc_id)
        doc = index_searcher_fas.doc(doc_id)
        id = doc.get("id")
        #title = doc.get("title")
        text = doc.get("titletext")
        #url = doc.get("url")
        dict_clir_id_text[id] = text
        results_scores_fas_id.append(id)
        dict_id_score[id] = score_doc.score
    ranked_list = results_scores_fas_id
    #print(ranked_list)
    relevance_scores = []
    for i in range(len(ranked_list)):
        doc_id = ranked_list[i]
        if doc_id in dict_qrels_topic_id_doc_id[query_topic_id]: 
            relevance_scores.append(dict_qrels_topic_id_doc_id[query_topic_id][doc_id])
        else: 
            relevance_scores.append(0)
    if key not in dict_topic_lang_score_normal_rel:
        dict_topic_lang_score_normal_rel[key] = {}
    dict_topic_lang_score_normal_rel[key]['fas'] = []
    dict_topic_lang_score_normal_rel[key]['fas'].append(list(zip(results_scores_fas, relevance_scores)))
    dict_topic_lang_score_normal_rel[key]['fas'].append(list(zip(min_max_normalization(results_scores_fas), relevance_scores)))
    dict_topic_lang_score_normal_rel[key]['fas'].append(ranked_list)
    dcg = calculate_dcg(relevance_scores)
    # Calculate Ideal Discounted Cumulative Gain (IDCG)
    ideal_relevance_scores = sorted(relevance_scores, reverse=True)
    idcg = calculate_dcg(ideal_relevance_scores)
    # Calculate NDCG score
    ndcg = dcg / idcg
    # print("Discounted Cumulative Gain (DCG):", dcg)
    # print("Ideal Discounted Cumulative Gain (IDCG):", idcg)
    # print("NDCG score:", ndcg)

  ndcg = dcg / idcg


In [377]:
with open('fas-teamli-1ANPS_run1', 'w') as f: 
    for key in dict_topic_lang_score_normal_rel:
        ranked_docs = dict_topic_lang_score_normal_rel[key]['fas'][2]
        for i in range(len(ranked_docs)):
            print(key, 'Q0', ranked_docs[i], i+1, dict_topic_lang_score_normal_rel[key]['fas'][1][i][0], 'fas-teamli-1ANPS_run1', file=f)

In [478]:
#qrels = ir_measures.read_trec_qrels('qrels.final.gains.fas')
qrels = ir_measures.read_trec_qrels('qrels.final.gains.fas')
run = ir_measures.read_trec_run('fas-teamli-1ANPS_run1')
ir_measures.calc_aggregate([nDCG@20, nDCG@50, R@20, R@50, R@100, R@1000], qrels, run)

{R@100: 0.4665427333962955,
 nDCG@20: 0.2744568858187121,
 R@50: 0.3869539562875874,
 R@1000: 0.6710477383378854,
 R@20: 0.2766463566668324,
 nDCG@50: 0.3097875923490143}

In [449]:
print(get_alpha_ndcg_run_file(dict_docid_lang, dict_qrels_topic_id_doc_id, 'fas-teamli-1ANPS_run1', 0.5, k=20))

0.5386605526118783


In [392]:
def get_translated_title_text_by_id(doc_id, index_searcher_eng, query_parser_id): 
    query_id = query_parser_id.parse(doc_id)
    #print(query_id)
    search_results = index_searcher_eng.search(query_id, 1)
    str_to_get = ''
    for score_doc in search_results.scoreDocs:
        tmp_id = score_doc.doc
        doc = index_searcher_eng.doc(tmp_id)
        str_to_get = str_to_get + doc.get("titletext") + ' '
    return str_to_get

In [393]:
dict_topic_title_english_doc_score1 = {}
#model1 = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", max_length=512)
model1 = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2", max_length=512)
for topic_id in dict_topic_lang_score_normal_rel:
    doc_id_list = dict_topic_lang_score_normal_rel[topic_id]['fas'][2]
    #query = dict_user_query[topic_id]['eng']
    query = dict_topic_id_topic_description_eng[topic_id]
    #query = dict_user_query[topic_id]['eng']+ ' ' +dict_topic_id_topic_description_eng[topic_id]
    english_docs_list = []
    query_list = []
    for i in range(len(doc_id_list)):
        tmp_id = doc_id_list[i]
        english_docs_list.append(get_translated_title_text_by_id(tmp_id, index_searcher_eng, query_parser_id))
        query_list.append(query)
    scores = model1.predict(list(zip(query_list, english_docs_list)))
    dict_topic_title_english_doc_score1[topic_id] = sorted(zip(doc_id_list, scores), key = lambda x : x[1], reverse = True)

In [394]:
with open('clir-test_run_fas', 'w') as f: 
    for topic_id in dict_topic_title_english_doc_score1:
        ranked_docs_socres = dict_topic_title_english_doc_score1[topic_id]
        for i in range(len(ranked_docs_socres)):
            print(topic_id, 'Q0', ranked_docs_socres[i][0], i+1, ranked_docs_socres[i][1], 'clir-test_run_fas', file=f) 

In [395]:
qrels = ir_measures.read_trec_qrels('qrels.final.gains.fas')
run = ir_measures.read_trec_run('clir-test_run_fas')
ir_measures.calc_aggregate([nDCG@20, nDCG@50, R@20, R@50, R@100, R@1000], qrels, run)

{R@100: 0.5534226419746273,
 nDCG@20: 0.32340445680512164,
 R@50: 0.4496189095787948,
 R@1000: 0.7313655891545983,
 R@20: 0.34382297027372,
 nDCG@50: 0.35997865915194566}

In [396]:
print(get_ratios_run_file(dict_docid_lang, 'clir-test_run_fas', 1000))

(0.0, 0.0, 1.0)


In [491]:
dict_topic_title_english_doc_score2 = {}
model2 = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", max_length=512)
#model2 = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2", max_length=512)
for topic_id in dict_topic_lang_score_normal_rel:
    doc_id_list = dict_topic_lang_score_normal_rel[topic_id]['fas'][2]
    #query = dict_user_query[topic_id]['eng']
    #query = dict_topic_id_topic_description_eng[topic_id]
    query = dict_user_query[topic_id]['eng']+ ' ' +dict_topic_id_topic_description_eng[topic_id]
    english_docs_list = []
    query_list = []
    for i in range(len(doc_id_list)):
        tmp_id = doc_id_list[i]
        english_docs_list.append(get_translated_title_text_by_id(tmp_id, index_searcher_eng, query_parser_id))
        query_list.append(query)
    scores = model2.predict(list(zip(query_list, english_docs_list)))
    dict_topic_title_english_doc_score2[topic_id] = sorted(zip(doc_id_list, scores), key = lambda x : x[1], reverse = True)

with open('clir-test_run_fas2', 'w') as f: 
    for topic_id in dict_topic_title_english_doc_score2:
        ranked_docs_socres = dict_topic_title_english_doc_score2[topic_id]
        for i in range(len(ranked_docs_socres)):
            print(topic_id, 'Q0', ranked_docs_socres[i][0], i+1, ranked_docs_socres[i][1], 'clir-test_run_fas2', file=f) 

In [492]:
qrels = ir_measures.read_trec_qrels('qrels.final.gains.fas')
run = ir_measures.read_trec_run('clir-test_run_fas2')
ir_measures.calc_aggregate([nDCG@20, nDCG@50, R@20, R@50, R@100, R@1000], qrels, run)

{R@100: 0.5618318693519773,
 nDCG@20: 0.40661098786265454,
 R@50: 0.49889639907559025,
 R@1000: 0.6710477383378854,
 R@20: 0.39183452537224955,
 nDCG@50: 0.43631357430903334}

In [494]:
print(get_alpha_ndcg_run_file(dict_docid_lang, dict_qrels_topic_id_doc_id, 'clir-test_run_fas2', 0.5, k=20))

0.6600738495919617


In [None]:
# Close the Lucene index readers
index_searcher_fas.getIndexReader().close()
index_searcher_eng.getIndexReader().close()

# Close the Lucene index directories
index_directory_fas.close()
index_directory_eng.close()