In [2]:
import os
import json
from tqdm import tqdm

In [3]:
from collections import Counter

def word_overlap_match(query, candidates, threshold = 90):
    """Return the index of the first candidate that overlaps ``query`` enough.

    Both strings are split on whitespace and compared as multisets of words
    (case-sensitive, punctuation kept). The overlap score is the number of
    shared words, counting repeats, divided by the word count of the longer
    of the two strings, expressed as a percentage.

    Args:
        query: Text to look for.
        candidates: Iterable of strings to compare against, in search order.
        threshold: Percentage the score must strictly exceed to count as a
            match.

    Returns:
        Index of the first candidate whose score is greater than
        ``threshold``, or -1 if none qualifies.
    """
    query_words = query.split()
    # The query does not change between candidates, so count its words once.
    query_count = Counter(query_words)

    for index, candidate in enumerate(candidates):
        candidate_words = candidate.split()
        # Multiset intersection keeps, per word, the smaller of the two counts.
        shared = sum((query_count & Counter(candidate_words)).values())
        longest = max(len(query_words), len(candidate_words))
        # Two word-less strings would otherwise divide by zero; score them 0.
        overlap_percentage = shared / longest * 100 if longest else 0.0
        if overlap_percentage > threshold:
            return index
    return -1  # Return -1 if no candidate meets the threshold

In [4]:
# Event name; the paper list is read from "<event_name>.json".
event_name = "emnlp-2024"

# Candidate identifiers, exactly one of which is active. Listing them once
# avoids a chain of assignments in which every value but the last is silently
# overwritten.
CONF_IDS = (
    "2024emnlp-main",
    "2024emnlp-demo",
    "2024emnlp-industry",
    "2024emnlp-tutorials",
)
conf_id = CONF_IDS[-1]  # "2024emnlp-tutorials"

In [5]:
# Read the JSON paper list for the selected event and report how many
# entries it contains.
with open(f'{event_name}.json', mode='r', encoding='utf8') as paper_file:
    paper_list = json.loads(paper_file.read())
print("Num papers: {}".format(len(paper_list)))

Num papers: 1444


In [6]:
# The title is the second field of each paper entry; keep the titles in the
# same order as paper_list so a match index can be used on both.
paper_titles = [entry[1] for entry in paper_list]

In [7]:
# Base name of the query file; the results file written later reuses it.
query_name = "query1"
# Read one query per line. The encoding is stated explicitly, as it is for the
# paper list, so non-ASCII characters decode the same way on every platform.
# Blank lines are skipped because they are not queries.
with open(f"queries/{query_name}.txt", "r", encoding="utf8") as f:
    queries = [line.strip() for line in f if line.strip()]
print(len(queries))

29


In [8]:
import requests
from bs4 import BeautifulSoup
def get_abstract_text(abs_url: str, timeout: float = 30.0) -> str:
    """Download a page and return the text of its abstract element.

    Args:
        abs_url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The text of the first element matching ``div.acl-abstract span``
        (extracted with ``get_text(strip=True)``), or the string
        "Abstract not found." when the page has no such element.

    Raises:
        requests.exceptions.RequestException: If the request fails, including
            ``requests.exceptions.Timeout`` when ``timeout`` elapses.
    """
    response = requests.get(abs_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    abstract_span = soup.select_one('div.acl-abstract span')
    if abstract_span is None:
        return "Abstract not found."
    return abstract_span.get_text(strip=True)

In [10]:
# Match every query against the paper titles and, for each hit, collect the
# paper's metadata together with its abstract fetched from the abstract URL.
search_results = []
found_count = 0
for query in tqdm(queries):
    match_idx = word_overlap_match(query, paper_titles, threshold=80)
    if match_idx != -1:
        found_count += 1
        paper = paper_list[match_idx]
        search_results.append({
            "query": query,
            "id": paper[0],
            "title": paper[1],
            "abstract_url": paper[2],
            "paper_url": paper[3],
            "abstract": get_abstract_text(paper[2]),
        })
print(len(search_results))

100%|██████████| 29/29 [00:10<00:00,  2.65it/s]

17





In [11]:
# Pretty-print each match as 4-space-indented JSON for a quick visual check.
for entry in search_results:
    pretty = json.dumps(entry, indent=4)
    print(pretty)

{
    "query": "Dense X Retrieval: What Retrieval Granularity Should We Use?",
    "id": "2024.emnlp-main.845",
    "title": "Dense X Retrieval: What Retrieval Granularity Should We Use?",
    "abstract_url": "https://aclanthology.org/2024.emnlp-main.845/",
    "paper_url": "https://aclanthology.org/2024.emnlp-main.845.pdf",
    "abstract": "Dense retrieval has become a prominent method to obtain relevant context or world knowledge in open-domain NLP tasks. When we use a learned dense retriever on a retrieval corpus at inference time, an often-overlooked design choice is the retrieval unit in which the corpus is indexed, e.g. document, passage, or sentence. We discover that the retrieval unit choice significantly impacts the performance of both retrieval and downstream tasks. Distinct from the typical approach of using passages or sentences, we introduce a novel retrieval unit, proposition, for dense retrieval. Propositions are defined as atomic expressions within text, each encapsulat

In [12]:
# Save the matches as tab-indented JSON under results/. The directory is
# created first so the write does not fail with FileNotFoundError when it is
# missing.
os.makedirs("results", exist_ok=True)
with open(f"results/{query_name}_search_results.json", "w", encoding="utf8") as f:
    json.dump(search_results, f, indent="\t")