# ACL Anthology Crawler
* https://github.com/srhthu/ACL-Anthology-Crawler/blob/main/crawl.py

In [1]:

from bs4 import BeautifulSoup
import json
import numpy as np
import requests
import os
from tqdm import tqdm
from typing import List
import urllib.parse

In [2]:
# Root URL of the ACL Anthology site; the event page URL is built from it.
base_url = "https://aclanthology.org"

In [3]:
# Event slug as it appears after "events/" in the event page URL.
event_name = "emnlp-2024"
event_url = urllib.parse.urljoin(base_url, "events/{}".format(event_name))
print(event_url)

https://aclanthology.org/events/emnlp-2024


In [12]:
def get_conf_paper_list(soup, conf_id, base_url: str = "https://aclanthology.org") -> list:
    """Collect the papers listed under one volume section of an event page.

    Args:
        soup: Parsed event page (a BeautifulSoup document, or any object
            offering the same ``find`` interface).
        conf_id: ``id`` attribute of the ``<div>`` that holds the volume,
            e.g. ``"2024emnlp-main"``.
        base_url: Site root that each paper's link is resolved against.

    Returns:
        A list of ``[paper_id, title, abs_url, pdf_url]`` lists, in the order
        the entries appear on the page.

    Raises:
        ValueError: If the page has no ``<div>`` with id ``conf_id``, or if a
            paper entry does not end with the expected ``<span>``.
    """
    conf_div = soup.find('div', id=conf_id)
    if conf_div is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f"No <div id={conf_id!r}> found on the event page")

    paper_list = []
    for paper_p in conf_div.find_all('p', class_="d-sm-flex"):
        # The entry's first child wraps the PDF link as its own first child.
        pdf_url = paper_p.contents[0].contents[0]['href']
        # The entry's last child is the <span> holding the linked title.
        paper_span = paper_p.contents[-1]
        if getattr(paper_span, 'name', None) != 'span':
            # Explicit check rather than `assert`, which is removed under -O.
            raise ValueError(
                f"Unexpected paper entry layout in section {conf_id!r}: "
                "last child is not a <span>"
            )
        paper_a = paper_span.strong.a
        href = paper_a['href']
        # ID, title, abs_url, pdf_url
        paper_list.append([
            href.replace("/", ""),
            paper_a.get_text(),
            urllib.parse.urljoin(base_url, href),
            pdf_url,
        ])
    return paper_list
    
def get_paper_list(event_url: str, conf_ids: List[str], timeout: float = 30.0) -> list:
    """Download an event page and collect the papers of the given volumes.

    Args:
        event_url: URL of the event page to fetch.
        conf_ids: ``id`` attributes of the volume sections to read, in the
            order their papers should appear in the result.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The concatenation of ``get_conf_paper_list`` results for every id in
        ``conf_ids``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(event_url, timeout=timeout)
    # Stop here on 4xx/5xx rather than parsing an error page as the event page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    paper_list = []
    for conf_id in conf_ids:
        paper_list.extend(get_conf_paper_list(soup, conf_id))
    return paper_list

In [13]:
# Ids of the volume sections on the event page whose papers are collected.
conf_ids = ["2024emnlp-main", "2024emnlp-demo", "2024emnlp-industry"]
paper_list = get_paper_list(event_url=event_url, conf_ids=conf_ids)

In [14]:
# Peek at the first record; each record is [paper_id, title, abs_url, pdf_url].
paper_list[0]

['2024.emnlp-main.0',
 'Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing',
 'https://aclanthology.org/2024.emnlp-main.0/',
 'https://aclanthology.org/2024.emnlp-main.0.pdf']

In [15]:
# Earlier note: "Main: Num papers: 1269" — presumably the count when only the
# main volume was crawled; not re-verified here.
print(f"Num papers: {len(paper_list)}")

Num papers: 1444


In [19]:
## Dump
# Writes the collected records to "<event_name>.json" as a JSON array of
# [paper_id, title, abs_url, pdf_url] arrays, for example:
# [
#   [
#     "2024.emnlp-main.0",
#     "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
#     "https://aclanthology.org/2024.emnlp-main.0/",
#     "https://aclanthology.org/2024.emnlp-main.0.pdf"
#   ],
#   ...
# ]
with open('{}.json'.format(event_name), 'w', encoding='utf8') as out_file:
    # ensure_ascii=False keeps non-ASCII characters in titles readable.
    out_file.write(json.dumps(paper_list, indent=2, ensure_ascii=False))

In [None]:
## Getting Paper PDF File
# illegal_chr = r'\/:*?<>|'
# table = ''.maketrans('', '', illegal_chr)
# ## Sample
# paper_idx = 2
# paper = paper_list[paper_idx]

# r = requests.get(paper[3])
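# # ex. 2.Multi-News+ Cost-efficient Dataset Cleansing via LLM-based Data Annotation.pdf
# # (the example name is built from the title, paper[1], with characters illegal in file names removed)
# n = '{}.{}.pdf'.format(paper_idx, paper[1].translate(table))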
# with open(n, 'wb') as f:
#     f.write(r.content)

In [21]:
## Get Abstract
# Pick the sample paper here so this cell does not depend on the commented-out
# PDF download cell having been executed earlier in the session.
paper_idx = 2
paper = paper_list[paper_idx]
paper_abs_url = paper[2]  # third field of a record is the abstract page URL
# A timeout keeps the request from blocking forever; raise_for_status stops an
# HTTP error page from being parsed as if it were the paper page.
response = requests.get(paper_abs_url, timeout=30)
response.raise_for_status()
html_doc = response.text
soup = BeautifulSoup(html_doc, 'html.parser')

In [22]:
# Locate the abstract content.
# The <span> inside <div class="acl-abstract"> is selected rather than the
# whole div (soup.find('div', class_='acl-abstract')); presumably this avoids
# picking up the div's other children — verify against the page markup.
abstract_span = soup.select_one('div.acl-abstract span')
print(abstract_span)

# get_text(strip=True) strips every text fragment and joins the pieces with no
# separator, which glues words together around inline markup. Take the text as
# written and trim only its ends instead.
if abstract_span is not None:
    abstract_text = abstract_span.get_text().strip()
else:
    abstract_text = "Abstract not found."

# Print the extracted abstract
print(abstract_text)

<span>The quality of the dataset is crucial for ensuring optimal performance and reliability of downstream task models. However, datasets often contain noisy data inadvertently included during the construction process. Numerous attempts have been made to correct this issue through human annotators. However, hiring and managing human annotators is expensive and time-consuming. As an alternative, recent studies are exploring the use of large language models (LLMs) for data annotation.In this study, we present a case study that extends the application of LLM-based data annotation to enhance the quality of existing datasets through a cleansing strategy. Specifically, we leverage approaches such as chain-of-thought and majority voting to imitate human annotation and classify unrelated documents from the Multi-News dataset, which is widely used for the multi-document summarization task. Through our proposed cleansing method, we introduce an enhanced Multi-News+. By employing LLMs for data cl