In [24]:
import pandas as pd
from tqdm import tqdm
import re

# Function to parse .bib entries
def parse_bib_entry(entry):
    entry_dict = {}
    lines = entry.strip().split('\n')
    if lines and '{' in lines[0]:
        entry_dict['type'] = lines[0].split('{')[0].strip('@')
        key = lines[0].split('{')[1].strip(',').strip('}')
        entry_dict['key'] = key
        for line in lines[1:]:
            if '=' in line:
                k, v = line.split('=', 1)
                entry_dict[k.strip()] = v.strip().strip('{},')
    return entry_dict

# Function to convert page range to number of pages using regex
def page_count(pages):
    match = re.search(r'(\d+)\s*--\s*(\d+)', pages)
    if match:
        try:
            start = int(match.group(1))
            end = int(match.group(2))
            return end - start + 1
        except ValueError:
            return 0
    return 0

# Function to find year using regex from URL
def find_year_from_url(url):
    match = re.search(r'https://aclanthology.org/(\d{4})', url)
    if match:
        return match.group(1)
    return ''

# Read the .bib file
file_path = './anthology+abstracts.bib'
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# Split content into individual entries
entries = content.split('@')[1:]
data = [parse_bib_entry('@' + entry) for entry in entries if parse_bib_entry('@' + entry)]
df = pd.DataFrame(data)

# Fill NaN values with empty strings
df.fillna('', inplace=True)

# Define the search terms
#keywords = ["software engineering", "programming", "software development", "computer science", "computer engineering"]
education_terms = ["education", "teaching"]
model_terms = ["LLM", "large language model"]

# Function to check if an entry matches the search query
def matches_query(entry, keywords, education_terms, model_terms):
    title = entry.get('title', '').lower()
    abstract = entry.get('abstract', '').lower()
    #keywords_match = any(kw in title or kw in abstract for kw in keywords)
    education_match = any(term in title or term in abstract for term in education_terms)
    model_match = any(term in title or term in abstract for term in model_terms)
    return education_match and model_match

# Search for matching entries
matching_entries = []
for index, entry in tqdm(df.iterrows(), total=df.shape[0]):
    if matches_query(entry, keywords, education_terms, model_terms):
        matching_entries.append(entry)
    # Log some entries for manual check
    if index < 10:
        print(f"Title: {entry.get('title', '')}")
        print(f"Abstract: {entry.get('abstract', '')}")
        print(f"Matches query: {matches_query(entry, keywords, education_terms, model_terms)}")
        print(f"Pages: {entry.get('pages', '')} -> Page Count: {page_count(entry.get('pages', ''))}")
        print(f"Year from URL: {find_year_from_url(entry.get('url', ''))}")
        print("------")

# Extract required information
final_data = []
for entry in matching_entries:
    page_count_value = page_count(entry.get('pages', ''))
    if page_count_value == 0 and '--' in entry.get('pages', ''):
        print(f"Error in page count conversion for entry: {entry.get('pages', '')}")
    final_data.append({
        'title': entry.get('title', ''),
        'url': entry.get('url', ''),
        'doi': entry.get('doi', '') or entry.get('URL', ''),  # Attempt to extract DOI or URL
        'abstract': entry.get('abstract', ''),
        '# pages': page_count_value,
        'paper_type': entry.get('type', ''),
        'year': find_year_from_url(entry.get('url', '')),
        'bibtex': '@' + entry.get('type', '') + '{' + entry.get('key', '') + ',\n' + '\n'.join([f'    {k} = {{{v}}},' for k, v in entry.items() if k not in ['type', 'key']]) + '\n}'
    })

# Create a DataFrame from the final data
final_df = pd.DataFrame(final_data)

# Save the final results to a CSV file
csv_file_path = './ACL.csv'
final_df.to_csv(csv_file_path, index=False)

# Print the number of final papers
print(f"Number of final papers: {len(final_df)}")

# Print the first few matching entries
if len(final_df) > 0:
    print(final_df.head())


 10%|█         | 10062/98045 [00:00<00:02, 34246.83it/s]

Title: "Proceedings of the 8th Workshop on Online Abuse and Harms (WOAH 2024)"
Abstract: 
Matches query: False
Pages:  -> Page Count: 0
Year from URL: 2024
------
Title: "Investigating radicalisation indicators in online extremist communities"
Abstract: "We identify and analyse three sociolinguistic indicators of radicalisation within online extremist forums: hostility, longevity and social connectivity. We develop models to predict the maximum degree of each indicator measured over an individual{'}s lifetime, based on a minimal number of initial interactions. Drawing on data from two diverse extremist communities, our results demonstrate that NLP methods are effective at prioritising at-risk users. This work offers practical insights for intervention strategies and policy development, and highlights an important but under-studied research direction."
Matches query: False
Pages: "1--12" -> Page Count: 12
Year from URL: 2024
------
Title: "Detection of Conspiracy Theories Beyond Keyword

100%|██████████| 98045/98045 [00:01<00:00, 49706.96it/s]

Number of final papers: 79
                                               title  \
0  "Calibration-Tuning: Teaching Large Language M...   
1  "Cross-Task Defense: Instruction-Tuning {LLM}s...   
2  "{B}ad{R}ock at {S}em{E}val-2024 Task 8: {D}is...   
3  "Team Innovative at {S}em{E}val-2024 Task 8: M...   
4  "Mast Kalandar at {S}em{E}val-2024 Task 8: On ...   

                                               url doi  \
0  "https://aclanthology.org/2024.uncertainlp-1.1"       
1     "https://aclanthology.org/2024.trustnlp-1.9"       
2     "https://aclanthology.org/2024.semeval-1.37"       
3    "https://aclanthology.org/2024.semeval-1.171"       
4    "https://aclanthology.org/2024.semeval-1.231"       

                                            abstract  # pages     paper_type  \
0  "Large language models are increasingly deploy...       14  inproceedings   
1  "Recent studies reveal that Large Language Mod...        9  inproceedings   
2  "The rise of Large Language Models (LLMs) ha


