In [1]:
import re
from collections import OrderedDict
import pandas as pd

## Extract citations

In [2]:
# Markdown source files whose text is scanned for in-text citations.
md_files = ['secs/01-introduction.md', 'secs/02-taxonomy.md']

In [3]:
# Read every markdown file and concatenate the contents into one string.
# A newline is appended after each file so that the last line of one file
# and the first line of the next do not run together.
file_texts = []
for file in md_files:
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding (assumes the markdown sources are UTF-8 -- TODO confirm).
    with open(file, 'r', encoding='utf-8') as f:
        file_text = f.read()

    file_texts.append(file_text + "\n")

combined_text = "".join(file_texts)

In [4]:
# Regular expressions for author-year citations:
#   1. a parenthesised span that starts with a capital letter and contains
#      a four-digit number, e.g. "(Lipset and Rokkan 1967)"
#   2. a capitalised name, optionally followed by "and/& Name" or "et al.",
#      and then a year in parentheses, e.g. "Kriesi et al. (2008)"
patterns = [
    r'\([A-Z][^\)]*?\d{4}[^\)]*?\)',
    r'[A-Z][a-z]+(?:\s+(?:and|&)\s+[A-Z][a-z]+|\s+et al\.?)?\s+\(\d{4}\)',
]

# All matches of the first pattern, followed by all matches of the second.
raw_citations = [
    found
    for citation_pattern in patterns
    for found in re.findall(citation_pattern, combined_text)
]

raw_citations[:3]

['(Licht and Sczepanski 2025; Farahani et al. 2024; )',
 '(Lipset and Rokkan 1967)',
 '(Dalton 1984, 2008; Kriesi et al. 2008, 2012; Piketty et al. 2018, 2021; Bornschier et al. 2024)']

In [5]:
# Map every citation string found in the text to the list of single-work
# citations ("Authors (YYYY)") it stands for. Parenthetical groups are split
# on semicolons and expanded to one item per year; inline citations map to
# themselves.
citations_map = {}
for citation in raw_citations:
    citation = citation.strip()  # drop surrounding whitespace, if any

    # Check if it's a parenthetical citation
    if citation.startswith('(') and citation.endswith(')'):
        # Remove outer parentheses
        inner = citation[1:-1]

        # Split by semicolon
        for part in inner.split(';'):
            part = part.strip()
            if not part:
                # e.g. the empty piece after a trailing "; " before ")"
                continue

            # Raw strings: "\d" and "\s" are not valid escapes in ordinary
            # string literals and trigger a SyntaxWarning on recent Pythons.
            years = re.findall(r'\d{4}', part)
            # Everything before the first whitespace-separated year is
            # treated as the author part.
            authors = re.split(r"\s+(?=\d{4})", part)[0].strip()
            citations_map[part] = [f"{authors} ({y})" for y in years]
    else:
        # It's an inline citation - keep as-is
        citations_map[citation] = [citation]

In [6]:
# Flatten all parsed citations, harmonise their spelling (" & " -> " and ",
# ", (" -> " ("), and drop duplicates while keeping first-seen order.
normalized_citations = (
    parsed.replace(' & ', ' and ').replace(', (', ' (')
    for parsed_group in citations_map.values()
    for parsed in parsed_group
)
unique_citations = list(OrderedDict.fromkeys(normalized_citations))
unique_citations[:10]

['Licht and Sczepanski (2025)',
 'Farahani et al. (2024)',
 'Lipset and Rokkan (1967)',
 'Dalton (1984)',
 'Dalton (2008)',
 'Kriesi et al. (2008)',
 'Kriesi et al. (2012)',
 'Piketty et al. (2018)',
 'Piketty et al. (2021)',
 'Bornschier et al. (2024)']

## Parse bib file

In [7]:
# Read the bib file
# Decode explicitly as UTF-8 instead of relying on the platform's locale
# encoding (assumes references.bib is UTF-8 encoded -- TODO confirm).
with open('references.bib', 'r', encoding='utf-8') as f:
    bib_content = f.read()

# Extract (cite key, author field, date field) from the entries of the bib
# file. The pattern expects the author field to precede the date field and
# both values to be wrapped in braces.
bib_pattern = r'@\w+\{([^,]+),.*?author\s*=\s*\{([^}]+)\}.*?\bdate\s*=\s*\{([^}]+)\}'

# Apply the pattern to one entry at a time. With re.DOTALL the lazy ".*?"
# can otherwise run past the end of an entry that lacks an author or date
# field and pair its cite key with the fields of a later entry.
entry_starts = [m.start() for m in re.finditer(r'@\w+\{', bib_content)]
entry_ends = entry_starts[1:] + [len(bib_content)]

bib_entries = []
for entry_start, entry_end in zip(entry_starts, entry_ends):
    entry_match = re.match(bib_pattern, bib_content[entry_start:entry_end], re.DOTALL)
    if entry_match:
        # Same (key, author, date) tuple shape that re.findall would return.
        bib_entries.append(entry_match.groups())

In [8]:
# Create a dictionary mapping (first author's last name, year) to bib keys.
# The last name is lower-cased; only the FIRST author of each entry is
# indexed. Several entries can share a pair, so every value is a list.
bib_dict = {}
for key, author, date in bib_entries:
    # Extract year from date; entries without a four-digit year are skipped
    year_match = re.search(r'\d{4}', date)
    if not year_match:
        continue
    year = year_match.group()

    # Authors are separated by "and". Split on any surrounding whitespace so
    # that author lists wrapped across lines are handled as well.
    authors_list = re.split(r'\s+and\s+', author)
    first_auth = authors_list[0]

    # Handle "LastName, FirstName" and "FirstName LastName" formats
    if ',' in first_auth:
        last_name = first_auth.split(',')[0].strip()
    else:
        name_tokens = first_auth.split()
        last_name = name_tokens[-1] if name_tokens else ''

    if not last_name:
        # Nothing usable to index this entry under.
        continue

    bib_dict.setdefault((last_name.lower(), year), []).append(key)

In [9]:
# Spot check: list the (last name, year) keys whose last name contains "mulder".
[(last_name, year) for (last_name, year) in bib_dict if 'mulder' in last_name]

[('de mulder', '2025')]

## Map in-text citations to bib entries

In [10]:
def parse_first_author_year(citation):
    """Extract the first author and the year from a citation string.

    Both narrative citations such as "Kriesi et al. (2008)" and
    parenthetical ones such as "(Kriesi et al. 2008)" are accepted.

    Parameters
    ----------
    citation : str
        A single citation.

    Returns
    -------
    tuple
        ``(author, year)``. ``author`` is the lower-cased leading run of up
        to three capitalised words (separated by a space or hyphen), or
        ``None`` if the string does not start with a capitalised word.
        ``year`` is the four-digit year at the end of the string, or
        ``None`` if there is none.
    """
    # Remove outer parentheses if present
    citation = citation.strip()
    if citation.startswith('(') and citation.endswith(')'):
        citation = citation[1:-1].strip()

    # Leading capitalised words; stops at the first lower-case word such as
    # "and" or "et", so only the first author's name is captured.
    apat = r'^(([A-Z][a-z\u00C0-\u017F]+[ -]?){1,3})'
    author = re.search(apat, citation)

    # Trailing year, with or without its own parentheses (they are gone when
    # the outer parentheses were stripped above) and with an optional
    # disambiguation letter ("2018a"). The look-behind rejects the tail of a
    # longer digit run.
    ypat = r'(?<!\d)\(?(\d{4})[a-z]?\)?$'
    year = re.search(ypat, citation)
    return (
        author.group(1).strip().lower() if author else None,
        year.group(1) if year else None
    )


In [11]:
# example: resolve the first extracted citation to its bib key(s), if any
cit = unique_citations[0]
print(cit)
key = parse_first_author_year(cit)
bib_dict.get(key)

Licht and Sczepanski (2025)


['licht_detecting_2025']

In [12]:
# For each unique citation, fetch the bib keys stored under its
# (first author, year) pair; citations with no such pair map to None.
citation2bib_key = {
    cit: bib_dict.get(parse_first_author_year(cit))
    for cit in unique_citations
}

In [13]:
# Build one (text, parsed citation, bib key) row per matched bib key.
# A parsed citation without any bib key yields a single row with None.
out = []

for citation, keys in citations_map.items():
    for key in keys:
        # citation2bib_key is keyed by the normalised spelling used when
        # unique_citations was built, so apply the same normalisation before
        # the lookup. Otherwise citations written with "&" or with a comma
        # before the year never find their entry.
        normalized_key = key.replace(' & ', ' and ').replace(', (', ' (')
        bib_keys = citation2bib_key.get(normalized_key) or [None]
        out.extend((citation, key, bib_key) for bib_key in bib_keys)

In [14]:
# Turn the collected rows into a table and save it as a tab-separated file
# without the index column.
mapping_columns = ['text', 'parsed', 'cite_key']
out = pd.DataFrame(out, columns=mapping_columns)
out.to_csv('citation_mapping.tsv', index=False, sep='\t')