# In [None]:
import requests
import pandas as pd
import time


def get_field_id(field_name):
    """
    Query the OpenAlex API for the concept ID matching a given field name.
    The returned ID is used to filter papers by field.

    The field name is sent as a query parameter (not spliced into the URL),
    so names containing characters such as '&', '#' or '+' are encoded
    correctly instead of corrupting the query string.

    Args:
        field_name (str): Name of the research field (e.g., 'computer science').

    Returns:
        str: OpenAlex concept ID of the top search hit for the field.

    Raises:
        ValueError: If the search returns no concepts for the field name.
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    response = requests.get(
        "https://api.openalex.org/concepts",
        params={"search": field_name, "per-page": 1},
        timeout=30,  # never hang forever on a stalled connection
    )
    response.raise_for_status()
    results = response.json()['results']
    if not results:
        raise ValueError(f"Field '{field_name}' not found.")
    # Only one result was requested, so the first hit is the best match
    return results[0]['id']

def invert_abstract_index(inverted_index):
    """
    Rebuild a plain-text abstract from an OpenAlex-style inverted index.

    Each key of the index is a word and each value is the list of positions
    at which that word occurs in the abstract.

    Args:
        inverted_index (dict): Mapping of word -> list of positions,
            e.g. {'word': [0, 5]}.

    Returns:
        str: The words joined by single spaces in ascending position order.
    """
    # Flatten to (position, word) pairs and order them by position only;
    # the sort is stable, so words sharing a position keep dict order.
    ordered_pairs = sorted(
        (
            (position, token)
            for token, occurrences in inverted_index.items()
            for position in occurrences
        ),
        key=lambda pair: pair[0],
    )
    return " ".join(token for _, token in ordered_pairs)

def fetch_papers_up_to_year(field_id, max_year, min_year=1800, max_results=1000):
    """
    Fetch papers from the OpenAlex API filtered by field ID, walking publication
    years from max_year down to min_year and stopping once max_results papers
    have been collected.

    Uses cursor-based pagination and requests up to 200 papers per page.
    Every request after the first is preceded by a one-second pause, including
    the first request of each new year, so a long run of years does not hit
    the API back-to-back.

    Args:
        field_id (str): OpenAlex concept ID for filtering.
        max_year (int): Latest publication year to include.
        min_year (int): Earliest publication year to include (default 1800).
        max_results (int): Maximum number of papers to fetch (default 1000).

    Returns:
        list: List of paper metadata dicts, at most max_results long.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    url = "https://api.openalex.org/works"
    papers = []
    is_first_request = True

    # Loop over years descending: newest to oldest
    for year in range(max_year, min_year - 1, -1):
        if len(papers) >= max_results:
            break

        print(f"Fetching papers for year {year}... {len(papers)}/{max_results}")
        # Initial params: filter by field and year, 200 results per page, start cursor
        params = {
            "filter": f"concepts.id:{field_id},publication_year:{year}",
            "per-page": 200,
            "cursor": "*"
        }

        while len(papers) < max_results:
            # Rate limiting pause before every request except the very first
            if not is_first_request:
                time.sleep(1)
            is_first_request = False

            r = requests.get(url, params=params, timeout=30)
            r.raise_for_status()
            data = r.json()
            results = data['results']
            if not results:
                # No more results for this year
                break

            papers.extend(results)

            # Pagination: get the next cursor to fetch next page
            # ('meta' may be present but null, hence the `or {}`)
            next_cursor = (data.get('meta') or {}).get('next_cursor')
            if not next_cursor:
                # No more pages
                break

            params["cursor"] = next_cursor

    # The last page may overshoot the limit, so trim to exactly max_results
    return papers[:max_results]

def extract_institution_and_country(authorship):
    """
    Return the name and country code of the first institution listed in an
    authorship record.

    Args:
        authorship (dict): Single authorship entry from OpenAlex paper data.

    Returns:
        tuple: (institution_name, country_code); each is a str or None.
            Both are None when the record has no institutions.
    """
    affiliations = authorship.get('institutions')
    if not affiliations:
        # Key missing, or present but empty: nothing to report
        return None, None

    # Only the first listed institution is considered
    primary = affiliations[0]
    return primary.get('display_name'), primary.get('country_code')

def save_papers_to_csv(papers, filename):
    """
    Process raw paper data, drop papers without an abstract, a bibliography, or
    any author country, reconstruct abstracts from the inverted index when one
    is present, and save the remaining papers to CSV.

    Nested fields that are present but null in the input ('primary_location',
    'source', 'open_access', 'author', 'concepts') are treated as empty rather
    than raising. The CSV always has a header row, even when every paper is
    filtered out, so the output can always be read back.

    Args:
        papers (list): List of paper metadata dicts.
        filename (str): Output CSV filename.
    """
    # Fixed column order; also guarantees a header when no record survives
    columns = [
        "id",
        "title",
        "doi",
        "publication_year",
        "cited_by_count",
        "reference_count",
        "referenced_works",
        "abstract",
        "authors",
        "institutions",
        "countries",
        "journal",
        "journal_issn",
        "is_open_access",
        "is_retracted",
        "concepts",
    ]

    records = []
    for paper in papers:
        # Reconstruct abstract text if inverted index present, else use plain abstract field
        abstract_inverted = paper.get('abstract_inverted_index')
        abstract = invert_abstract_index(abstract_inverted) if abstract_inverted else paper.get('abstract')

        if not abstract or not abstract.strip():
            # Skip papers without abstract text
            continue

        # Extract bibliography references (list of paper IDs)
        referenced_works = paper.get('referenced_works') or []
        if not referenced_works:
            # Skip papers without bibliography
            continue

        # Extract authorship info: names, institutions, countries
        authors_list = []
        institutions_list = []
        countries_list = []
        for authorship in paper.get('authorships') or []:
            # 'author' may be missing or null; both yield a None name
            author = authorship.get('author') or {}
            institution, country = extract_institution_and_country(authorship)
            authors_list.append(author.get('display_name'))
            institutions_list.append(institution)
            countries_list.append(country)

        # Filter out papers where all authors have no country info
        # (this also drops papers with no authorships at all)
        if all(c is None for c in countries_list):
            continue

        # Extract concept names (fields of study), skipping unnamed concepts
        concepts = [
            concept.get('display_name')
            for concept in paper.get('concepts') or []
            if concept.get('display_name')
        ]

        # Journal info lives under primary_location.source; either level can be
        # null, so fall back to an empty dict instead of calling .get on None
        primary_location = paper.get('primary_location') or {}
        source = primary_location.get('source') or {}
        open_access = paper.get('open_access') or {}

        # Append processed record to list
        records.append({
            "id": paper.get('id'),
            "title": paper.get('title'),
            "doi": paper.get('doi'),
            "publication_year": paper.get('publication_year'),
            "cited_by_count": paper.get('cited_by_count'),
            "reference_count": paper.get('referenced_works_count'),
            "referenced_works": "; ".join(referenced_works),
            "abstract": abstract,
            "authors": "; ".join([a for a in authors_list if a]),
            "institutions": "; ".join([i for i in institutions_list if i]),
            "countries": "; ".join([c for c in countries_list if c]),
            "journal": source.get('display_name'),
            "journal_issn": source.get('issn_l'),
            "is_open_access": open_access.get('is_oa'),
            "is_retracted": paper.get('is_retracted'),
            "concepts": "; ".join(concepts)
        })

    # Convert list of dicts to pandas DataFrame with the fixed column order
    df = pd.DataFrame(records, columns=columns)
    # Save DataFrame to CSV without index
    df.to_csv(filename, index=False)
    print(f"Saved {len(df)} papers with non-empty abstracts, bibliographies, and valid country data to {filename}")

if __name__ == "__main__":
    # Run configuration: research field, inclusive year window, and fetch cap.
    field_name = "computer science"
    max_year = 2025
    min_year = 2025  # equal to max_year here, so a single year is fetched
    max_results = 10000

    # Output name is derived from the field name and the upper year bound.
    filename = f"{field_name.replace(' ', '_')}_up_to_{max_year}_with_bibliography.csv"

    try:
        # Step 1: resolve the field name to its OpenAlex concept ID.
        field_id = get_field_id(field_name)
        print(f"Field '{field_name}' found with ID: {field_id}")

        # Step 2: download raw paper metadata for the configured year window.
        papers = fetch_papers_up_to_year(
            field_id,
            max_year=max_year,
            min_year=min_year,
            max_results=max_results,
        )
        print(f"Fetched {len(papers)} papers (before filtering).")

        # Step 3: filter, flatten, and write the papers to the CSV file.
        save_papers_to_csv(papers, filename)

    except Exception as e:
        # Any failure in the pipeline is reported as a one-line message.
        print(f"Error: {e}")