# In [None]:
##
# OpenAlex API Query + dataset creation
#

import pandas as pd
import requests
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def save_papers_to_csv(papers, filename):
    """Write the collected paper rows to ``filename`` as a CSV file.

    Args:
        papers: Rows to write; each row supplies the values for the
            Title, Abstract and DOI columns, in that order.
        filename: Destination path handed to ``DataFrame.to_csv``.
    """
    column_names = ["Title", "Abstract", "DOI"]
    # The DataFrame index is left out so the file holds only the three columns.
    pd.DataFrame(papers, columns=column_names).to_csv(filename, index=False)

def decode_inverted_index(inverted_index):
    """Rebuild plain text from an OpenAlex-style inverted index.

    Args:
        inverted_index (dict): Maps each word to the list of positions at
            which it occurs in the text.

    Returns:
        str: The words joined by single spaces in ascending position order,
        or an empty string when the index is empty or missing.
    """
    if not inverted_index:
        return ""

    # Flatten the mapping into (position, word) pairs, one per occurrence.
    placed = [
        (index, token)
        for token, indices in inverted_index.items()
        for index in indices
    ]
    # Tuples sort by position first, then by word for equal positions.
    placed.sort()
    return " ".join(token for _, token in placed)

# Retry policy for transient failures: rate limiting (429) and server-side
# errors (5xx). At most 5 retries, with a backoff that grows from the
# 30-second factor.
_RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
retry_strategy = Retry(
    total=5,
    backoff_factor=30,
    status_forcelist=_RETRY_STATUS_CODES,
)

# Every https:// request sent through this session uses the retry policy.
_https_adapter = HTTPAdapter(max_retries=retry_strategy)
session = requests.Session()
session.mount("https://", _https_adapter)

apiUrl = "https://api.openalex.org/works"

# The search expression asks for one subject term AND one method term.
_subject_terms = [
    "delinquency",
    "delinquent",
    "criminal",
    "offender",
    "violence",
    "aggression",
]
_method_terms = [
    '"latent growth mixture"',
    '"latent class growth"',
    "LCGA",
    "LGMM",
]
_search_query = "({}) AND ({})".format(
    " OR ".join(_subject_terms),
    " OR ".join(_method_terms),
)

params = {
    "mailto": "k.p.dieleman@students.uu.nl",
    "search": _search_query,
    "select": "title,abstract_inverted_index,doi",
    "per-page": 200,
    "page": 1,
    "filter": "publication_year:>2014",
}

# Seconds to wait on connect/read before a request is abandoned. Without an
# explicit timeout, requests can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 60
# Failed attempts in a row (for the same page) tolerated before giving up, so
# a persistent failure cannot keep the loop spinning forever.
MAX_CONSECUTIVE_FAILURES = 5

papers = []            # [title, abstract, doi] rows for works that have an abstract
page = 1               # page number requested next
total_results = None   # hit count reported by the API; filled in on the first page
consecutive_failures = 0

# Fetch the result pages one by one until the API returns an empty page, the
# reported hit count has been covered, or the requests keep failing.
while True:
    params['page'] = page
    try:
        # Only the calls that can raise RequestException live in the try body.
        response = session.get(apiUrl, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error occurred: {e}")
        # Not every RequestException carries a response (e.g. connection
        # errors), so the status code may be unavailable.
        failed_response = getattr(e, "response", None)
        status = getattr(failed_response, "status_code", None)

        # A client error other than 429 will not fix itself on retry.
        # NOTE(review): OpenAlex basic paging is believed to reject pages past
        # the first 10,000 results; cursor paging would be needed beyond that.
        if status is not None and 400 <= status < 500 and status != 429:
            print(f"Request rejected with HTTP {status}; stopping")
            break

        consecutive_failures += 1
        if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
            print(f"Giving up after {consecutive_failures} consecutive failed attempts")
            break

        # Prefer the real status code; fall back to the message text for
        # errors raised without a response (e.g. exhausted adapter retries).
        if status == 429 or (status is None and "429" in str(e)):
            print("Rate limit hit, waiting longer...")
            time.sleep(120)
        else:
            time.sleep(60)
        continue

    consecutive_failures = 0

    if total_results is None:
        total_results = data.get("meta", {}).get("count", 0)
        print(f"Total available results: {total_results}")

    if not data.get("results"):
        print("No more results available")
        break

    for paper in data["results"]:
        abstract_index = paper.get("abstract_inverted_index")
        abstract = decode_inverted_index(abstract_index) if abstract_index else ""
        # Works without an abstract are skipped.
        if abstract:
            papers.append([paper.get("title"), abstract, paper.get("doi")])

    # Save intermediate results after each batch so progress survives a crash.
    save_papers_to_csv(papers, "openalex_results.csv")

    print(f"Fetched page {page} ({len(papers)} total papers so far)")

    if page * params['per-page'] >= total_results:
        print("Reached end of results")
        break

    page += 1
    # Short pause between pages to stay polite towards the API.
    time.sleep(3)