In [1]:
import pandas as pd
import networkx as nx
import numpy as np
from pyalex import Works, Authors
import requests

In [2]:
# Location of the works export to analyse, relative to the notebook's working directory.
filename_posgraduacao_alimentos = 'data/works_posgraduacao_alimentos.csv'

# low_memory=False makes pandas parse the file in a single pass instead of in
# chunks, so each column gets one consistently inferred dtype.
df_posgraduacao_alimentos = pd.read_csv(
    filepath_or_buffer=filename_posgraduacao_alimentos,
    low_memory=False,
)

In [15]:
# Collect the distinct author IDs referenced by the works table.
# Each cell of 'authorships.author.id' holds the IDs of one work joined by '|'.
# - dropna(): a work with no author IDs is read from the CSV as NaN (a float),
#   which has no .split() and would otherwise abort the whole loop.
# - `if author_id`: drop empty segments produced by stray separators so that ''
#   never ends up in the set as a bogus ID.
authors_ids_posgraduacao_alimentos = {
    author_id
    for joined_ids in df_posgraduacao_alimentos['authorships.author.id'].dropna()
    for author_id in joined_ids.split('|')
    if author_id
}

In [16]:
# URL template for the OpenAlex authors endpoint. It has two positional
# placeholders for str.format(): the first is filled with the value of the
# `id` filter, the second with the page number to request.
original_url = (
    'https://api.openalex.org/authors'
    '?filter=id:{}'
    '&select=id,display_name,orcid,affiliations,cited_by_count,works_count'
    '&per-page=200'
    '&page={}'
)

def chunk_ids(ids_list, chunk_size=40):
    """Yield consecutive slices of ``ids_list`` holding at most ``chunk_size`` items.

    Args:
        ids_list: A sliceable sequence with a length (e.g. a list of IDs).
        chunk_size: Maximum number of items per yielded slice.

    Yields:
        Slices of ``ids_list`` in their original order; the final slice may be
        shorter than ``chunk_size``.
    """
    slice_starts = range(0, len(ids_list), chunk_size)
    for start in slice_starts:
        stop = start + chunk_size
        yield ids_list[start:stop]

# Seconds to wait for the API before giving up on a request; without a timeout
# a stalled connection would hang the cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Paging stops once the next page would start beyond this many results
# (the same 10k cut-off the loop has always applied).
MAX_PAGED_RESULTS = 10000


def _latest_affiliation(author):
    """Return ``(institution, year)`` for the author's most recent affiliation.

    Args:
        author: One author record from the API response; its 'affiliations'
            entry is expected to be a list of dicts with 'years' and
            'institution' keys.

    Returns:
        A tuple ``(institution, year)``. ``institution`` is the institution dict
        of the affiliation whose 'years' list contains the largest year, and
        ``year`` is that largest year. When the author has no affiliation with
        at least one year, returns ``({}, None)``.
    """
    # Affiliations with an empty 'years' list cannot be ranked, and max() on an
    # empty list raises ValueError, so they are skipped.
    dated_affiliations = [
        affiliation
        for affiliation in (author.get('affiliations') or [])
        if affiliation.get('years')
    ]
    if not dated_affiliations:
        return {}, None

    latest = max(dated_affiliations, key=lambda affiliation: max(affiliation['years']))
    return latest.get('institution') or {}, max(latest['years'])


def _fetch_authors_page(session, ids_filter, page):
    """Request one page of authors matching ``ids_filter`` and return the parsed JSON.

    Raises:
        requests.HTTPError: If the API answers with an error status, so the
            failure is reported as such instead of as a KeyError on 'results'.
    """
    response = session.get(
        original_url.format(ids_filter, page),
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    response.raise_for_status()
    return response.json()


authors_data_posgraduacao_alimentos = []

# A single Session reuses the underlying connection across all requests.
with requests.Session() as session:
    # sorted() gives a stable chunking and therefore a reproducible row order;
    # iterating the raw set yields a different order on every kernel start.
    for ids_chunk in chunk_ids(sorted(authors_ids_posgraduacao_alimentos)):
        ids_filter = '|'.join(ids_chunk)
        page = 1

        while True:
            page_with_results = _fetch_authors_page(session, ids_filter, page)
            results = page_with_results['results']

            for author in results:
                institution, recent_year = _latest_affiliation(author)
                authors_data_posgraduacao_alimentos.append({
                    'id': author['id'],
                    'display_name': author['display_name'],
                    'orcid': author.get('orcid'),
                    'institution': institution.get('display_name'),
                    'country_code': institution.get('country_code'),
                    'institution_type': institution.get('type'),
                    'affiliation_year': recent_year,
                    'cited_by_count': author['cited_by_count'],
                    'works_count': author['works_count']
                })

            page += 1
            per_page = page_with_results['meta']['per_page']
            # A short page means this was the last one; also stop before
            # requesting a page past the paging cut-off.
            if len(results) < per_page or per_page * page > MAX_PAGED_RESULTS:
                break

df_authors_posgraduacao_alimentos = pd.DataFrame(authors_data_posgraduacao_alimentos)

In [17]:
# Persist the collected author records; index=False leaves the DataFrame's
# positional index out of the file.
df_authors_posgraduacao_alimentos.to_csv(
    path_or_buf='data/authors_posgraduacao_alimentos.csv',
    index=False,
)