In [1]:
!pip install pyalex



In [2]:
from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
import pandas as pd
import json

# Example of PyAlex data

## Example paper or "work"

In [3]:
# Look up a single work by its OpenAlex ID, then show the record
# serialized as one JSON string.
works_api = Works()
example_paper = works_api["w4391166329"]
json.dumps(example_paper)

'{"id": "https://openalex.org/W4391166329", "doi": "https://doi.org/10.1371/journal.pone.0296013", "title": "Projected health and economic effects of the increase in childhood obesity during the COVID-19 pandemic in England: The potential cost of inaction", "display_name": "Projected health and economic effects of the increase in childhood obesity during the COVID-19 pandemic in England: The potential cost of inaction", "publication_year": 2024, "publication_date": "2024-01-24", "ids": {"openalex": "https://openalex.org/W4391166329", "doi": "https://doi.org/10.1371/journal.pone.0296013", "pmid": "https://pubmed.ncbi.nlm.nih.gov/38265978"}, "language": "en", "primary_location": {"is_oa": true, "landing_page_url": "https://doi.org/10.1371/journal.pone.0296013", "pdf_url": "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0296013&type=printable", "source": {"id": "https://openalex.org/S202381698", "display_name": "PLOS ONE", "issn_l": "1932-6203", "issn": ["1932-6203

In [4]:
# Read the "title" field of the example work record.
example_paper["title"]

'Projected health and economic effects of the increase in childhood obesity during the COVID-19 pandemic in England: The potential cost of inaction'

In [5]:
# Read the "authorships" field of the example work record
# (a list with one entry per author).
example_paper["authorships"]

[{'author_position': 'first',
  'author': {'id': 'https://openalex.org/A5087116585',
   'display_name': 'Ivan Ochoa‐Moreno',
   'orcid': 'https://orcid.org/0000-0002-7843-9908'},
  'institutions': [{'id': 'https://openalex.org/I43439940',
    'display_name': 'University of Southampton',
    'ror': 'https://ror.org/01ryk1543',
    'country_code': 'GB',
    'type': 'education',
    'lineage': ['https://openalex.org/I43439940']},
   {'id': 'https://openalex.org/I52099693',
    'display_name': 'University of York',
    'ror': 'https://ror.org/04m01e293',
    'country_code': 'GB',
    'type': 'education',
    'lineage': ['https://openalex.org/I52099693']}],
  'countries': ['GB'],
  'is_corresponding': False,
  'raw_author_name': 'Iván Ochoa-Moreno',
  'raw_affiliation_string': 'Centre for Health Economics, University of York, Heslington, United Kingdom; School of Human Development and Health, University of Southampton, Southampton, United Kingdom',
  'raw_affiliation_strings': ['Centre for 

## Example journal or "source"

In [6]:
# Look up a single source (journal) by its OpenAlex ID, then show the
# record serialized as one JSON string.
sources_api = Sources()
plos_one = sources_api['s202381698']
json.dumps(plos_one)

'{"id": "https://openalex.org/S202381698", "issn_l": "1932-6203", "issn": ["1932-6203"], "display_name": "PLOS ONE", "host_organization": "https://openalex.org/P4310315706", "host_organization_name": "Public Library of Science", "host_organization_lineage": ["https://openalex.org/P4310315706"], "works_count": 281207, "cited_by_count": 8599105, "summary_stats": {"2yr_mean_citedness": 3.168125227040058, "h_index": 437, "i10_index": 166654}, "is_oa": true, "is_in_doaj": true, "ids": {"openalex": "https://openalex.org/S202381698", "issn_l": "1932-6203", "issn": ["1932-6203"], "mag": "202381698", "wikidata": "https://www.wikidata.org/entity/Q564954", "fatcat": "https://fatcat.wiki/container/s3gm7274mfe6fcs7e3jterqlri"}, "homepage_url": "http://www.plosone.org/", "apc_prices": [{"price": 1805, "currency": "USD"}], "apc_usd": 1805, "country_code": "US", "societies": [], "alternate_titles": ["PLoS ONE", "Public Library of Science one", "PLoS 1"], "abbreviated_title": null, "type": "journal", "

## Example of filtering works and getting pages

In [7]:
# Filter criteria: works whose primary location is source s202381698
# and whose publication year matches ">2023".
plos_one_recent_filter = {
    "primary_location": {"source": {"id" : "s202381698"}},
    "publication_year": ">2023",
}

# Ask only for the number of matching works, not the records themselves.
total_count = Works().filter(**plos_one_recent_filter).count()

print(total_count)

3302


In [8]:
# Build the filtered query first, then fetch a single page of results from it.
plos_one_recent_query = Works().filter(
    primary_location={"source": {"id" : "s202381698"}},
    publication_year=">2023"
)
example_page = plos_one_recent_query.get()

In [9]:
# Show the first work record of the fetched page.
example_page[0]

{'id': 'https://openalex.org/W4390496174',
 'doi': 'https://doi.org/10.1371/journal.pone.0287888',
 'title': 'Living on the edge: Crayfish as drivers to anoxification of their own shelter microenvironment',
 'display_name': 'Living on the edge: Crayfish as drivers to anoxification of their own shelter microenvironment',
 'publication_year': 2024,
 'publication_date': '2024-01-02',
 'ids': {'openalex': 'https://openalex.org/W4390496174',
  'doi': 'https://doi.org/10.1371/journal.pone.0287888',
  'pmid': 'https://pubmed.ncbi.nlm.nih.gov/38165988'},
 'language': 'en',
 'primary_location': {'is_oa': True,
  'landing_page_url': 'https://doi.org/10.1371/journal.pone.0287888',
  'pdf_url': 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0287888&type=printable',
  'source': {'id': 'https://openalex.org/S202381698',
   'display_name': 'PLOS ONE',
   'issn_l': '1932-6203',
   'issn': ['1932-6203'],
   'is_oa': True,
   'is_in_doaj': True,
   'host_organization': 'https://

### Example of processing page into dataframe

In [10]:
# Fields of each work record to keep as DataFrame columns.
page_columns = [
    'id',
    'title',
    'authorships',
    'publication_year',
    'countries_distinct_count',
    'institutions_distinct_count',
    'referenced_works_count',
    'topics',
    'grants',
]

# One row per work on the page, restricted to the selected columns.
pd.DataFrame(example_page, columns=page_columns)

Unnamed: 0,id,title,authorships,publication_year,countries_distinct_count,institutions_distinct_count,referenced_works_count,topics,grants
0,https://openalex.org/W4390496174,Living on the edge: Crayfish as drivers to ano...,"[{'author_position': 'first', 'author': {'id':...",2024,6,7,80,"[{'id': 'https://openalex.org/T11004', 'displa...",[{'funder': 'https://openalex.org/F4320323983'...
1,https://openalex.org/W4390605611,Genome-wide identification and comprehensive a...,"[{'author_position': 'first', 'author': {'id':...",2024,1,3,84,"[{'id': 'https://openalex.org/T11268', 'displa...",[]
2,https://openalex.org/W4390607113,Global adoption of 6-month drug-resistant TB r...,"[{'author_position': 'first', 'author': {'id':...",2024,14,10,6,"[{'id': 'https://openalex.org/T10038', 'displa...",[]
3,https://openalex.org/W4390693004,Novel ensemble learning approach with SVM-impu...,"[{'author_position': 'first', 'author': {'id':...",2024,1,1,52,"[{'id': 'https://openalex.org/T10146', 'displa...",[]
4,https://openalex.org/W4391166329,Projected health and economic effects of the i...,"[{'author_position': 'first', 'author': {'id':...",2024,2,7,28,"[{'id': 'https://openalex.org/T10010', 'displa...",[{'funder': 'https://openalex.org/F4320319990'...
5,https://openalex.org/W4390495602,Role of Influenza A virus protein NS1 in regul...,"[{'author_position': 'first', 'author': {'id':...",2024,2,4,43,"[{'id': 'https://openalex.org/T10167', 'displa...",[]
6,https://openalex.org/W4390495709,Feasibility and metabolic outcomes of a well-f...,"[{'author_position': 'first', 'author': {'id':...",2024,2,5,59,"[{'id': 'https://openalex.org/T12267', 'displa...",[{'funder': 'https://openalex.org/F4320313420'...
7,https://openalex.org/W4390496059,Dynamic asymmetric spillovers and connectednes...,"[{'author_position': 'first', 'author': {'id':...",2024,1,1,54,"[{'id': 'https://openalex.org/T11059', 'displa...",[]
8,https://openalex.org/W4390496092,Comorbidities in heart failure patients that p...,"[{'author_position': 'first', 'author': {'id':...",2024,1,3,31,"[{'id': 'https://openalex.org/T10198', 'displa...",[]
9,https://openalex.org/W4390496121,Characterization of a novel bacteriophage endo...,"[{'author_position': 'first', 'author': {'id':...",2024,1,2,43,"[{'id': 'https://openalex.org/T11048', 'displa...",[{'funder': 'https://openalex.org/F4320322817'...


## Using the pagination feature to get all records

In [11]:
# Same filter as before, but request the results page by page,
# 200 records per page.
pager = (
    Works()
    .filter(
        primary_location={"source": {"id" : "s202381698"}},
        publication_year=">2023",
    )
    .paginate(per_page=200)
)

In [12]:
# Fields of each work record that are kept as DataFrame columns.
specified_columns = [
    'id',
    'title',
    'publication_year',
    'countries_distinct_count',
    'institutions_distinct_count',
    'referenced_works_count',
    'is_retracted'
]

max_limit_pages = 2 # Set to -1 for all pages

# Collect one DataFrame per page and concatenate once after the loop.
# Concatenating inside the loop re-copies all previously collected rows on
# every page, and seeding the concat with an empty frame can degrade column
# dtypes (or emit a FutureWarning) depending on the pandas version.
# NOTE(review): `pager` is created in an earlier cell; if it is a stateful
# iterator, re-running only this cell may resume instead of restarting - confirm.
page_frames = []

page_count = 0
for page in pager:
    print(f"Page: {page_count}")
    page_frames.append(pd.DataFrame(page, columns=specified_columns))
    page_count += 1
    # Stop after `max_limit_pages` pages; -1 never equals the counter,
    # so every page is read in that case.
    if page_count == max_limit_pages:
        break

if page_frames:
    # Row labels are kept as they are (each page carries its own 0..n-1
    # labels), so later cells see the same index as before this change.
    output_df = pd.concat(page_frames)
else:
    # No pages at all: fall back to an empty frame with the expected columns.
    output_df = pd.DataFrame(columns=specified_columns)
    output_df['is_retracted'] = output_df['is_retracted'].astype(bool)

print(len(output_df))

Page: 0
Page: 1
400


In [13]:
# Move the per-page row labels into an "index" column and renumber the rows.
# The guard keeps this cell safe to re-run: resetting again would add a
# "level_0" column, and a further run would raise ValueError.
if "index" not in output_df.columns:
    output_df = output_df.reset_index()
output_df

Unnamed: 0,index,id,title,publication_year,countries_distinct_count,institutions_distinct_count,referenced_works_count,is_retracted
0,0,https://openalex.org/W4390496174,Living on the edge: Crayfish as drivers to ano...,2024,6,7,80,False
1,1,https://openalex.org/W4390605611,Genome-wide identification and comprehensive a...,2024,1,3,84,False
2,2,https://openalex.org/W4390607113,Global adoption of 6-month drug-resistant TB r...,2024,14,10,6,False
3,3,https://openalex.org/W4390693004,Novel ensemble learning approach with SVM-impu...,2024,1,1,52,False
4,4,https://openalex.org/W4391166329,Projected health and economic effects of the i...,2024,2,7,28,False
...,...,...,...,...,...,...,...,...
395,195,https://openalex.org/W4390692940,Factors associated with an increased risk of d...,2024,1,2,119,False
396,196,https://openalex.org/W4390693026,Reallocating time between device-measured 24-h...,2024,1,4,42,False
397,197,https://openalex.org/W4390693038,Impact of exercise training associated with en...,2024,1,4,54,False
398,198,https://openalex.org/W4390693060,Individual and environmental variables related...,2024,1,4,93,False


In [14]:
# Switch to True to write the collected records to disk.
do_save = False

# File the DataFrame is pickled to when saving is enabled.
pickle_path = 'example-openalex-data-01.pkl'

if do_save:
    output_df.to_pickle(pickle_path)
    print('Saved')