In [2]:
pip install pyalex

Collecting pyalex
  Downloading pyalex-0.13-py3-none-any.whl (10 kB)
Installing collected packages: pyalex
Successfully installed pyalex-0.13
Note: you may need to restart the kernel to use updated packages.


### OpenAlex entities
Works: Scholarly documents like journal articles, books, datasets, and theses

Authors: People who create works

Sources: Where works are hosted (such as journals, conferences, and repositories)

Institutions: Universities and other organizations to which authors claim affiliations

Topics: Topics assigned to works

Publishers: Companies and organizations that distribute works

Funders: Organizations that fund research

Geo: Where things are in the world

In [7]:
from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders

In [27]:
entities = [Works, Authors, Sources, Institutions, Concepts, Publishers, Funders]

In [4]:
import pyalex

pyalex.config.email = "krr4de@virginia.edu"

In [10]:
import pandas as pd

In [16]:
list(Works().random().keys())

['id',
 'doi',
 'title',
 'display_name',
 'publication_year',
 'publication_date',
 'ids',
 'language',
 'primary_location',
 'type',
 'type_crossref',
 'indexed_in',
 'open_access',
 'authorships',
 'countries_distinct_count',
 'institutions_distinct_count',
 'corresponding_author_ids',
 'corresponding_institution_ids',
 'apc_list',
 'apc_paid',
 'has_fulltext',
 'cited_by_count',
 'cited_by_percentile_year',
 'biblio',
 'is_retracted',
 'is_paratext',
 'primary_topic',
 'topics',
 'keywords',
 'concepts',
 'mesh',
 'locations_count',
 'locations',
 'best_oa_location',
 'sustainable_development_goals',
 'grants',
 'referenced_works_count',
 'referenced_works',
 'related_works',
 'ngrams_url',
 'abstract_inverted_index',
 'cited_by_api_url',
 'counts_by_year',
 'updated_date',
 'created_date']

In [17]:
list(Authors().random().keys())

['id',
 'orcid',
 'display_name',
 'display_name_alternatives',
 'works_count',
 'cited_by_count',
 'summary_stats',
 'ids',
 'affiliations',
 'last_known_institution',
 'last_known_institutions',
 'x_concepts',
 'counts_by_year',
 'works_api_url',
 'updated_date',
 'created_date']

In [18]:
list(Sources().random().keys())

['id',
 'issn_l',
 'issn',
 'display_name',
 'host_organization',
 'host_organization_name',
 'host_organization_lineage',
 'works_count',
 'cited_by_count',
 'summary_stats',
 'is_oa',
 'is_in_doaj',
 'ids',
 'homepage_url',
 'apc_prices',
 'apc_usd',
 'country_code',
 'societies',
 'alternate_titles',
 'abbreviated_title',
 'type',
 'x_concepts',
 'counts_by_year',
 'works_api_url',
 'updated_date',
 'created_date']

In [19]:
list(Institutions().random().keys())

['id',
 'ror',
 'display_name',
 'country_code',
 'type',
 'lineage',
 'homepage_url',
 'image_url',
 'image_thumbnail_url',
 'display_name_acronyms',
 'display_name_alternatives',
 'repositories',
 'works_count',
 'cited_by_count',
 'summary_stats',
 'ids',
 'geo',
 'international',
 'associated_institutions',
 'counts_by_year',
 'roles',
 'x_concepts',
 'works_api_url',
 'updated_date',
 'created_date']

In [20]:
list(Concepts().random().keys())
'''
These are the original OpenAlex Concepts, which are being deprecated in favor of Topics. 
We will continue to provide these Concepts for Works, but we will not be actively maintaining, updating, or providing 
support for these concepts. Unless you have a good reason to be relying on them, we encourage you to look into Topics instead.
'''

['id',
 'wikidata',
 'display_name',
 'level',
 'description',
 'works_count',
 'cited_by_count',
 'summary_stats',
 'ids',
 'image_url',
 'image_thumbnail_url',
 'international',
 'ancestors',
 'related_concepts',
 'counts_by_year',
 'works_api_url',
 'updated_date',
 'created_date']

In [21]:
list(Publishers().random().keys())

['id',
 'display_name',
 'alternate_titles',
 'hierarchy_level',
 'parent_publisher',
 'lineage',
 'country_codes',
 'homepage_url',
 'image_url',
 'image_thumbnail_url',
 'works_count',
 'cited_by_count',
 'summary_stats',
 'ids',
 'counts_by_year',
 'roles',
 'sources_api_url',
 'updated_date',
 'created_date']

In [22]:
list(Funders().random().keys())

['id',
 'display_name',
 'alternate_titles',
 'country_code',
 'description',
 'homepage_url',
 'image_url',
 'image_thumbnail_url',
 'grants_count',
 'works_count',
 'cited_by_count',
 'summary_stats',
 'ids',
 'counts_by_year',
 'roles',
 'updated_date',
 'created_date']

In [25]:
Works().random()["abstract"]

'We present random quantum circuit models for non-unitary quantum dynamics of free fermions in one spatial dimension. Numerical simulations reveal that the dynamics tends towards steady states with logarithmic violations of the entanglement area law and power law correlation functions. Moreover, starting with a short-range entangled many-body state, the dynamical evolution of entanglement and correlations quantitatively agrees with the predictions of two-dimensional conformal field theory with a space-like time direction. We argue that this behavior is generic in non-unitary free quantum dynamics with time-dependent randomness, and show that the emergent conformal dynamics of two-point functions arises out of a simple "nonlinear master equation".'

In [30]:
entity_counts = []
for entity in entities:
    entity_counts.append(entity().count())

In [52]:
pd.DataFrame([entity.__name__ for entity in entities], entity_counts).reset_index().rename(columns={'index':'Count', 0:'Entity'}).sort_values('Count',ascending=False)

Unnamed: 0,Count,Entity
0,249179994,Works
1,90323601,Authors
2,251787,Sources
3,107447,Institutions
4,65073,Concepts
6,32437,Funders
5,10249,Publishers


In [86]:
works = Works().get(per_page=200)

### Training Pair 1: Title-Abstract

In [None]:
titles = [work['title'] for work in works]

In [89]:
abstracts = [work['abstract'] for work in works]

In [95]:
title_abs_pairs = [{"texts": [title, abstract]} for title, abstract in zip(titles, abstracts)]

In [101]:
title_abs_pairs[0:3]

[{'texts': ['PROTEIN MEASUREMENT WITH THE FOLIN PHENOL REAGENT',
   'Since 1922 when Wu proposed the use of the Folin phenol reagent for the measurement of proteins (l), a number of modified analytical procedures ut.ilizing this reagent have been reported for the determination of proteins in serum (2-G), in antigen-antibody precipitates (7-9), and in insulin (10).Although the reagent would seem to be recommended by its great sensitivity and the simplicity of procedure possible with its use, it has not found great favor for general biochemical purposes.In the belief that this reagent, nevertheless, has considerable merit for certain application, but that its peculiarities and limitations need to be understood for its fullest exploitation, it has been studied with regard t.o effects of variations in pH, time of reaction, and concentration of reactants, permissible levels of reagents commonly used in handling proteins, and interfering subst.ances.Procedures are described for measuring pro

### Training Pair 2: Title-Keywords

In [126]:
keyword_dicts = [work['keywords'] for work in works]

In [137]:
keywords = [[keyword['keyword'] for keyword in sublist] for sublist in keyword_dicts]

In [129]:
title_keyword_pairs = [{"texts": [title, keywords]} for title, keywords in zip(titles, keywords)]

In [130]:
title_keyword_pairs[0:3]

[{'texts': ['PROTEIN MEASUREMENT WITH THE FOLIN PHENOL REAGENT',
   ['folin phenol reagent', 'protein']]},
 {'texts': ['Cleavage of Structural Proteins during the Assembly of the Head of Bacteriophage T4',
   ['bacteriophage t4', 'structural proteins', 'head']]},
 {'texts': ['A rapid and sensitive method for the quantitation of microgram quantities of protein utilizing the principle of protein-dye binding',
   ['microgram quantities', 'protein-dye']]}]

### Training Pair 3: Title-Topics

In [135]:
topics_dicts = [work['topics'] for work in works]

In [138]:
topics = [[topic['display_name'] for topic in sublist] for sublist in topics_dicts]

In [139]:
title_topic_pairs = [{"texts": [title, topic]} for title, topics in zip(titles, topics)]

In [140]:
title_topic_pairs[0:3]

[{'texts': ['PROTEIN MEASUREMENT WITH THE FOLIN PHENOL REAGENT',
   ['Glycosylation in Health and Disease',
    'Protein Metabolism in Exercise and Nutrition',
    'Oxidation States in Biochemistry and Medicine']]},
 {'texts': ['Cleavage of Structural Proteins during the Assembly of the Head of Bacteriophage T4',
   ['Ecology and Evolution of Viruses in Ecosystems',
    'RNA Sequencing Data Analysis',
    'Protein Structure Prediction and Analysis']]},
 {'texts': ['A rapid and sensitive method for the quantitation of microgram quantities of protein utilizing the principle of protein-dye binding',
   ['Factors Affecting Meat Quality and Preservation',
    'Microbial Interactions in Wine Production and Flavor',
    'Protein Metabolism in Exercise and Nutrition']]}]

### Training Pair 4: Title-Journal

In [169]:
journals = [work['primary_location']['source']['display_name'] if 'primary_location' in work and work['primary_location'] is not None and 'source' in work['primary_location'] and work['primary_location']['source'] is not None and 'display_name' in work['primary_location']['source'] else '' for work in works]

In [171]:
title_journal_pairs = [{"texts": [title, journal]} for title, journal in zip(titles, journals)]

In [172]:
title_journal_pairs[0:3]

[{'texts': ['PROTEIN MEASUREMENT WITH THE FOLIN PHENOL REAGENT',
   'Journal of Biological Chemistry']},
 {'texts': ['Cleavage of Structural Proteins during the Assembly of the Head of Bacteriophage T4',
   'Nature']},
 {'texts': ['A rapid and sensitive method for the quantitation of microgram quantities of protein utilizing the principle of protein-dye binding',
   'Analytical Biochemistry']}]

### Training Pair 5: Abstract-Keywords

In [173]:
abs_keyword_pairs = [{"texts": [abstract, keywords]} for abstract, keywords in zip(abstracts, keywords)]

In [174]:
abs_keyword_pairs[0:3]

[{'texts': ['Since 1922 when Wu proposed the use of the Folin phenol reagent for the measurement of proteins (l), a number of modified analytical procedures ut.ilizing this reagent have been reported for the determination of proteins in serum (2-G), in antigen-antibody precipitates (7-9), and in insulin (10).Although the reagent would seem to be recommended by its great sensitivity and the simplicity of procedure possible with its use, it has not found great favor for general biochemical purposes.In the belief that this reagent, nevertheless, has considerable merit for certain application, but that its peculiarities and limitations need to be understood for its fullest exploitation, it has been studied with regard t.o effects of variations in pH, time of reaction, and concentration of reactants, permissible levels of reagents commonly used in handling proteins, and interfering subst.ances.Procedures are described for measuring protein in solution or after precipitation wit,h acids or o

### Training Pair 6: Abstract-Topics

In [175]:
abs_topic_pairs = [{"texts": [abstract, topics]} for abstract, topics in zip(abstracts, topics)]

In [176]:
abs_topic_pairs[0:3]

[{'texts': ['Since 1922 when Wu proposed the use of the Folin phenol reagent for the measurement of proteins (l), a number of modified analytical procedures ut.ilizing this reagent have been reported for the determination of proteins in serum (2-G), in antigen-antibody precipitates (7-9), and in insulin (10).Although the reagent would seem to be recommended by its great sensitivity and the simplicity of procedure possible with its use, it has not found great favor for general biochemical purposes.In the belief that this reagent, nevertheless, has considerable merit for certain application, but that its peculiarities and limitations need to be understood for its fullest exploitation, it has been studied with regard t.o effects of variations in pH, time of reaction, and concentration of reactants, permissible levels of reagents commonly used in handling proteins, and interfering subst.ances.Procedures are described for measuring protein in solution or after precipitation wit,h acids or o

### Training Pair 7: Abstract-Journal

In [177]:
abs_journal_pairs = [{"texts": [abstract, journal]} for abstract, journal in zip(abstracts, journals)]

In [178]:
abs_topic_pairs[0:3]

[{'texts': ['Since 1922 when Wu proposed the use of the Folin phenol reagent for the measurement of proteins (l), a number of modified analytical procedures ut.ilizing this reagent have been reported for the determination of proteins in serum (2-G), in antigen-antibody precipitates (7-9), and in insulin (10).Although the reagent would seem to be recommended by its great sensitivity and the simplicity of procedure possible with its use, it has not found great favor for general biochemical purposes.In the belief that this reagent, nevertheless, has considerable merit for certain application, but that its peculiarities and limitations need to be understood for its fullest exploitation, it has been studied with regard t.o effects of variations in pH, time of reaction, and concentration of reactants, permissible levels of reagents commonly used in handling proteins, and interfering subst.ances.Procedures are described for measuring protein in solution or after precipitation wit,h acids or o

### Training Pair 8: Title-Funder

In [None]:
### Not useful here as grants data is limited for works (only 4/200 articles have data)

In [215]:
grants = [work['grants'] for work in works]

In [216]:
grants

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [{'funder': 'https://openalex.org/F4320332161',
   'funder_display_name': 'National Institutes of Health',
   'award_id': None}],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [{'funder': 'https://openalex.org/F4320321040',
   'funder_display_name': 'National Science Council',
   'award_id': 'NSC 89-2213-E-002-013NSC 89-2213-E-002-106'}],
 [],
 [],
 [],
 [],
 []