# CrossRef API

We are using the [habanero](https://github.com/sckott/habanero) python implementation of the [Crossref API](https://api.crossref.org/swagger-ui/index.html)

In [11]:
import pandas as pd
from habanero import Crossref

In [30]:
cr = Crossref()

In [31]:
# Works API, get all the papers (20 per page)
all_works = cr.works()

In [33]:
# Print number of works:
if all_works["status"] == "ok":
    print(all_works["message"]["total-results"])

157136227


In [17]:
# We can use the following filters:
cr.filter_names()

['affiliation',
 'alternative_id',
 'archive',
 'article_number',
 'assertion',
 'assertion_group',
 'award_funder',
 'award_number',
 'category_name',
 'container_title',
 'content_domain',
 'directory',
 'doi',
 'from_accepted_date',
 'from_created_date',
 'from_deposit_date',
 'from_index_date',
 'from_online_pub_date',
 'from_posted_date',
 'from_print_pub_date',
 'from_pub_date',
 'from_update_date',
 'full_text_application',
 'full_text_type',
 'full_text_version',
 'funder',
 'has_abstract',
 'has_affiliation',
 'has_archive',
 'has_assertion',
 'has_authenticated_orcid',
 'has_clinical_trial_number',
 'has_content_domain',
 'has_crossmark_restriction',
 'has_full_text',
 'has_funder',
 'has_license',
 'has_orcid',
 'has_references',
 'has_relation',
 'has_update_policy',
 'is_update',
 'issn',
 'license_delay',
 'license_url',
 'license_version',
 'location',
 'member',
 'orcid',
 'prefix',
 'public_references',
 'publisher_name',
 'relation_object',
 'relation_object_type',
 '

In [25]:
# Filtering out papers without abstract metadata. Other papers also have abstract but was not ingested into the metadata
works_with_abstracts = cr.works(filter = {'has_abstract': True})

In [29]:
# Print number of works with abstract:
if works_with_abstracts["status"] == "ok":
    print(works_with_abstracts["message"]["total-results"])

30432983


In [34]:
# Abstract and full-text
works_with_abstracts_and_full_text = cr.works(filter = {'has_abstract': True, 'has_full_text': True})

In [35]:
works_with_abstracts_and_full_text["message"]["total-results"]

26556010

In [66]:
# Query search is not fulltext, only title and authors and other ingested metadata is used
#material_science_works = cr.works(Query="Materials Science", filter = {'has_abstract': True})
# use category_name instead, but this requires a exact match:
material_science_works = cr.works(filter = {'has_abstract': True, 'category_name': 'General Materials Science'})
# Print number of materials science works:
if material_science_works["status"] == "ok":
    print(material_science_works["message"]["total-results"])

736679


In [68]:
# Some info
for work in material_science_works['message']['items']:
    print("#"*30)
    print(work['title'][0])
    print(work['type'])
    print(work['abstract'])
    if ('link' in work):
        last_line = "";
        for link in work['link']:
            if (last_line != link['URL']):
                last_line = link['URL']
        print(last_line)
    if ('subject' in work):
        print(work['subject'])

##############################
Comparative Study of Calculation Methods for Shield Thickness of Typical Materials against Gamma Rays
journal-article
<jats:p>The paper aims to analyze the shielding properties of concrete and lead materials against gamma rays at different energies, and the relationships between the shield thickness of the two materials and gamma ray energy and attenuation factor have been obtained by using the method of attenuation multiple and the method of half-value-thickness, respectively. The results show that when the gamma ray energy and the attenuation factor are determined, the thickness of the concrete shield layer obtained by the method of attenuation multiple is greater than that obtained by the half-value-thickness method. The relative magnitude of thickness of lead shield obtained by the method of attenuation multiple and the method of half-value-thickness method is relate to the gamma ray energy. When the gamma ray energy is lower than 8 MeV, the thickness

In [71]:
# looking for category_name related to 'General Materials Science'
# Use `limit` and `offset` to get more resutls.
# Setting has_full_text to true is not useful if you do not have access to the papers.
page = cr.works(filter = {'has_abstract': True, 'has_full_text': True, 'category_name': 'General Materials Science'}, limit=20, offset=20)
for work in page['message']['items']:
    if ('subject' in work):
        print(work['subject'])
        print(work['score'])

['General Materials Science']
0.0
['Mechanical Engineering', 'Mechanics of Materials', 'Condensed Matter Physics', 'General Materials Science']
0.0
['Mechanical Engineering', 'Mechanics of Materials', 'General Materials Science']
0.0
['Mechanical Engineering', 'Mechanics of Materials', 'Condensed Matter Physics', 'General Materials Science']
0.0
['Mechanical Engineering', 'Mechanics of Materials', 'General Materials Science']
0.0
['Marketing', 'Strategy and Management', 'General Materials Science', 'Media Technology']
0.0
['Mechanical Engineering', 'Mechanics of Materials', 'Condensed Matter Physics', 'General Materials Science']
0.0
['General Materials Science']
0.0
['General Materials Science']
0.0
['Mechanical Engineering', 'Mechanics of Materials', 'General Materials Science']
0.0
['Condensed Matter Physics', 'General Materials Science']
0.0
['General Materials Science']
0.0
['Developmental Biology', 'Endocrinology', 'Genetics', 'General Materials Science', 'Molecular Biology', 'An

In [70]:
cr.works(ids = '10.2991/MMME-16.2016.184')

{'status': 'ok',
 'message-type': 'work',
 'message-version': '1.0.0',
 'message': {'indexed': {'date-parts': [[2022, 4, 5]],
   'date-time': '2022-04-05T22:30:16Z',
   'timestamp': 1649197816235},
  'publisher-location': 'Paris, France',
  'reference-count': 0,
  'publisher': 'Atlantis Press',
  'content-domain': {'domain': [], 'crossmark-restriction': False},
  'short-container-title': [],
  'published-print': {'date-parts': [[2016]]},
  'DOI': '10.2991/mmme-16.2016.184',
  'type': 'proceedings-article',
  'created': {'date-parts': [[2016, 9, 25]],
   'date-time': '2016-09-25T17:41:52Z',
   'timestamp': 1474825312000},
  'source': 'Crossref',
  'is-referenced-by-count': 0,
  'title': ['Finite element analysis of portal frame under fire condition'],
  'prefix': '10.2991',
  'author': [{'given': 'Dong',
    'family': 'Chen',
    'sequence': 'first',
    'affiliation': []},
   {'given': 'Jinzhu',
    'family': 'Tang',
    'sequence': 'first',
    'affiliation': []}],
  'member': '1574',

In [72]:
journals = cr.journals()
print(journals)

{'status': 'ok', 'message-type': 'journal-list', 'message-version': '1.0.0', 'message': {'items-per-page': 20, 'query': {'start-index': 0, 'search-terms': None}, 'total-results': 130012, 'items': [{'last-status-check-time': 1712016402765, 'counts': {'current-dois': 350, 'backfile-dois': 12233, 'total-dois': 12583}, 'breakdowns': {'dois-by-issued-year': [[1999, 598], [2000, 524], [1996, 504], [1997, 484], [1998, 482], [2001, 462], [2002, 446], [2007, 443], [2006, 407], [2008, 404], [2004, 396], [2003, 371], [2009, 367], [2005, 362], [1995, 337], [2016, 299], [1993, 298], [1994, 297], [1992, 266], [2011, 263], [1990, 263], [1991, 253], [2010, 247], [1989, 228], [1988, 227], [2021, 225], [2014, 222], [2020, 212], [2017, 207], [2012, 207], [2019, 205], [1987, 186], [2018, 178], [2013, 165], [2022, 161], [2015, 155], [1986, 150], [2023, 147], [1984, 139], [1982, 133], [1985, 128], [1983, 115], [1981, 92], [1980, 78], [1979, 54], [1976, 53], [1975, 47], [2024, 42], [1977, 39], [1978, 15]]}, 

In [75]:
journals['message']['items'][0]

{'last-status-check-time': 1712016402765,
 'counts': {'current-dois': 350, 'backfile-dois': 12233, 'total-dois': 12583},
 'breakdowns': {'dois-by-issued-year': [[1999, 598],
   [2000, 524],
   [1996, 504],
   [1997, 484],
   [1998, 482],
   [2001, 462],
   [2002, 446],
   [2007, 443],
   [2006, 407],
   [2008, 404],
   [2004, 396],
   [2003, 371],
   [2009, 367],
   [2005, 362],
   [1995, 337],
   [2016, 299],
   [1993, 298],
   [1994, 297],
   [1992, 266],
   [2011, 263],
   [1990, 263],
   [1991, 253],
   [2010, 247],
   [1989, 228],
   [1988, 227],
   [2021, 225],
   [2014, 222],
   [2020, 212],
   [2017, 207],
   [2012, 207],
   [2019, 205],
   [1987, 186],
   [2018, 178],
   [2013, 165],
   [2022, 161],
   [2015, 155],
   [1986, 150],
   [2023, 147],
   [1984, 139],
   [1982, 133],
   [1985, 128],
   [1983, 115],
   [1981, 92],
   [1980, 78],
   [1979, 54],
   [1976, 53],
   [1975, 47],
   [2024, 42],
   [1977, 39],
   [1978, 15]]},
 'publisher': 'Wiley (John Wiley & Sons)',
 'cov

# OpenAlex

In [24]:
from pyalex import Works , Authors, Sources, Institutions, Topics, Publishers, Funders

In [13]:
example = Works()["https://doi.org/10.7717/peerj.4375"]

In [14]:
example["open_access"]

{'is_oa': True,
 'oa_status': 'gold',
 'oa_url': 'https://peerj.com/articles/4375.pdf',
 'any_repository_has_fulltext': True}

In [15]:
example["abstract"]

'Despite growing interest in Open Access (OA) to scholarly literature, there is an unmet need for large-scale, up-to-date, and reproducible studies assessing the prevalence and characteristics of OA. We address this need using oaDOI, an open online service that determines OA status for 67 million articles. We use three samples, each of 100,000 articles, to investigate OA in three populations: (1) all journal articles assigned a Crossref DOI, (2) recent journal articles indexed in Web of Science, and (3) articles viewed by users of Unpaywall, an open-source browser extension that lets users find OA articles using oaDOI. We estimate that at least 28% of the scholarly literature is OA (19M in total) and that this proportion is growing, driven particularly by growth in Gold and Hybrid. The most recent year analyzed (2015) also has the highest percentage of OA (45%). Because of this growth, and the fact that readers disproportionately access newer articles, we find that Unpaywall users enco

In [22]:
results, meta = Works().filter(is_oa=True,has_references=True,has_oa_accepted_or_published_version=True,has_abstract=True).get(return_meta=True)

In [23]:
meta

{'count': 23575688,
 'db_response_time_ms': 503,
 'page': 1,
 'per_page': 25,
 'groups_count': None}

In [25]:
Topics().get()

[{'id': 'https://openalex.org/T11475',
  'display_name': 'Territorial Governance and Environmental Participation',
  'description': 'This cluster of papers explores the intersection of territorial governance, environmental participation, and sustainable development. It delves into topics such as citizen participation, local development, social justice, and community engagement in the context of rural territories. The papers also discuss the role of proximity and innovation in shaping public policy for sustainable and inclusive development.',
  'keywords': ['Territorial Governance',
   'Environmental Participation',
   'Sustainable Development',
   'Citizen Participation',
   'Local Development',
   'Proximity and Innovation',
   'Rural Territories',
   'Public Policy',
   'Social Justice',
   'Community Engagement'],
  'ids': {'openalex': 'https://openalex.org/T11475',
   'wikipedia': 'https://en.wikipedia.org/wiki/Territorial_governance'},
  'subfield': {'id': 'https://openalex.org/su

In [32]:
for w in results:
    print("#"*30)
    print(w['display_name'])
    print([l['display_name'] for l in w['concepts'] ])
    print([l['display_name'] for l in w['topics'] ])
    print(len(w['related_works']))

##############################
PROTEIN MEASUREMENT WITH THE FOLIN PHENOL REAGENT
['Reagent', 'Chemistry', 'Phenol', 'Chromatography', 'Organic chemistry']
['Glycosylation in Health and Disease', 'Protein Metabolism in Exercise and Nutrition', 'Oxidation States in Biochemistry and Medicine']
10
##############################
Deep Residual Learning for Image Recognition
['Residual', 'Computer science', 'Coco', 'Artificial intelligence', 'Object detection', 'Deep learning', 'Segmentation', 'Pattern recognition (psychology)', 'Set (abstract data type)', 'Layer (electronics)', 'Artificial neural network', 'Task (project management)', 'Machine learning', 'Residual neural network', 'Deep neural networks', 'Algorithm', 'Chemistry', 'Management', 'Organic chemistry', 'Economics', 'Programming language']
['Deep Learning in Computer Vision and Image Recognition', 'Advances in Transfer Learning and Domain Adaptation', 'Image Feature Retrieval and Recognition Techniques']
10
#######################

In [35]:
matSciTopics = Topics().search("Materials Science").get()

In [36]:
for w in matSciTopics:
    print(w['display_name'])

Materials Science and Technology
Materials Science and Engineering and Thermodynamics
Polymer Nanocomposites in Material Science
Railway Engineering and Material Science
Structural Engineering and Materials Science
Materials Science and Engineering in Military Applications
Accelerating Materials Innovation through Informatics
Advancements in Materials Science and Engineering
Mechanical Engineering and Materials Science Research
Atom Probe Tomography Research
N-Heterocyclic Carbenes in Catalysis and Materials Chemistry
Phase-Field Modeling of Microstructure Evolution
Fluid Dynamics and Engineering Applications
Power Electronics Technology
Glass Science and Technology
Metal Matrix Composites: Science and Applications
Nanotechnology and Nanoscale Science Education
Terahertz Technology and Applications
Sociomateriality in Information Systems Research
Analysis and Detection of Latent Fingerprints
Advanced Industrial Engineering and Technology Development
History of Education and Cultural Pr

# WikiData

In [2]:
from SPARQLWrapper import SPARQLWrapper, JSON
wikidata = SPARQLWrapper("https://query.wikidata.org/sparql")

In [7]:
catQuery = """SELECT distinct ?categoryObj ?category {
    ?articleObj wdt:P31   wd:Q13442814 .
    ?articleObj wdt:P921  ?categoryObj .
  
    SERVICE wikibase:label    {
      bd:serviceParam wikibase:language "en" .
      ?categoryObj rdfs:label ?category .
    #  ?journalObj rdfs:label ?journal .
    #  ?article schema:description ?articleDescription .
    }
} limit 10"""

In [8]:
wikidata.setQuery(catQuery)
wikidata.setReturnFormat(JSON)
results = wikidata.query().convert()

EndPointInternalError: EndPointInternalError: The endpoint returned the HTTP status code 500. 

Response:
b'SPARQL-QUERY: queryStr=SELECT distinct ?category {\n    ?articleObj wdt:P31   wd:Q13442814 .\n    ?articleObj wdt:P921  ?categoryObj .\n  \n    SERVICE wikibase:label    {\n      bd:serviceParam wikibase:language "en" .\n      ?categoryObj rdfs:label ?category .\n    #  ?journalObj rdfs:label ?journal .\n    #  ?article schema:description ?articleDescription .\n    }\n} limit 10\njava.util.concurrent.TimeoutException\n\tat java.util.concurrent.FutureTask.get(FutureTask.java:205)\n\tat com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:292)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlQuery(QueryServlet.java:678)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doGet(QueryServlet.java:290)\n\tat com.bigdata.rdf.sail.webapp.RESTServlet.doGet(RESTServlet.java:240)\n\tat com.bigdata.rdf.sail.webapp.MultiTenancyServlet.doGet(MultiTenancyServlet.java:273)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:687)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:790)\n\tat org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:865)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1655)\n\tat org.wikidata.query.rdf.blazegraph.throttling.ThrottlingFilter.doFilter(ThrottlingFilter.java:320)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1642)\n\tat org.wikidata.query.rdf.blazegraph.throttling.SystemOverloadFilter.doFilter(SystemOverloadFilter.java:82)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1642)\n\tat ch.qos.logback.classic.helpers.MDCInsertingServletFilter.doFilter(MDCInsertingServletFilter.java:50)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1642)\n\tat org.wikidata.query.rdf.blazegraph.filters.QueryEventSenderFilter.doFilter(QueryEventSenderFilter.java:122)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1642)\n\tat org.wikidata.query.rdf.blazegraph.filters.ClientIPFilter.doFilter(ClientIPFilter.java:43)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1642)\n\tat org.wikidata.query.rdf.blazegraph.filters.JWTIdentityFilter.doFilter(JWTIdentityFilter.java:66)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1642)\n\tat org.wikidata.query.rdf.blazegraph.filters.RealAgentFilter.doFilter(RealAgentFilter.java:33)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1642)\n\tat org.wikidata.query.rdf.blazegraph.filters.RequestConcurrencyFilter.doFilter(RequestConcurrencyFilter.java:50)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1634)\n\tat org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:533)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:146)\n\tat org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:548)\n\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:132)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:257)\n\tat org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1595)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:255)\n\tat org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1340)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:203)\n\tat org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:473)\n\tat org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1564)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:201)\n\tat org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1242)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:144)\n\tat org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:220)\n\tat org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:126)\n\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:132)\n\tat org.eclipse.jetty.server.Server.handle(Server.java:503)\n\tat org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:364)\n\tat org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:260)\n\tat org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:305)\n\tat org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:103)\n\tat org.eclipse.jetty.io.ChannelEndPoint$2.run(ChannelEndPoint.java:118)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:333)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:310)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:168)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:126)\n\tat org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:366)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:765)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool$2.run(QueuedThreadPool.java:683)\n\tat java.lang.Thread.run(Thread.java:750)\n'

In [10]:
results_df = pd.io.json.json_normalize(results['results']['bindings'])
results_df[['categoryObj.value', 'category.value']].head()

NameError: name 'pd' is not defined