In [None]:
import requests
import time
import json

# Configuration for the INSPIRE-HEP sampling run.
# Primary arXiv category to sample from.
category = "nucl-th" # hep-lat
# Inclusive range of years to query, one request per year.
start_year = 1995
end_year = 2025
# Number of records requested for each year.
sample_per_year = 100

base_url = "https://inspirehep.net/api/literature"

# Seconds to wait on the server before giving up. Without a timeout,
# requests.get() can block indefinitely on a stalled connection.
request_timeout = 30


# Initial request: only used to report how many papers the category holds,
# so a single record per page is enough (only hits.total is read).
query = f"primary_arxiv_category:{category}"
params = {'q': query, 'size': 1, 'page': 1}
response = requests.get(base_url, params=params, timeout=request_timeout)

# Raw API hits accumulated over all years; duplicates are not filtered here.
all_samples = []


if response.status_code == 200:
    # Report the total number of papers in the category.
    data = response.json()
    total_hits = data['hits']['total']
    print(f"Number of papers in {category}: {total_hits}")


    # Request sample_per_year papers for each year in the range.
    for year in range(start_year, end_year+1):
        print(f"year: {year}")

        query = f"primary_arxiv_category:{category} and year: {year}"

        # NOTE(review): confirm against the INSPIRE REST API documentation
        # that 'sort': 'random' and 'seed' are honoured by this endpoint.
        params = {'q': query,
                  'size': sample_per_year,
                  'page': 1,
                  'sort': 'random',
                  'seed': 42}

        try:
            response = requests.get(base_url, params=params, timeout=request_timeout)
            # Turn HTTP error statuses into exceptions handled below.
            response.raise_for_status()

            data = response.json()
            hits = data['hits']['hits']

            all_samples.extend(hits)
            print(f"{year} is done, until now: {len(all_samples)}")

            # To avoid overloading the server, add a short delay between requests.
            time.sleep(0.5)

        except requests.exceptions.RequestException as e:
            # Timeouts are RequestException subclasses, so they land here too.
            # Stop at the first failing year; all_samples keeps what was
            # collected so far.
            print(f"Error occurs during year {year}: {e}")
            break

else:
    print(f"Failed on initial API request: {response.status_code}")
    print(response.text)


In [None]:
# The sampling loop above may have stored the same paper more than once
# (e.g. different versions of it), so keep only the first record seen for
# each id, in the order the records were collected.

print(f"Before removing duplicates: {len(all_samples)}")

# Dicts preserve insertion order, and setdefault() never overwrites an
# existing key, so this maps every id to its first occurrence.
first_record_by_id = {}
for record in all_samples:
    first_record_by_id.setdefault(record['id'], record)

unique_papers = list(first_record_by_id.values())
seen_ids = set(first_record_by_id)

print(f"After removing duplicates: {len(unique_papers)}")

In [None]:
# Save the de-duplicated records as JSON. The file is opened explicitly as
# UTF-8 because ensure_ascii=False writes non-ASCII characters verbatim;
# with the platform default encoding this can raise UnicodeEncodeError or
# produce a file that UTF-8 readers decode incorrectly.
with open('../data/data_nucl-th_100.json', 'w', encoding='utf-8') as f:
    json.dump(unique_papers, f, ensure_ascii=False, indent=2)

# Import the data

In [1]:
import pandas as pd

# Load the saved records into a DataFrame.
data_path = "../data/data_nucl-th_100.json"
df = pd.read_json(data_path)

In [2]:
# Preview the first five rows of the loaded records.
df.head(n=5)

Unnamed: 0,id,links,updated,metadata,created
0,1459726,{'bibtex': 'https://inspirehep.net/api/literat...,2024-05-07T08:50:24.810153+00:00,{'publication_info': [{'journal_volume': '594'...,2016-05-18T00:00:00+00:00
1,1459842,{'bibtex': 'https://inspirehep.net/api/literat...,2024-05-07T09:32:13.195422+00:00,{'publication_info': [{'journal_volume': '351'...,2016-05-18T00:00:00+00:00
2,1460442,{'bibtex': 'https://inspirehep.net/api/literat...,2024-05-07T09:32:06.686143+00:00,{'publication_info': [{'journal_volume': '344'...,2016-05-19T00:00:00+00:00
3,392718,{'bibtex': 'https://inspirehep.net/api/literat...,2023-03-06T17:45:25.590277+00:00,{'publication_info': [{'journal_volume': '340'...,1995-02-14T00:00:00+00:00
4,397533,{'bibtex': 'https://inspirehep.net/api/literat...,2024-05-07T10:36:27.392018+00:00,{'publication_info': [{'journal_volume': '363'...,1995-07-25T00:00:00+00:00


In [3]:
# Show the nested metadata dict of the first record.
df.at[0, 'metadata']

{'publication_info': [{'journal_volume': '594',
   'page_end': '136',
   'year': 1995,
   'page_start': '117',
   'journal_title': 'Nucl.Phys.A'}],
 'citation_count_without_self_citations': 10,
 'citation_count': 20,
 'authors': [{'raw_affiliations': [{'value': "Departament d'Estructura i Constituents de la Matèria, Universitat de Barcelona, Diagonal 647, E-08028 Barcelona, Spain"}],
   'full_name_unicode_normalized': 'polls, a.',
   'full_name': 'Polls, A.',
   'record': {'$ref': 'https://inspirehep.net/api/authors/1034953'},
   'ids': [{'schema': 'INSPIRE BAI', 'value': 'A.Polls.1'}],
   'last_name': 'Polls',
   'signature_block': 'PALa',
   'first_name': 'A.',
   'uuid': '4c070c40-d8a6-4597-8319-0bff522901c4',
   'recid': 1034953},
  {'raw_affiliations': [{'value': 'Institut für Theoretische Physik, Universität Tübingen, Auf der Morgenstelle 14, D-72076 Tübingen, Germany'}],
   'full_name_unicode_normalized': 'müther, h.',
   'full_name': 'Müther, H.',
   'record': {'$ref': 'https:/