In [91]:
import arxiv
from arxiv import Client, Search, SortCriterion
from datetime import datetime, timedelta
import requests
import json
import time

In [92]:
# Construct the default API client.
client = Client()

### arXiv Python API: Example Usage

In [93]:
# # Search for the 10 most recent articles matching the keyword "quantum."
# search = Search(
#   query = "cat:cs.RO",
#   max_results = 100,
#   sort_by = SortCriterion.SubmittedDate
# )

# results = client.results(search)

# # `results` is a generator; you can iterate over its elements one by one...
# for r in client.results(search):
#   print(r.title)
# # ...or exhaust it into a list. Careful: this is slow for large results sets.
# all_results = list(results)
# print([r.title for r in all_results])

In [94]:
# # For advanced query syntax documentation, see the arXiv API User Manual:
# # https://arxiv.org/help/api/user-manual#query_details
# search = Search(query = "au:del_maestro AND ti:checkerboard")
# first_result = next(client.results(search))
# print(first_result)

# # Search for the paper with ID "1605.08386v1"
# search_by_id = Search(id_list=["1605.08386v1"])
# # Reuse client to fetch the paper, then print its title.
# first_result = next(client.results(search))
# print(first_result.title)

### Papers in a Specified Date Range

In [95]:
def search_arxiv_by_date_range(query, start_date, end_date, sort_by="citations"):
    # Format the date for arXiv search
    end_date_str = end_date.strftime('%Y%m%d')
    start_date_str = (start_date + timedelta(days=-1)).strftime('%Y%m%d')

    # Create a search query with the date filter
    # search_query = f"{query} AND submittedDate:[{formatted_date} TO {formatted_date}]"
    search_query = f"{query} AND submittedDate:[{start_date_str} TO {end_date_str}]"

    # Perform the search
    search = Search(
        query=search_query,
        max_results=100,
        sort_by=SortCriterion.SubmittedDate
    )

    # Iterate over the results
    results = []
    for result in client.results(search):
        results.append(result)

    # TODO sort by citations, or any other condition
    if sort_by == "citations":
        pass # TODO
        
    elif sort_by == "recency": 
        results = sorted(results, key=lambda x: x.published.date())

    return results

In [96]:
search_arxiv_by_date_range("cat:cs.RO", datetime.fromisoformat("2024-06-01").date(), datetime.fromisoformat("2024-06-01").date())

[arxiv.Result(entry_id='http://arxiv.org/abs/2406.00211v1', updated=datetime.datetime(2024, 5, 31, 21, 50, 42, tzinfo=datetime.timezone.utc), published=datetime.datetime(2024, 5, 31, 21, 50, 42, tzinfo=datetime.timezone.utc), title='Navigating Autonomous Vehicle on Unmarked Roads with Diffusion-Based Motion Prediction and Active Inference', authors=[arxiv.Result.Author('Yufei Huang'), arxiv.Result.Author('Yulin Li'), arxiv.Result.Author('Andrea Matta'), arxiv.Result.Author('Mohsen Jafari')], summary="This paper presents a novel approach to improving autonomous vehicle control\nin environments lacking clear road markings by integrating a diffusion-based\nmotion predictor within an Active Inference Framework (AIF). Using a simulated\nparking lot environment as a parallel to unmarked roads, we develop and test\nour model to predict and guide vehicle movements effectively. The\ndiffusion-based motion predictor forecasts vehicle actions by leveraging\nprobabilistic dynamics, while AIF aids 

### Searching for Semantic Scholar Article by Title

In [134]:
def search_semantic_scholar(title, author=None, api_key=None):
    url = 'https://api.semanticscholar.org/graph/v1/paper/search'
    query = title
    if author:
        query += f" {author}"
    
    params = {
        'query': query,
        # 'fields': 'title,authors,externalIds,corpusId',
        'fields': 'title,authors,externalIds,corpusId,citationCount',
        'limit': 1
    }
    
    # headers = {}
    # if api_key:
    #     headers['x-api-key'] = api_key
    
    # response = requests.get(url, params=params, headers=headers)
    response = requests.get(url, params=params)
    data = response.json()
    
    if 'data' in data and len(data['data']) > 0:
        paper = data['data'][0]
        return paper
    
    return None

In [145]:
for title in titles:
    print(search_semantic_scholar(title))
    # paper = search_semantic_scholar(title)
    # if paper is None: continue
    # print(f"{paper['title']} (cited by {paper['citationCount']})")
    time.sleep(1)

{'paperId': '7ba8b89bf4f915e89bcdccdcdea9e52f7739bfc2', 'externalIds': {'ArXiv': '2406.00518', 'CorpusId': 270214475}, 'corpusId': 270214475, 'title': 'Learning to Play Air Hockey with Model-Based Deep Reinforcement Learning', 'citationCount': 1, 'authors': [{'authorId': '2179879892', 'name': 'Andrej Orsula'}]}
{'paperId': 'b0bef6fbb26d3b0540df8d52fc57c831c791e0ce', 'externalIds': {'ArXiv': '2406.00504', 'CorpusId': 270217757}, 'corpusId': 270217757, 'title': 'Research on an Autonomous UAV Search and Rescue System Based on the Improved', 'citationCount': 0, 'authors': [{'authorId': '2164731402', 'name': 'Haobin Chen'}, {'authorId': '2304445355', 'name': 'Junyu Tao'}, {'authorId': '2304673169', 'name': 'Bize Zhou'}, {'authorId': '2304922034', 'name': 'Xiaoyan Liu'}]}
{'paperId': '5115c12b124424f50b8576adc4df83b7c412daf3', 'externalIds': {'ArXiv': '2406.00485', 'CorpusId': 270218671}, 'corpusId': 270218671, 'title': 'TacShade A New 3D-printed Soft Optical Tactile Sensor Based on Light, S

In [133]:
url = 'https://api.semanticscholar.org/graph/v1/paper/search'
query = "Learning to Play Air Hockey with Model-Based Deep Reinforcement Learning"

params = {
    'query': query,
    'fields': 'title,authors,externalIds,corpusId',
    'limit': 1
}

# headers = {}
# if api_key:
#     headers['x-api-key'] = api_key

# response = requests.get(url, params=params, headers=headers)
response = requests.get(url, params=params)
data = response.json()

if 'data' in data and len(data['data']) > 0:
    paper = data['data'][0]

In [114]:
data

{'total': 20,
 'offset': 0,
 'next': 1,
 'data': [{'paperId': '7ba8b89bf4f915e89bcdccdcdea9e52f7739bfc2',
   'externalIds': {'ArXiv': '2406.00518', 'CorpusId': 270214475},
   'corpusId': 270214475,
   'title': 'Learning to Play Air Hockey with Model-Based Deep Reinforcement Learning',
   'authors': [{'authorId': '2179879892', 'name': 'Andrej Orsula'}]}]}

### Batch-Retrieving Citation Counts w/ Semantic Scholar API 

In [99]:
# import requests
# import time

# def get_citation_counts(paper_ids, api_key):
#     url = 'https://api.semanticscholar.org/graph/v1/paper/batch'
#     headers = {'x-api-key': api_key}
    
#     # Prepare the request payload
#     request_payload = {
#         "ids": paper_ids,
#         "fields": "title,citationCount"
#     }

#     response = requests.post(url, headers=headers, json=request_payload)
#     data = response.json()
    
#     citation_counts = {}
#     for paper in data['data']:
#         title = paper.get('title', 'Unknown Title')
#         citation_count = paper.get('citationCount', 0)
#         citation_counts[title] = citation_count
    
#     return citation_counts

# # Example usage
# if __name__ == "__main__":
#     api_key = 'YOUR_API_KEY'  # Replace with your Semantic Scholar API key
#     paper_ids = [
#         # "10.1109/5.771073",  # Replace with actual DOIs or paper IDs
#         # "10.1038/nature24271",
#         "2406.00518",
#         "2406.00504",
#         "2406.00485",
#         "2406.00451",
#         "2406.00447v1",
#         "2406.00439v1",
#         "2406.00430v1",
#         "2406.00375v1",
#         "2406.00364v1",
#         "2406.00315v1",
#         "2406.00313v2",
#         "2406.00312v1",
#         # Add more paper DOIs or IDs as needed
#     ]
    
#     # Batch the requests to avoid overloading
#     batch_size = 10
#     all_citation_counts = {}
    
#     for i in range(0, len(paper_ids), batch_size):
#         batch = paper_ids[i:i + batch_size]
#         citation_counts = get_citation_counts(batch, api_key)
#         all_citation_counts.update(citation_counts)
#         time.sleep(3)  # Add delay to avoid hitting rate limits

#     for title, count in all_citation_counts.items():
#         print(f"Citation count for '{title}': {count}")


In [100]:
# url = 'https://api.semanticscholar.org/graph/v1/paper/batch'
# # headers = {'x-api-key': api_key}

# # paper_ids = [paper.entry_id.split("/")[-1] for paper in sorted_data]
# paper_ids = [paper.entry_id for paper in sorted_data]

# # Prepare the request payload
# request_payload = {
#     "ids": paper_ids,
#     "fields": "title,citationCount"
# }

# # response = requests.post(url, headers=headers, json=request_payload)
# response = requests.post(url, json=request_payload)
# data = response.json()

# citation_counts = {}
# for paper in data['data']:
#     title = paper.get('title', 'Unknown Title')
#     citation_count = paper.get('citationCount', 0)
#     citation_counts[title] = citation_count

# citation_counts

In [101]:
def search_semantic_scholar_batch(titles, api_key=None):
    url = 'https://api.semanticscholar.org/graph/v1/paper/batch'
    
    # Prepare the request payload
    request_payload = {
        "queries": [{"query": title} for title in titles],
        "fields": "title,authors,doi,corpusId"
    }
    
    headers = {}
    if api_key:
        headers['x-api-key'] = api_key
    
    response = requests.post(url, headers=headers, json=request_payload)
    data = response.json()
    
    return data['data']

In [102]:
titles = [paper.title for paper in results]
# search_semantic_scholar_batch(titles)
url = 'https://api.semanticscholar.org/graph/v1/paper/batch'

# Prepare the request payload
request_payload = {
    "queries": [{"query": title} for title in titles],
    "fields": "title,authors,doi,corpusId"
}

# headers = {}
# if api_key:
#     headers['x-api-key'] = api_key

# response = requests.post(url, headers=headers, json=request_payload)
response = requests.post(url, json=request_payload)
data = response.json()

In [103]:
data

{'error': 'Invalid input JSON'}

In [104]:
[{"query": title} for title in titles]

[{'query': 'Learning to Play Air Hockey with Model-Based Deep Reinforcement Learning'},
 {'query': 'Research on an Autonomous UAV Search and Rescue System Based on the Improved'},
 {'query': 'TacShade A New 3D-printed Soft Optical Tactile Sensor Based on Light, Shadow and Greyscale for Shape Reconstruction'},
 {'query': 'Task Planning for Object Rearrangement in Multi-room Environments'},
 {'query': 'DroneVis: Versatile Computer Vision Library for Drones'},
 {'query': 'Learning Manipulation by Predicting Interaction'},
 {'query': 'Evaluating Uncertainty-based Failure Detection for Closed-Loop LLM Planners'},
 {'query': 'Teledrive: An Embodied AI based Telepresence System'},
 {'query': 'Cognitive Manipulation: Semi-supervised Visual Representation and Classroom-to-real Reinforcement Learning for Assembly in Semi-structured Environments'},
 {'query': 'Precision and Adaptability of YOLOv5 and YOLOv8 in Dynamic Robotic Environments'},
 {'query': 'From Seedling to Harvest: The GrowingSoy Da

In [105]:

response = requests.get("https://api.semanticscholar.org/graph/v1/paper/search?query=semantic%20scholar%20platform&limit=3")

In [106]:
json.loads(response.content)

{'total': 33619,
 'offset': 0,
 'next': 3,
 'data': [{'paperId': 'cb92a7f9d9dbcf9145e32fdfa0e70e2a6b828eb1',
   'title': 'The Semantic Scholar Open Data Platform'},
  {'paperId': '0be97f920c0370c2a1f0784e49b7513d93f4436a',
   'title': 'The JSTOR Academic Knowledge Graph: Structural Analysis and Primary Services and Applications with Reference to the Semantic Scholar Open Data Platform'},
  {'paperId': '6648762d7755511993eec4177a3badfc37b77937',
   'title': 'Implementation of Open Scholar Platform and Integration of Open Resources in National Taiwan Normal University (NTNU)'}]}

In [107]:
https://api.semanticscholar.org/graph/v1/paper/search?query=semantic%20scholar%20platform&limit=3

SyntaxError: invalid decimal literal (2402333809.py, line 1)

### Scratchpad

In [153]:
type(results[0])

arxiv.Result

In [143]:
dir(results[0])

['Author',
 'Link',
 'MissingFieldError',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_from_feed_entry',
 '_get_default_filename',
 '_get_pdf_url',
 '_raw',
 '_to_datetime',
 'authors',
 'categories',
 'comment',
 'doi',
 'download_pdf',
 'download_source',
 'entry_id',
 'get_short_id',
 'journal_ref',
 'links',
 'pdf_url',
 'primary_category',
 'published',
 'summary',
 'title',
 'updated']

In [150]:
dir(results[0])

['Author',
 'Link',
 'MissingFieldError',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_from_feed_entry',
 '_get_default_filename',
 '_get_pdf_url',
 '_raw',
 '_to_datetime',
 'authors',
 'categories',
 'comment',
 'doi',
 'download_pdf',
 'download_source',
 'entry_id',
 'get_short_id',
 'journal_ref',
 'links',
 'pdf_url',
 'primary_category',
 'published',
 'summary',
 'title',
 'updated']

In [152]:
results[0].__format__("")

TypeError: object.__format__() takes exactly one argument (0 given)