In [87]:
import json

In [88]:
with open("../data/papers_arxiv_v1.json", 'r') as f:
    data = json.load(f)

In [89]:
len(list(data.keys()))

803

In [14]:
data["abs/2110.08861v2"].keys()

dict_keys(['id', 'title', 'authors', 'abstract', 'arxiv_id', 'category'])

In [24]:
import urllib.parse
from typing import Dict, List, Tuple, Union
import requests

In [19]:
ARXIV_LINK = "http://export.arxiv.org/api/query?search_query="

In [23]:
def construct_arxiv_link(
    paper_title: Union[str, None] = None,
    author: Union[str, None] = None,
    abstract: Union[str, None] = None,
    start_idx: int = 0,
    max_results: int = 100,
) -> str:
    """The function that constructs Arxiv link to request the Arxiv API

    Parameters
    ----------
    paper_title : Union[str, None], optional
        Title of the paper as requested by the user, by default None
    author : Union[str, None], optional
        Author of the paper as requested by the user, by default None
    abstract : Union[str, None], optional
        A keyword from Abstract of the paper as requested by the user, by
        default None
    start_idx : int, optional
        Start index for pagination, by default 0
    max_results : int, optional
        End index of the pagination, by default 100

    Returns
    -------
    str
        Arxiv link
    """
    param_dict = {
        "ti": paper_title,
        "au": author,
        "abs": abstract,
    }
    str_query = ""
    query_list = []
    for k, v in param_dict.items():
        if v is not None:
            value_str = " ".join(v.split(" "))
            value_str = f"%22{value_str}%22"
            query_list.append(f"{k}:{value_str}")
    str_query = "+AND+".join(query_list)
    str_query = urllib.parse.quote_plus(str_query)

    str_query += f"&sortBy=relevance&sortOrder=descending&start={start_idx}&max_results={max_results}"
    return ARXIV_LINK + str_query

In [32]:
link_arxiv = construct_arxiv_link(
            paper_title="Attention is all you need", author=None, abstract=None, max_results=100
        )

In [33]:
link_arxiv

'http://export.arxiv.org/api/query?search_query=ti%3A%2522Attention+is+all+you+need%2522&sortBy=relevance&sortOrder=descending&start=0&max_results=100'

In [34]:
response = requests.get(link_arxiv)

In [35]:
import xml.etree.ElementTree as ET

In [36]:
NAMESPACE = {"n": "http://www.w3.org/2005/Atom"}

In [37]:
xmlstring = response.text
tree = ET.ElementTree(ET.fromstring(xmlstring))
tree_root = tree.getroot()
all_papers = tree_root.findall("n:entry", namespaces=NAMESPACE)

In [43]:
paper = list(all_papers)[0]

In [75]:
categories = paper.findall("n:category", namespaces=NAMESPACE)

In [76]:
[f.get("term") for f in categories]

['cs.CL', 'cs.LG']

In [74]:
categories

<Element '{http://www.w3.org/2005/Atom}category' at 0x7f0adda8c470>

In [70]:
c.get("term").

'cs.CL'

In [61]:
c = cats[0]

In [62]:
c.

In [47]:
list(paper)

[<Element '{http://www.w3.org/2005/Atom}id' at 0x7f0addaab6b0>,
 <Element '{http://www.w3.org/2005/Atom}updated' at 0x7f0addaab770>,
 <Element '{http://www.w3.org/2005/Atom}published' at 0x7f0addaab830>,
 <Element '{http://www.w3.org/2005/Atom}title' at 0x7f0addaab890>,
 <Element '{http://www.w3.org/2005/Atom}summary' at 0x7f0addaab950>,
 <Element '{http://www.w3.org/2005/Atom}author' at 0x7f0addaaba10>,
 <Element '{http://www.w3.org/2005/Atom}author' at 0x7f0addaabb90>,
 <Element '{http://www.w3.org/2005/Atom}author' at 0x7f0addaabc50>,
 <Element '{http://www.w3.org/2005/Atom}author' at 0x7f0addaabd10>,
 <Element '{http://www.w3.org/2005/Atom}author' at 0x7f0addaabdd0>,
 <Element '{http://www.w3.org/2005/Atom}author' at 0x7f0addaabe90>,
 <Element '{http://www.w3.org/2005/Atom}author' at 0x7f0addaabf50>,
 <Element '{http://www.w3.org/2005/Atom}author' at 0x7f0adda8c050>,
 <Element '{http://arxiv.org/schemas/atom}comment' at 0x7f0adda8c170>,
 <Element '{http://www.w3.org/2005/Atom}link'