In [1]:
import dotenv
import os
import requests
import pandas as pd
import xmltodict
import cohere
import numpy as np

from pathlib import Path

In [2]:
# Project root is taken to be the parent of the current working directory;
# the `.env` file is expected to sit directly inside it.
PROJ_DIR = Path.cwd().parent
DOTENV_PATH = PROJ_DIR.joinpath('.env')
# Left as the last bare expression so the cell displays load_dotenv's result.
dotenv.load_dotenv(dotenv_path=DOTENV_PATH)

True

In [204]:
# Column order of the DataFrame returned by retrieve_arxiv_articles_df.
_ARXIV_ARTICLE_COLUMNS = (
    'link',
    'updated_ts',
    'published_ts',
    'title',
    'summary',
    'author',
    'category',
)


def _arxiv_entry_to_record(article):
    """Flatten one parsed Atom ``entry`` mapping into a flat record dict.

    Raises ``KeyError`` / ``TypeError`` when an expected field is missing or
    has an unexpected shape; the caller decides whether to skip the entry.
    """
    authors = article['author']
    # A single <author> element is parsed as a mapping, several as a list.
    if isinstance(authors, list):
        author = ", ".join(a['name'] for a in authors)
    else:
        author = authors['name']

    return {
        'link': article['id'],
        'updated_ts': article['updated'],
        'published_ts': article['published'],
        'title': article['title'],
        'summary': article['summary'],
        'author': author,
        'category': article['arxiv:primary_category']['@term'],
    }


def retrieve_arxiv_articles_df(num_articles=10, timeout=60):
    """Query the arXiv export API and tabulate the returned articles.

    The query covers the categories cs.AI, cs.CV, cs.LG, cs.MA, cs.NE, cs.RO
    and cs.SD, starting at offset 0.

    Parameters
    ----------
    num_articles : int, default 10
        Value sent as ``max_results`` in the query string.
    timeout : float, default 60
        Seconds to wait for the HTTP response before ``requests`` gives up.

    Returns
    -------
    articles_df : pandas.DataFrame
        One row per successfully parsed entry with the columns ``link``,
        ``updated_ts``, ``published_ts``, ``title``, ``summary``, ``author``
        (comma-separated names) and ``category`` (primary category term).
        Empty, but with the same columns, when the feed has no entries.
    parsed_xml : dict
        The full response as parsed by ``xmltodict``.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.

    Notes
    -----
    Entries that lack an expected field are reported with ``print`` and
    skipped as a whole, so the columns of the result always stay aligned.
    """
    url = f"http://export.arxiv.org/api/query?search_query=cat:cs.AI+OR+cat:cs.CV+OR+cat:cs.LG+OR+cat:cs.MA+OR+cat:cs.NE+OR+cat:cs.RO+OR+cat:cs.SD&start=0&max_results={num_articles}"

    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of feeding an error page to the parser.
    response.raise_for_status()
    parsed_xml = xmltodict.parse(response.text)

    feed = parsed_xml.get('feed') or {}
    entries = feed.get('entry') or []
    # A feed with exactly one <entry> is parsed as a mapping rather than a list.
    if not isinstance(entries, list):
        entries = [entries]

    records = []
    for article in entries:
        # Build the whole record before storing it: a malformed entry is then
        # dropped atomically instead of leaving half-filled columns behind.
        try:
            records.append(_arxiv_entry_to_record(article))
        except (KeyError, TypeError) as e:
            print(f"Exception: {e}")

    articles_df = pd.DataFrame(records, columns=list(_ARXIV_ARTICLE_COLUMNS))

    return articles_df, parsed_xml

In [205]:
%%time
# Fetch up to 1000 articles in a single request; keep both the tabular view
# and the raw parsed feed for inspection in the cells below.
articles_df, parsed_xml = retrieve_arxiv_articles_df(num_articles=1000)

CPU times: user 157 ms, sys: 10.8 ms, total: 168 ms
Wall time: 6.69 s


In [200]:
# Inspect the raw parsed feed (nested dict) to see the structure the parser relies on.
parsed_xml

{'feed': {'@xmlns': 'http://www.w3.org/2005/Atom',
  'link': {'@href': 'http://arxiv.org/api/query?search_query%3Dcat%3Acs.AI%20OR%20cat%3Acs.CV%20OR%20cat%3Acs.LG%20OR%20cat%3Acs.MA%20OR%20cat%3Acs.NE%20OR%20cat%3Acs.RO%20OR%20cat%3Acs.SD%26id_list%3D%26start%3D0%26max_results%3D1000',
   '@rel': 'self',
   '@type': 'application/atom+xml'},
  'title': {'@type': 'html',
   '#text': 'ArXiv Query: search_query=cat:cs.AI OR cat:cs.CV OR cat:cs.LG OR cat:cs.MA OR cat:cs.NE OR cat:cs.RO OR cat:cs.SD&id_list=&start=0&max_results=1000'},
  'id': 'http://arxiv.org/api/YrhbSbUjZDECGfakQABax52OiAA',
  'updated': '2023-01-29T00:00:00-05:00',
  'opensearch:totalResults': {'@xmlns:opensearch': 'http://a9.com/-/spec/opensearch/1.1/',
   '#text': '246955'},
  'opensearch:startIndex': {'@xmlns:opensearch': 'http://a9.com/-/spec/opensearch/1.1/',
   '#text': '0'},
  'opensearch:itemsPerPage': {'@xmlns:opensearch': 'http://a9.com/-/spec/opensearch/1.1/',
   '#text': '1000'},
  'entry': [{'id': 'http://a

In [203]:
# Preview the tabulated articles (one row per parsed entry).
articles_df

Unnamed: 0,link,updated_ts,published_ts,title,summary,author,category
0,http://arxiv.org/abs/1709.06620v1,2017-09-19T19:26:20Z,2017-09-19T19:26:20Z,Learning of Coordination Policies for Robotic ...,"Inspired by biological swarms, robotic swarms ...","Qiyang Li, Xintong Du, Yizhou Huang, Quinlan S...",cs.RO
1,http://arxiv.org/abs/2011.05605v2,2020-11-20T18:19:32Z,2020-11-11T07:35:21Z,Decentralized Motion Planning for Multi-Robot ...,This work presents a decentralized motion plan...,"Sivanathan Kandhasamy, Vinayagam Babu Kuppusam...",cs.RO
2,http://arxiv.org/abs/2209.14745v2,2022-12-29T08:48:05Z,2022-09-29T13:02:58Z,A Multiagent Framework for the Asynchronous an...,The traditional ML development methodology doe...,Andrea Gesmundo,cs.LG
3,http://arxiv.org/abs/2011.02608v1,2020-11-05T01:47:23Z,2020-11-05T01:47:23Z,Learning a Decentralized Multi-arm Motion Planner,We present a closed-loop multi-arm motion plan...,"Huy Ha, Jingxi Xu, Shuran Song",cs.RO
4,http://arxiv.org/abs/2012.05894v1,2020-12-10T18:55:51Z,2020-12-10T18:55:51Z,AutoSelect: Automatic and Dynamic Detection Se...,3D multi-object tracking is an important compo...,"Xinshuo Weng, Kris Kitani",cs.CV
...,...,...,...,...,...,...,...
995,http://arxiv.org/abs/1409.2399v1,2014-09-08T15:38:31Z,2014-09-08T15:38:31Z,Prioritized Planning Algorithms for Trajectory...,An important capability of autonomous multi-ro...,"Michal Čáp, Peter Novák, Alexander Kleiner, Ma...",cs.RO
996,http://arxiv.org/abs/1502.06030v1,2015-02-20T22:56:00Z,2015-02-20T22:56:00Z,Decentralized Control of Partially Observable ...,The focus of this paper is on solving multi-ro...,"Shayegan Omidshafiei, Ali-akbar Agha-mohammadi...",cs.MA
997,http://arxiv.org/abs/1503.00237v1,2015-03-01T08:33:28Z,2015-03-01T08:33:28Z,Task Allocation in Robotic Swarms: Explicit Co...,In this paper we study multi robot cooperative...,"Aryo Jamshidpey, Mohsen Afsharchi",cs.MA
998,http://arxiv.org/abs/1604.05942v1,2016-04-20T13:12:45Z,2016-04-20T13:12:45Z,Multiplayer Games for Learning Multirobot Coor...,Humans have an impressive ability to solve com...,"Arash Tavakoli, Haig Nalbandian, Nora Ayanian",cs.RO
