In [1]:
# Uncomment to install or upgrade the arxiv client library (only needed once per environment):
#!pip install arxiv --upgrade --user

In [2]:
import os

import arxiv
import pandas as pd

In [3]:
# Optional: I only use this when I want to remove annoying deprecation warnings from my analysis output.
import warnings
# NOTE: 'ignore' without a category argument silences every warning, not only deprecation warnings.
warnings.filterwarnings('ignore')

In [4]:
# troubleshooting: https://github.com/lukasschwab/arxiv.py/issues/43

def search_arxiv(query, max_results=10):
    """Search arXiv and return the matching papers as a DataFrame.

    Parameters
    ----------
    query : str
        Search string handed to ``arxiv.Search``.
    max_results : int, default 10
        Maximum number of results to request.

    Returns
    -------
    pandas.DataFrame
        One row per paper with the columns ``date_published``, ``title``,
        ``authors`` (list of author names), ``summary``, ``url`` (PDF link)
        and ``category`` (primary category), sorted newest first with a fresh
        0..n-1 index. When nothing could be fetched, an empty frame with the
        same columns is returned instead of raising ``KeyError``.

    Notes
    -----
    This is best-effort: a result that cannot be converted is skipped, and if
    the request itself fails part-way through, a message is printed and the
    results collected so far are returned. ``KeyboardInterrupt`` is not
    swallowed, so a long download can still be cancelled.
    """
    columns = ['date_published', 'title', 'authors', 'summary', 'url', 'category']

    client = arxiv.Client()
    search = arxiv.Search(query=query, max_results=max_results)

    records = []
    try:
        # client.results() yields lazily, so errors raised while fetching
        # surface from the iteration itself -- the outer handler covers those.
        for result in client.results(search):
            try:
                # there are more fields that can be added; add as many as you need
                records.append({
                    'title': result.title,
                    'date_published': result.published,
                    'authors': [a.name for a in result.authors],
                    'summary': result.summary,
                    'url': result.pdf_url,
                    'category': result.primary_category,
                })
            except Exception as err:
                # Skip the malformed record rather than keeping a half-filled row.
                print(f'weird arxiv error: skipped one result ({err!r})')
    except Exception as err:
        print(f'weird arxiv error: fetch stopped after {len(records)} results ({err!r})')

    # A list of records plus explicit columns keeps the schema stable even
    # when no results were collected.
    df = pd.DataFrame(records, columns=columns)
    df['date_published'] = pd.to_datetime(df['date_published'])
    return df.sort_values('date_published', ascending=False).reset_index(drop=True)

In [12]:
# Large requests sometimes hit a weird error and crash; supposedly that comes from arXiv's side, not the Python library.
# If it happens, lower max_results: 100 is very fast, 1000 is slower, 10000 takes a few minutes, more than that is luck.

query, max_results = 'Network Science', 100

df = search_arxiv(query, max_results)
df.head()

Unnamed: 0,date_published,title,authors,summary,url,category
0,2023-10-16 00:41:13+00:00,Network Analysis of the iNaturalist Citizen Sc...,"[Yu Lu Liu, Thomas Jiralerspong]","In recent years, citizen science has become a ...",http://arxiv.org/pdf/2310.10693v1,cs.SI
1,2023-05-05 18:55:32+00:00,Materials Informatics: An Algorithmic Design Rule,[Bhupesh Bishnoi],"Materials informatics, data-enabled investigat...",http://arxiv.org/pdf/2305.03797v1,cond-mat.mtrl-sci
2,2023-01-12 06:03:57+00:00,A Network Science perspective of Graph Convolu...,"[Mingshan Jia, Bogdan Gabrys, Katarzyna Musial]",The mining and exploitation of graph structura...,http://arxiv.org/pdf/2301.04824v1,cs.SI
3,2022-08-25 08:27:36+00:00,Motif-Based Visual Analysis of Dynamic Networks,"[Eren Cakmak, Johannes Fuchs, Dominik Jäckle, ...",Many data analysis problems rely on dynamic ne...,http://arxiv.org/pdf/2208.11932v1,cs.SI
4,2022-08-20 11:25:54+00:00,From Time Series to Networks in R with the ts2...,[Leonardo N. Ferreira],Network science established itself as a promin...,http://arxiv.org/pdf/2208.09660v1,cs.SI


In [13]:
# Quick sanity check: (number of rows, number of columns) of the results frame.
df.shape

(100, 6)

In [11]:
# Save the results as CSV. to_csv does not create missing folders, so make sure
# the target directory exists first (it may not on a fresh checkout).
outfile = 'data/arxiv_network_science_data.csv'

os.makedirs(os.path.dirname(outfile), exist_ok=True)
df.to_csv(outfile, index=False)

In [None]:
# TODO: the arXiv search occasionally crashes at random, especially with large max_results; a fix is still needed — suggestions are welcome.