In [1]:
#!pip install arxiv --upgrade --user

In [1]:
from pathlib import Path

import arxiv
import pandas as pd

In [2]:
# i only use these if I want to remove annoying deprecation warnings from my analysis
import warnings
warnings.filterwarnings('ignore')

In [3]:
# troubleshooting: https://github.com/lukasschwab/arxiv.py/issues/43

def search_arxiv(query, max_results=10):
    """Search arXiv and return the matching papers as a DataFrame.

    Parameters
    ----------
    query : str
        Search string passed to ``arxiv.Search``.
    max_results : int, default 10
        Upper bound on the number of results requested.

    Returns
    -------
    pandas.DataFrame
        One row per paper with the columns ``date_published``, ``title``,
        ``authors`` (list of author names), ``summary``, ``url`` (PDF link)
        and ``category`` (primary category), sorted newest first with a
        fresh 0..n-1 index. When nothing could be fetched, an empty frame
        with the same columns is returned.

    Notes
    -----
    * A result that cannot be read is reported and skipped instead of
      leaving a half-filled row in the output.
    * If the arxiv client raises ``arxiv.ArxivError`` while results are being
      iterated, the papers collected up to that point are returned and a
      message is printed, so a long download is not lost entirely.
    """
    columns = ['date_published', 'title', 'authors', 'summary', 'url', 'category']

    client = arxiv.Client()
    search = arxiv.Search(query=query, max_results=max_results)

    records = []

    # The outer try covers the iteration itself: fetch errors are raised by the
    # result iterator in the ``for`` statement, not by the attribute reads below.
    try:
        for result in client.results(search):
            try:
                records.append({
                    'title': result.title,
                    'date_published': result.published,
                    'authors': [a.name for a in result.authors],
                    'summary': result.summary,
                    'url': result.pdf_url,
                    'category': result.primary_category,
                    # there are more fields that can be added; add as many as you need
                })
            except Exception as err:
                # Exception (not a bare except) so Ctrl-C still interrupts the loop.
                print(f'weird arxiv error, skipping result: {err!r}')
    except arxiv.ArxivError as err:
        print(f'arxiv error after {len(records)} results; returning partial results: {err!r}')

    # Passing ``columns`` keeps the schema stable even when ``records`` is empty.
    df = pd.DataFrame(records, columns=columns)
    df['date_published'] = pd.to_datetime(df['date_published'])
    return df.sort_values('date_published', ascending=False).reset_index(drop=True)

In [10]:
# Large pulls sometimes die with an odd error partway through; reportedly the
# cause is on arXiv's side rather than in the Python client. If that happens,
# lower max_results: 100 is very quick, 1000 is slower, 10000 takes a few
# minutes, and anything beyond that is down to luck.

# Topic to search for (another topic that has been used here: 'Network Science').
query = 'Artificial Life'
max_results = 20000

df = search_arxiv(query, max_results=max_results)
df.head()

Unnamed: 0,date_published,title,authors,summary,url,category
0,2023-11-09 16:53:55+00:00,A Comprehensive Survey of Threshold Digital Si...,"[Kiarash Sedghighadikolaei, Attila Altay Yavuz]",Threshold digital signatures enable a distribu...,http://arxiv.org/pdf/2311.05514v1,cs.CR
1,2023-11-09 09:12:31+00:00,Phosphatidylserine transport in cell life and ...,"[Alenka {Č}opi{č}, Thibaud Dieudonné, Guillaum...",Phosphatidylserine (PS) is a negatively-charge...,http://arxiv.org/pdf/2311.05223v1,q-bio.SC
2,2023-11-09 08:54:46+00:00,Tracking and Following a Suspended Moving Obje...,"[Michele Ambrosino, Manar Mahmalji, Nicolás Bo...",When robots are able to see and respond to the...,http://arxiv.org/pdf/2311.05213v1,cs.RO
3,2023-11-09 08:29:55+00:00,Green Resilience of Cyber-Physical Systems,[Diaeddin Rimawi],Cyber-Physical System (CPS) represents systems...,http://arxiv.org/pdf/2311.05201v1,cs.SE
4,2023-11-09 03:14:54+00:00,Quranic Conversations: Developing a Semantic S...,"[Yasser Shohoud, Maged Shoman, Sarah Abdelazim]",The Holy Book of Quran is believed to be the l...,http://arxiv.org/pdf/2311.05120v1,cs.CL


In [13]:
# Sanity check: (number of rows, number of columns) of the fetched results.
df.shape

(20000, 6)

In [14]:
# Save the results as CSV. DataFrame.to_csv does not create missing parent
# folders, so make sure the output directory exists first.
outfile = 'data/arxiv_artificial_life.csv'
Path(outfile).parent.mkdir(parents=True, exist_ok=True)

df.to_csv(outfile, index=False)

In [None]:
# TODO: the download occasionally crashes at random; a fix is still needed -- help from others is welcome