In [1]:
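# Uncomment to install/upgrade the arxiv client used below (%pip targets this kernel's environment):
# %pip install arxiv --upgrade --user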

In [2]:
import os

import arxiv
import pandas as pd

In [3]:
# Optional: silence warnings so the notebook output stays readable.
# Note: filterwarnings('ignore') is called without a category, so from this point on
# it hides ALL warnings, not only deprecation warnings -- skip this cell when debugging.
import warnings
warnings.filterwarnings('ignore')

In [4]:
# troubleshooting: https://github.com/lukasschwab/arxiv.py/issues/43

def search_arxiv(query, max_results=10):
    """Search arXiv and return the matching papers as a DataFrame.

    Parameters
    ----------
    query : str
        Search string passed to ``arxiv.Search``.
    max_results : int, default 10
        Upper bound on the number of results requested from arXiv.

    Returns
    -------
    pandas.DataFrame
        One row per paper with the columns ``date_published``, ``title``,
        ``authors`` (list of author names), ``summary``, ``url`` (PDF link)
        and ``category`` (primary category), sorted newest first with a
        fresh 0..n-1 index. If the query matches nothing, an empty frame
        with the same columns is returned.

    Notes
    -----
    Results are fetched lazily while iterating, so a failure can happen in
    the middle of a large download. When that occurs the error is printed
    and the rows collected so far are returned instead of being thrown away.
    """
    columns = ['date_published', 'title', 'authors', 'summary', 'url', 'category']

    client = arxiv.Client()
    search = arxiv.Search(query=query, max_results=max_results)

    records = []

    try:
        # client.results() is consumed lazily, so errors raised while fetching
        # surface here in the loop header, not inside the loop body.
        for result in client.results(search):
            try:
                records.append({
                    'title': result.title,
                    'date_published': result.published,
                    'authors': [a.name for a in result.authors],
                    'summary': result.summary,
                    'url': result.pdf_url,
                    'category': result.primary_category,
                    # there are more fields that can be added; add as many as you need
                })
            except Exception as err:
                # A single malformed entry should not leave a half-filled row behind.
                print(f'weird arxiv error: skipping one result ({err!r})')
    except Exception as err:
        # Keep what was already downloaded rather than losing the whole batch.
        print(f'weird arxiv error after {len(records)} results, returning partial data: {err!r}')

    # Passing columns explicitly keeps the schema stable even when records is empty.
    df = pd.DataFrame(records, columns=columns)
    df['date_published'] = pd.to_datetime(df['date_published'])
    df = df.sort_values('date_published', ascending=False).reset_index(drop=True)
    return df

In [15]:
# Large requests sometimes hit an error that is reportedly on arXiv's side rather than
# in the Python library. If that happens, lower max_results: 100 is very fast, 1000 is
# slower, 10000 takes a few minutes, and anything beyond that is a matter of luck.

query = 'artificial life'
max_results = 2000

df = search_arxiv(query=query, max_results=max_results)
df.head()

Unnamed: 0,date_published,title,authors,summary,url,category
0,2024-03-21 10:28:18+00:00,Multi-role Consensus through LLMs Discussions ...,"[Zhenyu Mao, Jialong Li, Munan Li, Kenji Tei]",Recent advancements in large language models (...,http://arxiv.org/pdf/2403.14274v1,cs.SE
1,2024-03-21 07:42:07+00:00,An Agnostic Biosignature Based on Modeling Pan...,"[Harrison B. Smith, Lana Sinapayen]",A fundamental goal of astrobiology is to detec...,http://arxiv.org/pdf/2403.14195v1,astro-ph.EP
2,2024-03-19 14:26:52+00:00,On Equivalence of Likelihood-Based Confidence ...,"[Peng Liu, Yili Hong, Luis A. Escobar, William...",Fatigue data arise in many research and applie...,http://arxiv.org/pdf/2403.12757v1,stat.ME
3,2024-03-19 02:25:29+00:00,Characteristic AI Agents via Large Language Mo...,"[Xi Wang, Hongliang Dai, Shen Gao, Piji Li]",The advancement of Large Language Models (LLMs...,http://arxiv.org/pdf/2403.12368v1,cs.CL
4,2024-03-15 16:20:51+00:00,Data Ethics Emergency Drill: A Toolbox for Dis...,"[Vanessa Aisyahsari Hanschke, Dylan Rees, Merv...",Researchers urge technology practitioners such...,http://arxiv.org/pdf/2403.10438v1,cs.HC


In [16]:
# (rows, columns) of the fetched results
df.shape

(2000, 6)

In [17]:
outfile = 'data/arxiv_artificial_life.csv'

# to_csv does not create missing folders, so make sure the target directory exists
# first; otherwise this cell fails on a fresh checkout that has no data/ folder.
out_dir = os.path.dirname(outfile)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# index=False: the index is only a running row number and carries no information.
# Note: the list-valued 'authors' column is written as its string representation.
df.to_csv(outfile, index=False)

In [None]:
# TODO: large arXiv queries occasionally fail at random partway through the download;
# a proper fix is still needed -- suggestions and contributions are welcome.