In [1]:
# one-time setup: uncomment the next line to install / upgrade the arxiv client library
#!pip install arxiv --upgrade --user

In [1]:
from pathlib import Path

import arxiv
import pandas as pd

In [2]:
# i only use these if I want to remove annoying deprecation warnings from my analysis
import warnings
warnings.filterwarnings('ignore')

In [3]:
# troubleshooting: https://github.com/lukasschwab/arxiv.py/issues/43

def search_arxiv(query, max_results=10):
    """Search arXiv and return the matching papers as a DataFrame.

    Parameters
    ----------
    query : str
        Search expression, passed unchanged to ``arxiv.Search``.
    max_results : int, default 10
        Upper bound on the number of results requested, passed unchanged to
        ``arxiv.Search``.

    Returns
    -------
    pandas.DataFrame
        One row per retrieved paper, sorted newest first, with the columns
        ``date_published``, ``title``, ``authors`` (list of author names),
        ``summary``, ``url`` (the result's ``pdf_url``) and ``category``
        (the result's ``primary_category``). When nothing was retrieved the
        frame is empty but still has these columns.

    Notes
    -----
    This is a best-effort download: errors are reported with ``print``
    instead of being raised. If the result stream fails part-way through,
    the papers fetched up to that point are still returned. A result whose
    fields cannot be read is skipped.
    """
    # there are more fields that can be added; add as many as you need
    columns = ['date_published', 'title', 'authors', 'summary', 'url', 'category']

    client = arxiv.Client()
    search = arxiv.Search(query=query, max_results=max_results)

    records = []

    # The outer try guards the paging itself: client.results() is consumed lazily, so a
    # failure while fetching a later page surfaces in the ``for`` statement, not in its body.
    try:
        for result in client.results(search):
            try:
                records.append({
                    'title': result.title,
                    'date_published': result.published,
                    'authors': [a.name for a in result.authors],
                    'summary': result.summary,
                    'url': result.pdf_url,
                    'category': result.primary_category,
                })
            except Exception as err:
                # a single unreadable entry should not abort the whole download
                print('weird arxiv error:', repr(err))
    except Exception as err:
        print('arxiv search stopped early after', len(records), 'results:', repr(err))

    # Passing ``columns`` keeps the schema stable even when ``records`` is empty.
    df = pd.DataFrame(records, columns=columns)
    df['date_published'] = pd.to_datetime(df['date_published'])
    df = df.sort_values('date_published', ascending=False).reset_index(drop=True)
    return df

In [4]:
# the download sometimes hits a weird error and crashes; supposedly that is on arxiv's end, not the python library's
# workaround: reduce max_results. 100 goes very fast, 1000 goes slower, 10000 takes a few minutes, more than that is luck

query = 'OSINT'
max_results = 30000

df = search_arxiv(query=query, max_results=max_results)
df.head()

Unnamed: 0,date_published,title,authors,summary,url,category
0,2023-10-03 11:42:29+00:00,Online Multimedia Verification with Computatio...,"[Sohail Ahmed Khan, Jan Gunnar Furuly, Henrik ...",This paper investigates the use of computation...,http://arxiv.org/pdf/2310.01978v1,cs.MM
1,2023-09-06 19:03:49+00:00,C-CLIP: Contrastive Image-Text Encoders to Clo...,"[William Theisen, Walter Scheirer]",The interplay between the image and comment on...,http://arxiv.org/pdf/2309.03921v1,cs.CV
2,2023-07-27 23:03:57+00:00,A Secure Open-Source Intelligence Framework Fo...,"[Sylvia Worlali Azumah, Victor Adewopo, Zag El...",Cyberbullying has become a pervasive issue bas...,http://arxiv.org/pdf/2307.15225v2,cs.CY
3,2023-06-09 18:18:58+00:00,The Use of Public Data and Free Tools in Natio...,"[Sharifah Roziah Binti Mohd Kassim, Shujun Li,...","Many CSIRTs, including national CSIRTs, routin...",http://arxiv.org/pdf/2306.07988v1,cs.DL
4,2023-04-24 09:53:33+00:00,ThreatCrawl: A BERT-based Focused Crawler for ...,"[Philipp Kuehn, Mike Schmidt, Markus Bayer, Ch...",Publicly available information contains valuab...,http://arxiv.org/pdf/2304.11960v2,cs.CR


In [5]:
# size of the result as (rows, columns); search_arxiv returns one row per retrieved paper
df.shape

(16, 6)

In [6]:
# save the search results as CSV (index=False: the positional index is not written)
outfile = 'data/arxiv_osint.csv'

# to_csv does not create missing folders, so make sure the target directory exists first
Path(outfile).parent.mkdir(parents=True, exist_ok=True)

df.to_csv(outfile, index=False)

In [None]:
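# TODO: the arXiv download occasionally crashes at random (supposedly a problem on arXiv's end);
# a reliable fix is still needed -- suggestions and contributions are welcome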