In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [3]:
# Browser-like request headers sent with the HTTP requests in this notebook.
# The User-Agent is assembled from adjacent string literals: a backslash
# continuation *inside* a string literal keeps the next line's leading
# indentation, which would embed runs of spaces in the header value.
headers = {
    "Accept-Language": "en-US,en;q=0.9",
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/98.0.4758.102 Safari/537.36"
    ),
}


In [8]:
# Fetch the arXiv landing page using the browser-like `headers` dict.
# An explicit timeout is passed because requests otherwise waits
# indefinitely for a server that never answers.
# Alternative source tried earlier:
# res = requests.get('https://www.ssrn.com/index.cfm/en/', headers=headers)
res = requests.get('https://arxiv.org/', headers=headers, timeout=30)

In [20]:
import feedparser

In [61]:
# Test case 1: exact-phrase title search, least recently updated first.
# The URL is split into adjacent literals, one query component per line.
f_parse1 = feedparser.parse(
    'http://export.arxiv.org/api/query'
    '?search_query=ti:"electron%20thermal%20conductivity"'
    '&sortBy=lastUpdatedDate'
    '&sortOrder=ascending'
)

In [93]:
# Test case 2: several categories OR-ed together, most recently updated first.
# The URL is split into adjacent literals, one query component per line.
f_parse2 = feedparser.parse(
    'http://export.arxiv.org/api/query'
    '?search_query=cat:cs.*+OR+cat:econ.*+OR+cat:stat.ML'
    '&sortBy=lastUpdatedDate'
    '&sortOrder=descending'
)

In [165]:
# Collect recent arXiv papers for a single category or a set of categories
# Reference for category names: https://arxiv.org/category_taxonomy
import feedparser
import time


def collect_arxiv(cats, limit=50, checkpoint=None, page_size=10, crawl_delay=0):
    """Collect recent arXiv papers by category using feedparser.

    Papers are requested from the arXiv export API sorted by last-updated
    date, newest first, one page of ``page_size`` results at a time, until
    ``limit`` papers have been requested or the ``checkpoint`` paper is seen.
    Pages that feedparser flags as malformed (``bozo``) or that contain no
    entries are skipped silently, so fewer than ``limit`` papers may come back.

    Params
    ----------
    cats : list[str] or str
        Categories in arXiv format e.g. cs.AI for artificial intelligence.
        A single category may be passed as a plain string.
    limit : int, default = 50
        Maximum number of papers to request. It does not have to be a
        multiple of ``page_size``; the final page is shortened to fit.
    checkpoint : str, default = None
        URL of the most recent paper from a previous run of the same query.
        Collection stops as soon as an entry with this URL is encountered;
        that entry itself is not included in the result.
    page_size : int, default = 10
        Number of results requested per API call. Must be at least 1.
    crawl_delay : float, default = 0
        Seconds to sleep between consecutive API calls. The arXiv API
        guidelines ask clients to pause between requests (about three
        seconds), so a non-zero value is advisable for larger crawls.

    Returns
    ----------
    Tuple[list, str] : (List of dicts, checkpoint)
        Each dict describes one paper with the keys 'title', 'summary',
        'author', 'url' and 'category'. The returned checkpoint is the URL of
        the first (newest) paper collected by this call, or the ``checkpoint``
        argument unchanged when nothing new was collected.

    Raises
    ----------
    ValueError
        If ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError('page_size must be at least 1')

    base_url = 'http://export.arxiv.org/api/query?search_query='
    params = '&sortBy=lastUpdatedDate&sortOrder=descending'

    # A bare string would otherwise be joined character by character.
    if isinstance(cats, str):
        cats = [cats]
    cat_concat = '+OR+'.join('cat:' + cat for cat in cats)

    entries = []
    new_checkpoint = checkpoint

    for start in range(0, limit, page_size):
        # Pause between requests only; no delay before the first call.
        if start and crawl_delay:
            time.sleep(crawl_delay)

        # Shorten the last page so that no more than `limit` papers are requested.
        batch = min(page_size, limit - start)
        feed = feedparser.parse(
            f'{base_url}{cat_concat}{params}&start={start}&max_results={batch}'
        )

        # Skip failed or empty retrievals and carry on with the next page.
        if feed.bozo or not feed.entries:
            continue

        for entry in feed.entries:
            # Early stopping condition: everything from here on was already
            # seen in the run that produced `checkpoint`.
            if entry.link == checkpoint:
                return entries, new_checkpoint

            entries.append({
                'title': entry.title_detail.value,
                'summary': entry.summary_detail.value,
                # NOTE(review): a single name only; confirm whether the full
                # author list is needed downstream.
                'author': entry.author,
                'url': entry.link,
                'category': entry.arxiv_primary_category['term'],
            })

            # Advance the checkpoint as soon as the first new paper is stored,
            # so it is also updated when the old checkpoint is hit later on
            # the same page.
            if len(entries) == 1:
                new_checkpoint = entry.link

    return entries, new_checkpoint

In [166]:
# Smoke test for collect_arxiv: ask for 30 papers matching the
# cs.* and econ.* wildcards plus stat.ML. No checkpoint is supplied.
my_cats = ['cs.*', 'econ.*', 'stat.ML']

results = collect_arxiv(cats=my_cats, limit=30)

In [167]:
# results is the (entries, checkpoint) tuple returned by collect_arxiv;
# show the first paper dict from the entries list.
results[0][0]

{'title': 'Are We Really Making Much Progress in Text Classification? A Comparative\n  Review',
 'summary': 'This study reviews and compares methods for single-label and multi-label text\nclassification, categorized into bag-of-words, sequence-based, graph-based, and\nhierarchical methods. The comparison aggregates results from the literature\nover five single-label and seven multi-label datasets and complements them with\nnew experiments. The findings reveal that all recently proposed graph-based and\nhierarchy-based methods fail to outperform pre-trained language models and\nsometimes perform worse than standard machine learning methods like a\nmultilayer perceptron on a bag-of-words. To assess the true scientific progress\nin text classification, future work should thoroughly test against strong\nbag-of-words baselines and state-of-the-art pre-trained language models.',
 'author': 'Ansgar Scherp',
 'url': 'http://arxiv.org/abs/2204.03954v4',
 'category': 'cs.CL'}