# Dev: Papers IDs collector


### References

- [arXiv.org](https://arxiv.org/)
- [arXiv API-Homepage](https://pypi.org/project/arxiv/)
- [arXiv API-Documentation](http://lukasschwab.me/arxiv.py/index.html)

In [9]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime, date
import warnings
warnings.filterwarnings('ignore')

## arguments

In [21]:
# arXiv category codes a paper must match (at least one) to be collected
required_categories = [
    'math.ST',
    'stat.ME',
    'stat.AP',
    'stat.CO',
    'cs.LG',
    'stat.ML',
    'cs.AI',
]
# name of the output folder
folder_output = 'datasets'

In [28]:
# url: Computer Science (cs)
# advanced-search query, written one query parameter per fragment; the
# adjacent string literals are concatenated into a single url
url_cs = (
    'https://arxiv.org/search/advanced?advanced='
    '&terms-0-operator=AND'
    '&terms-0-term='
    '&terms-0-field=title'
    '&classification-computer_science=y'
    '&classification-physics_archives=all'
    '&classification-include_cross_list=include'
    '&date-filter_by=all_dates'
    '&date-year='
    '&date-from_date='
    '&date-to_date='
    '&date-date_type=submitted_date'
    '&abstracts=show'
    '&size=200'
    '&order=-announced_date_first'
    '&start=0'
)

## functions

In [111]:
# tell whether a paper's categories overlap the required ones
def is_category(cat_paper:list, cat_required: list):
    """Return True when both category lists share at least one element.

    Args:
        cat_paper: categories attached to a paper.
        cat_required: categories of interest.

    Returns:
        bool: True if the two lists have a common element, False otherwise.
    """
    paper_set = set(cat_paper)
    required_set = set(cat_required)
    shared = paper_set.intersection(required_set)
    return bool(shared)


## parse papers information from an advanced query search page
def parser_page(ulr:str, verbose:bool = False)->pd.DataFrame:
    """Download one arXiv advanced-search result page and parse its papers.

    Only results sharing at least one category with the module-level
    ``required_categories`` list are kept.

    Args:
        ulr: address of the search result page. The misspelled parameter
            name is kept so existing keyword callers keep working.
        verbose: when True, print every parsed field and progress messages.

    Returns:
        DataFrame with the columns ``paper_id``, ``categories``,
        ``submission_date``, ``title`` and ``abstract``. It is empty when the
        page cannot be downloaded or when no result matches.
    """
    # use the argument itself; the original body read a global named `url`
    url = ulr
    col_x = ['paper_id', 'categories', 'submission_date', 'title', 'abstract']

    # download html content
    try:
        # get request
        reqs = requests.get(url)
        # get html page
        soup = BeautifulSoup(reqs.text, 'lxml')
    except Exception as e:
        if verbose:
            print(f'[error] It was not possible download the html content of this url: "{url}"')
        # always give up here: without the page there is nothing to parse
        # (previously this return was only reached when verbose was True)
        return pd.DataFrame(columns = col_x)

    records = list()

    # loop of results
    for tag in soup.find_all("li", {"class": "arxiv-result"}):

        # parse categories: one per text line; blank / whitespace-only lines
        # are dropped so no empty category is recorded
        tag_d = tag.find_all("div", {"class": "tags is-inline-block"})[0]
        categories = [it.strip() for it in tag_d.text.split('\n') if it.strip()]
        if verbose:
            print("\n{0}: {1}".format(tag_d.name, categories))

        # only continue if is a required paper by category
        if not is_category(categories, required_categories):
            if verbose:
                print("discarted because is not required.")
            continue

        # parse paper id: text of the first link, without the "arXiv:" prefix
        tag_a = tag.find_all("a")[0]
        paper_id = tag_a.text.replace('arXiv:', '').strip()
        if verbose:
            print("{0}: {1}".format(tag_a.name, paper_id))

        # the fields below are read by position among the <p> tags of a result
        paragraphs = tag.find_all("p")

        # parse submission date: text before the first ';' of the 5th <p>,
        # e.g. "Submitted 16 September, 2021"
        tag_p = paragraphs[4]
        sdate = tag_p.text.split(';')[0].replace('Submitted', '').strip()
        submission_date = datetime.strptime(sdate, '%d %B, %Y').date()
        if verbose:
            print("{0}: {1}".format(tag_p.name, submission_date))

        # parse title: 2nd <p> of the result
        tag_p = paragraphs[1]
        title = tag_p.text.replace('\n','').strip()
        if verbose:
            print("{0}: {1}".format(tag_p.name, title))

        # parse abstract: first child node of the full-abstract span
        tag_s = tag.find_all("span", {"class": "abstract-full has-text-grey-dark mathjax"})[0]
        abstract = tag_s.contents[0].replace('\n','').strip()
        if verbose:
            print("{0}: {1}".format(tag_s.name, abstract))

        records.append([paper_id, categories, submission_date, title, abstract])

    # store in a df
    df = pd.DataFrame(records, columns = col_x)
    if verbose:
        print(f'\nFinally was parsed {len(df)} papers.')

    return df


## get size of page
def get_page_size(url:str)->int:
    """Read the value of the ``size`` query parameter of a search url.

    The parameter is matched by its exact name, so other parameters that
    merely contain the text "size" are ignored, and it is found even when it
    is the first parameter after the '?'.

    Args:
        url: search url carrying a ``size=<int>`` parameter.

    Returns:
        The page size as an int, or None (after printing an error message)
        when the parameter is missing or is not an integer.
    """
    if isinstance(url, str):
        # treat '?' like '&' so the first parameter after the path is seen too
        for param in url.replace('?', '&').split('&'):
            name, sep, value = param.partition('=')
            if name == 'size' and sep:
                try:
                    return int(value)
                except ValueError:
                    # present but not numeric: report it as unavailable
                    break
    print('[error] It is not available "size" tag in this url.')
    return None

    
## get next page
def next_paginate(url:str, size:int)->str:
    """Build the url of the next result page by advancing ``start`` by ``size``.

    Only the query parameter named exactly ``start`` is rewritten; the rest
    of the url is left untouched.

    Args:
        url: search url carrying a ``start=<int>`` parameter.
        size: number of results per page, added to the current offset.

    Returns:
        The url with the updated ``start`` value, or None (after printing an
        error message) when ``start`` is missing or not an integer, or when
        ``size`` cannot be added to it.
    """
    if isinstance(url, str):
        base, sep, query = url.partition('?')
        if not sep:
            # no '?': treat the whole text as a bare query string
            base, query = '', url
        params = query.split('&')
        prefix = 'start='
        for pos, param in enumerate(params):
            if not param.startswith(prefix):
                continue
            try:
                params[pos] = f'{prefix}{int(param[len(prefix):]) + size}'
            except (ValueError, TypeError):
                # non-numeric offset or unusable size (e.g. None)
                break
            return base + sep + '&'.join(params)
    print('[error] It is not available "start" tag in this url.')
    return None

    
## check that a search url contains every text fragment the crawler expects
def is_valid_url(url:str)->bool:
    """Return True when ``url`` contains all the required fragments.

    The check is a plain substring test for each of: ``size``, ``start=0``,
    ``=all``, ``abstracts=show`` and ``include_cross_list=include``.

    Args:
        url: search url to check.

    Returns:
        True if every fragment occurs in ``url``, False otherwise.
    """
    required_fragments = (
        'size',
        'start=0',
        '=all',
        'abstracts=show',
        'include_cross_list=include',
    )
    return all(fragment in url for fragment in required_fragments)

## main

In [112]:
# select the search url to crawl (here: the Computer Science query)
url = url_cs
# report when the url contains every fragment is_valid_url looks for;
# nothing is printed and nothing is raised when the check fails
if is_valid_url(url):
    print('It is a valid url.')

It is a valid url.


In [113]:
# maximum number of pages
max_num_pages = 2
# get page size
size = get_page_size(url)
# initialize
num_page = 1
frames = []        # one parsed DataFrame per downloaded page
num_records = 0    # running total of collected papers
# loop: stop at the page limit or when no next-page url could be built
while num_page <= max_num_pages and url is not None:
    # parse
    idf = parser_page(url, verbose = False)
    # validate: a page without matching papers ends the crawl
    if len(idf) == 0:
        print('Stop loop!')
        break
    # append
    frames.append(idf)
    num_records += len(idf)
    # display
    print(f'--> Page {num_page} - total num records = {num_records}')
    # get next page url (None when it cannot be built)
    url = next_paginate(url, size)
    # add counter
    num_page += 1

# combine all pages in one step; DataFrame.append no longer exists in
# pandas >= 2.0. Each page keeps its own row labels, as append did.
# With no page collected, df is an empty frame rather than undefined.
df = pd.concat(frames) if frames else pd.DataFrame()

--> Page 1 - total num records = 75
--> Page 2 - total num records = 172


In [114]:
# preview the first rows of the collected papers
df.head()

Unnamed: 0,paper_id,categories,submission_date,title,abstract
0,2109.08141,"[cs.CV, cs.AI, cs.LG]",2021-09-16,An End-to-End Transformer Model for 3D Object ...,"We propose 3DETR, an end-to-end Transformer ba..."
1,2109.08139,"[eess.SP, cs.LG, cs.NI, stat.ML]",2021-09-16,Adversarial Attacks against Deep Learning Base...,We consider adversarial machine learning based...
2,2109.08134,"[cs.LG, stat.ML]",2021-09-16,Comparison and Unification of Three Regulariza...,"In batch reinforcement learning, there can be ..."
3,2109.08131,"[cs.HC, cs.CY, cs.LG]",2021-09-16,Studying Up Machine Learning Data: Why Talk Ab...,Research in machine learning (ML) has primaril...
4,2109.08128,"[cs.LG, cs.AI, cs.RO]",2021-09-16,Conservative Data Sharing for Multi-Task Offli...,Offline reinforcement learning (RL) algorithms...
