# Dev: Paper IDs collector


### References

- [arXiv.org](https://arxiv.org/)
- [arxiv.py on PyPI (Python wrapper for the arXiv API)](https://pypi.org/project/arxiv/)
- [arxiv.py API documentation](http://lukasschwab.me/arxiv.py/index.html)

In [6]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import arxiv
from datetime import datetime
import ray
import warnings
warnings.filterwarnings('ignore')

## arguments

In [33]:
# Collection window: the month codes to collect are derived from these two dates.
month_i = datetime(year=2021, month=8, day=1)
month_f = datetime(year=2021, month=9, day=1)

# A paper is kept only if it is tagged with at least one of these arXiv categories.
required_categories = [
    'math.ST',
    'stat.ME',
    'stat.AP',
    'stat.CO',
    'cs.LG',
    'stat.ML',
    'cs.AI',
]

# Directory used to build the path of the per-month CSV output.
folder_output = 'datasets'

## functions

In [34]:
def get_ids(month_id: str) -> "arxiv.Result | None":
    """Fetch the arXiv entry for a single paper identifier.

    Args:
        month_id: Full arXiv identifier to look up, built by the callers in
            this notebook as ``"<YYMM>.<NNNNN>"`` (e.g. ``"2108.00042"``).

    Returns:
        The first result yielded by ``arxiv.Search`` for that identifier, or
        ``None`` when the search yields nothing or the lookup fails.
    """
    try:
        # ``next(..., None)`` covers the "no such paper" case explicitly
        # instead of relying on a swallowed StopIteration.
        return next(arxiv.Search(id_list=[month_id]).results(), None)
    except Exception:
        # Best-effort lookup: API/network errors are reported as "not found".
        # Unlike a bare ``except:``, this still lets KeyboardInterrupt and
        # SystemExit propagate so a long collection run can be interrupted.
        return None

## IDs list to be parsed

In [35]:
# list of months to be collected, as arXiv "YYMM" codes.
# One code is produced per month-end falling inside [month_i, month_f], so a
# month_f set to the first day of a month excludes that month.
# The year is taken from each date (not from month_i), so windows that cross a
# year boundary get the right prefix. An explicit MonthEnd offset is used
# rather than the 'M' alias, which recent pandas versions deprecate.
months = [
    dt.strftime('%y%m')
    for dt in pd.date_range(start=month_i, end=month_f, freq=pd.offsets.MonthEnd())
]
print(f'[info] Months to be collected = {len(months)}')
# list of ids to be parsed: zero-padded 5-digit sequence numbers.
# Only the first `max_ids` numbers are generated; raise it (up to 100000) to
# scan the whole 00000-99999 range.
max_ids = 100
ids = [f'{i:05d}' for i in range(max_ids)]
print(f'[info] IDs to be collected per month = {len(ids)}')

[info] Months to be collected = 1
[info] IDs to be collected per month = 100


## collect required IDs

In [39]:
# Start Ray once for the whole run instead of once per month.
# `ignore_reinit_error` lets this cell be re-run in a session where Ray is
# already up (e.g. after a previous run was interrupted).
ray.init(ignore_reinit_error=True)


@ray.remote
def get_ids(month_id: str):
    """Ray task: fetch the arXiv entry for ``month_id``, or None on failure."""
    try:
        return next(arxiv.Search(id_list=[month_id]).results(), None)
    except Exception:
        # Best-effort lookup: a missing paper or an API/network error simply
        # yields None so one bad id does not abort the whole month.
        return None


@ray.remote
def return_out(paper):
    """Ray task: reduce a paper to its ``(short_id, published)`` pair."""
    return (paper.get_short_id(), paper.published)


# The category filter is identical for every paper, so build the set once.
wanted_categories = set(required_categories)

try:
    # loop of months (last month of the window first)
    for month in months[::-1]:
        # display
        print(f'--> collecting month "{month}"...')

        # init time counter
        tic = datetime.now()
        # collect papers ids: one Ray task per candidate identifier
        papers = ray.get([get_ids.remote(f'{month}.{i}') for i in ids])
        # keep only papers found and tagged with at least one required category
        outs = [
            return_out.remote(paper)
            for paper in papers
            if paper is not None and not wanted_categories.isdisjoint(paper.categories)
        ]
        # store in df (column spelling 'publised_dt' kept as-is: it is part of
        # the output schema)
        dfdata = pd.DataFrame(ray.get(outs), columns = ['short_id', 'publised_dt'])
        # end time counter and estimate difference; total_seconds() keeps the
        # fractional part and does not wrap for runs longer than a day
        toc = datetime.now()
        tictoc = (toc - tic).total_seconds() / 60. # minutes
        # display
        print(f"[info] Process time spent {tictoc} minutes")
        # save output (writing is currently disabled, so the message below
        # only reports how many rows were collected)
        path_output = os.path.join(folder_output, f'table-id_papers_ia-{month}.csv')
        #dfdata.to_csv(path_output)
        print(f"[info] It was save {dfdata.shape[0]} records.")
finally:
    # shutdown ray even if a month fails, so the next run can start cleanly
    ray.shutdown()

--> collecting month "2108"...
[info] Process time spent 0.45 minutes
[info] It was save 19 records.
