# Dev: Papers IDs collector


### References

- [arXiv.org](https://arxiv.org/)
- [arxiv.py (Python wrapper for the arXiv API) - PyPI page](https://pypi.org/project/arxiv/)
- [arxiv.py - API documentation](http://lukasschwab.me/arxiv.py/index.html)

In [61]:
import pandas as pd
import numpy as np
from datetime import datetime
import arxiv
import warnings
warnings.filterwarnings('ignore')

## arguments

In [62]:
# start of the date window used to build the list of months
month_i = datetime(year=2021, month=8, day=1)
# end of the date window used to build the list of months
month_f = datetime(year=2021, month=9, day=1)
# category codes a paper must carry (at least one of them) to be kept
required_categories = [
    'math.ST',
    'stat.ME',
    'stat.AP',
    'stat.CO',
    'cs.LG',
    'stat.ML',
    'cs.AI',
]

## functions

In [None]:
def get_ids(month_id:str):
    """Fetch the arXiv entry for a single identifier.

    Args:
        month_id: identifier passed to the search as its only id,
            e.g. '2108.00002'.

    Returns:
        The first result yielded by the search, or None when the search
        yields nothing or the lookup raises an error.
    """
    try:
        # the None default covers an empty result iterator explicitly
        return next(arxiv.Search(id_list=[month_id]).results(), None)
    except Exception:
        # best-effort lookup: a failed request counts as "no paper", while
        # KeyboardInterrupt / SystemExit still propagate so that a long
        # collection run can be stopped by the user
        return None

## IDs list to be parsed

In [107]:
# list of months to be collected, as 'YYMM' prefixes; the year is taken from
# each date (not from month_i) so ranges crossing a year boundary stay correct
months = [dt.strftime('%y%m') for dt in pd.date_range(start=month_i, end=month_f, freq='M')]
print(f'[info] Months to be collected = {len(months)}')
# list of ids to be parsed: the first 300 zero-padded 5-digit sequence numbers
ids = [f'{i:05d}' for i in range(300)]
print(f'[info] IDs to be collected per month = {len(ids)}')
# final list of month/ids to be parsed, flattened month by month as 'YYMM.NNNNN'
months_ids = [f'{m}.{i}' for m in months for i in ids]
print(f'[info] Total number of Ids to be collected= {len(months_ids)}')

[info] Months to be collected = 1
[info] IDs to be collected per month = 300
[info] Total number of Ids to be collected= 300


## collect required IDs

In [108]:
# look up every candidate identifier; failed lookups come back as None
fetched = [get_ids(candidate) for candidate in months_ids]
# keep (short id, publication timestamp) for papers sharing at least one
# category with required_categories
data = [
    (paper.get_short_id(), paper.published)
    for paper in fetched
    if paper is not None
    and not set(required_categories).isdisjoint(paper.categories)
]
# NOTE(review): the column name 'publised_dt' is misspelled; left unchanged
# because other code may select the column by this exact name
dfdata = pd.DataFrame(data, columns=['short_id', 'publised_dt'])
dfdata.head()

Unnamed: 0,short_id,publised_dt
0,2108.00002v1,2021-07-29 18:45:10+00:00
1,2108.00003v1,2021-07-29 23:30:02+00:00
2,2108.00037v1,2021-07-30 18:43:14+00:00
3,2108.00043v1,2021-07-30 19:02:32+00:00
4,2108.00045v1,2021-07-30 19:08:44+00:00


## parallelize
https://stackoverflow.com/questions/9786102/how-do-i-parallelize-a-simple-python-loop