# Setup

See:
https://github.com/karpathy/arxiv-sanity-preserver

Package does much we need, 
but we would lack internal understanding,
until we break it down.

In [2]:
import arxiv
import json
import pandas as pd

from os import listdir
from os.path import isfile, join

In [3]:
# Paths
DATA_PATH = '../data/'
OUTPUT_PATH = '../data/raw/'
CREDS_PATH = '../collection/credentials/'

# Basic search
IMO use search below, this seems to artificially truncate

In [None]:
QUERY = "deepfake"
N =  2000

In [None]:
# sort_by : relevance, lastUpdatedDate, or submittedDate
# max_results : large limit

search = arxiv.Search(
  query = QUERY,
  max_results = N, 
  sort_by = arxiv.SortCriterion.SubmittedDate 
)


In [None]:
for key in search.__dict__.keys():
      print(key)

In [None]:
# Quick results check, can still be long to run with many results

for result in search.results():
  print(result.title + '\n')

In [None]:
# get all data, not recommended

for result in search.results():     
      for key, value in result._raw.items(): #__dict__.items(): ### this is usual way, but here dict has raw
            print(key)
            print(value)
            print('\n')

In [None]:
# Save data to 

# overwrite 'w' or append 'a'
action = 'a'

#save as 
save_as = 'arxiv_' + QUERY

for result in search.results():
      with open (OUTPUT_PATH + save_as + '.jsonl', action) as f:
            json.dump(result._raw, f, default=str) # use raw as __dict__ has raw in it, thus more data
            f.write('\n')  

### Client search (for larger searches)

In [None]:
friendly_client = arxiv.Client(
  page_size = 1000, # 2000 max
  delay_seconds = 10, #3 min
  num_retries = 5
)

action = 'a'

#save as 
save_as = 'arxiv_' + QUERY

# Prints 1000 titles before needing to make another request.
for result in friendly_client.results(arxiv.Search(query=QUERY, sort_by = arxiv.SortCriterion.SubmittedDate)):
      with open (OUTPUT_PATH + save_as + '.jsonl', action) as f:
            json.dump(result._raw, f, default=str) # use raw as __dict__ has raw in it, thus more data
            f.write('\n')

# Start here for loading existing data

In [16]:
from datetime import datetime

In [4]:
# Check files in data folder
datafiles = [f for f in listdir(OUTPUT_PATH) if isfile(join(OUTPUT_PATH, f))]
print(datafiles)

['arxiv_GAN.jsonl', 'arxiv_GPT-3.jsonl', 'arxiv_fake news.jsonl', 'arxiv_disinformation.jsonl']


In [21]:
# Load df
load_file = datafiles[1]

df = pd.read_json(OUTPUT_PATH + load_file, convert_dates=True, lines=True, orient='records')

In [22]:
df.columns

Index(['id', 'guidislink', 'link', 'updated', 'updated_parsed', 'published',
       'published_parsed', 'title', 'title_detail', 'summary',
       'summary_detail', 'authors', 'author_detail', 'author', 'arxiv_comment',
       'links', 'arxiv_primary_category', 'tags', 'arxiv_journal_ref',
       'arxiv_doi', 'arxiv_affiliation'],
      dtype='object')

In [23]:
df.tail(2)

Unnamed: 0,id,guidislink,link,updated,updated_parsed,published,published_parsed,title,title_detail,summary,...,authors,author_detail,author,arxiv_comment,links,arxiv_primary_category,tags,arxiv_journal_ref,arxiv_doi,arxiv_affiliation
134,http://arxiv.org/abs/2005.14165v4,True,http://arxiv.org/abs/2005.14165v4,2020-07-22T19:47:17Z,"[2020, 7, 22, 19, 47, 17, 2, 204, 0]",2020-05-28T17:29:03Z,"[2020, 5, 28, 17, 29, 3, 3, 149, 0]",Language Models are Few-Shot Learners,"{'type': 'text/plain', 'language': None, 'base...",Recent work has demonstrated substantial gains...,...,"[{'name': 'Tom B. Brown'}, {'name': 'Benjamin ...",{'name': 'Dario Amodei'},Dario Amodei,40+32 pages,"[{'href': 'http://arxiv.org/abs/2005.14165v4',...","{'term': 'cs.CL', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",,,
135,http://arxiv.org/abs/2002.04013v3,True,http://arxiv.org/abs/2002.04013v3,2020-10-21T16:36:55Z,"[2020, 10, 21, 16, 36, 55, 2, 295, 0]",2020-02-10T18:39:25Z,"[2020, 2, 10, 18, 39, 25, 0, 41, 0]",Towards Crowdsourced Training of Large Neural ...,"{'type': 'text/plain', 'language': None, 'base...",Many recent breakthroughs in deep learning wer...,...,"[{'name': 'Max Ryabinin'}, {'name': 'Anton Gus...",{'name': 'Anton Gusev'},Anton Gusev,Advances in Neural Information Processing Syst...,"[{'href': 'http://arxiv.org/abs/2002.04013v3',...","{'term': 'cs.DC', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.DC', 'scheme': 'http://arxiv.org...",Advances in Neural Information Processing Syst...,,


In [24]:
type(df.published[1])

str

In [25]:
def str_to_datetime(x):
      y = datetime.strptime(x,'%Y-%m-%dT%H:%M:%SZ')
      return y
#strptime

In [27]:
df['published2'] = df['published'].apply(lambda x: str_to_datetime(x))


In [26]:
df.published

0      2022-04-05T03:29:26Z
1      2022-03-29T13:38:03Z
2      2022-03-16T05:56:08Z
3      2022-03-15T11:06:54Z
4      2022-03-07T15:37:35Z
               ...         
131    2020-09-11T18:57:36Z
132    2020-09-07T17:59:25Z
133    2020-08-14T08:23:21Z
134    2020-05-28T17:29:03Z
135    2020-02-10T18:39:25Z
Name: published, Length: 136, dtype: object

In [28]:
df.published2

0     2022-04-05 03:29:26
1     2022-03-29 13:38:03
2     2022-03-16 05:56:08
3     2022-03-15 11:06:54
4     2022-03-07 15:37:35
              ...        
131   2020-09-11 18:57:36
132   2020-09-07 17:59:25
133   2020-08-14 08:23:21
134   2020-05-28 17:29:03
135   2020-02-10 18:39:25
Name: published2, Length: 136, dtype: datetime64[ns]

In [29]:
df.columns

Index(['id', 'guidislink', 'link', 'updated', 'updated_parsed', 'published',
       'published_parsed', 'title', 'title_detail', 'summary',
       'summary_detail', 'authors', 'author_detail', 'author', 'arxiv_comment',
       'links', 'arxiv_primary_category', 'tags', 'arxiv_journal_ref',
       'arxiv_doi', 'arxiv_affiliation', 'published2'],
      dtype='object')

In [31]:
df = df[['title', 'links', 'published2']]


In [32]:
df

Unnamed: 0,title,links,published2
0,Data Augmentation for Intent Classification wi...,"[{'href': 'http://arxiv.org/abs/2204.01959v1',...",2022-04-05 03:29:26
1,Training Compute-Optimal Large Language Models,"[{'href': 'http://arxiv.org/abs/2203.15556v1',...",2022-03-29 13:38:03
2,Thinking about GPT-3 In-Context Learning for B...,"[{'href': 'http://arxiv.org/abs/2203.08410v1',...",2022-03-16 05:56:08
3,The Ghost in the Machine has an American accen...,"[{'href': 'http://arxiv.org/abs/2203.07785v1',...",2022-03-15 11:06:54
4,Tensor Programs V: Tuning Large Neural Network...,"[{'href': 'http://arxiv.org/abs/2203.03466v2',...",2022-03-07 15:37:35
...,...,...,...
131,Unit Test Case Generation with Transformers an...,"[{'href': 'http://arxiv.org/abs/2009.05617v2',...",2020-09-11 18:57:36
132,Measuring Massive Multitask Language Understan...,"[{'href': 'http://arxiv.org/abs/2009.03300v3',...",2020-09-07 17:59:25
133,Language Models as Few-Shot Learner for Task-O...,"[{'href': 'http://arxiv.org/abs/2008.06239v2',...",2020-08-14 08:23:21
134,Language Models are Few-Shot Learners,"[{'href': 'http://arxiv.org/abs/2005.14165v4',...",2020-05-28 17:29:03


In [None]:
new_df = df[df.published >= last_collected]
