## arXiv OAI

In [1]:
import requests

from bs4 import BeautifulSoup
#We will use the lxml library for BeautifulSoup so you'll need that to run this. 

from pathlib import Path

import time
import datetime



## Utility Functions

In [2]:
#The arXiv OAI API has some timeouts that can happen;
#they want us to wait 10 seconds when this happens,
#so we retry after `wait_time` seconds (copy_oai below defaults it to 10).

def timed_request(url, params, wait_time, max_tries):
    """GET `url` with `params`, retrying until a 200 response or `max_tries` attempts.

    Parameters
    ----------
    url : str
        Endpoint to query.
    params : dict
        Query-string parameters handed to `requests.get`.
    wait_time : int or float
        Seconds to sleep between failed attempts.
    max_tries : int
        Maximum number of attempts to make.

    Returns
    -------
    BeautifulSoup or None
        The response text parsed with the 'lxml-xml' parser when an attempt
        returns HTTP 200, otherwise None once every attempt has failed.
    """
    attempts = 0
    while attempts < max_tries:
        try:
            response = requests.get(url=url, params=params)
        except requests.RequestException:
            # A dropped connection counts as a failed attempt instead of
            # aborting the whole harvest with a traceback.
            response = None

        if response is not None and response.status_code == 200:
            return BeautifulSoup(response.text, 'lxml-xml')

        attempts += 1
        # Only wait when another attempt will actually follow.
        if attempts < max_tries:
            time.sleep(wait_time)

    return None


In [3]:
#saving the XML obtained from the API to the disk

def save_request(req, number, directory):
    """Write one API response to `{directory}/{number}.xml`.

    Parameters
    ----------
    req : object
        The response to store; it is converted with `str()` before writing.
    number : int
        Sequence number used as the file name.
    directory : str
        Existing directory that receives the file.

    Returns
    -------
    int
        `number + 1`, i.e. the sequence number for the next file.
    """
    xml_text = str(req)

    # The metadata contains non-ASCII text, so pin the encoding rather than
    # relying on the platform default (which is not UTF-8 everywhere).
    with open(f'{directory}/{number}.xml', 'w', encoding='utf-8') as file:
        file.write(xml_text)

    return number + 1
    

In [4]:
#we need to pass slightly different parameters when 
#making the first request, namely we have to pick the format 
#to return the data, this is the metadata_prefix

def first_request(base_url, verb, metadata_prefix, wait_time, max_tries):
    """Issue the opening request of a harvest.

    The first call carries the `metadataPrefix` argument (the format the
    records should come back in) alongside the verb. The result is whatever
    `timed_request` returns for those parameters.
    """
    opening_params = dict(verb=verb, metadataPrefix=metadata_prefix)
    return timed_request(
        url=base_url,
        params=opening_params,
        wait_time=wait_time,
        max_tries=max_tries,
    )
    

In [5]:
#a simple utility to write messages to a file
#mainly used here to check for resumption tokens

def log(message, log_file):
    """Append `message`, exactly as given, to the file at `log_file`."""
    handle = open(log_file, 'a')
    try:
        handle.write(message)
    finally:
        # Always release the file, even if the write fails.
        handle.close()

## Main Request Function

The function `copy_oai` will handle all of our requests automatically and save our XML files in the `save_directory` we pass it (below we use `../../data/xml/initial_harvest_YYYY_MM_DD/`).

In [6]:
def _token_text(token):
    """Return a resumption token as a plain string ('' when absent or empty)."""
    if token is None:
        return ''
    if not isinstance(token, str):
        # A parsed tag is truthy even when it is empty, so decide on its
        # text rather than on the tag object itself.
        token = token.text
    return token.strip()


def copy_oai(base_url, verb, metadata_prefix=None, wait_time=10, max_tries=5,
             log_directory='../../data/oai_logs', save_directory='../../data/xml',
             resumption_token=None, request_num = 0):
    """Harvest every page of an OAI list request and save each page as XML.

    Without a `resumption_token` the harvest starts from the beginning using
    `metadata_prefix`; with one, it resumes from that token. Each response is
    saved as `{save_directory}/{request_num}.xml` (the counter increases by
    one per saved page) and progress is appended to a timestamped file in
    `log_directory`.

    Parameters
    ----------
    base_url : str
        OAI endpoint.
    verb : str
        OAI verb sent with every request.
    metadata_prefix : str, optional
        Record format for the opening request; unused when resuming.
    wait_time : int or float
        Seconds to pause before each follow-up request and between retries.
    max_tries : int
        Attempts per request before it is treated as failed.
    log_directory, save_directory : str
        Output folders; created (with parents) when missing.
    resumption_token : str, optional
        Token to resume from, as found in a previous run's log.
    request_num : int
        File number to use for the next saved response.

    Returns
    -------
    None
        The function returns once a response carries no (or an empty)
        resumption token, or as soon as a request fails after all retries;
        the failure is recorded in the log so the harvest can be resumed.
    """
    # Neither folder is guaranteed to exist on a fresh checkout, and logging
    # starts immediately, so create both up front.
    Path(log_directory).mkdir(parents=True, exist_ok=True)
    Path(save_directory).mkdir(parents=True, exist_ok=True)

    #setting up a log file
    log_name = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    log_file = f"{log_directory}/{log_name}.log"

    token = _token_text(resumption_token)

    #make the initial request assuming we aren't just using
    #a resumption token
    if not token:
        log('Making first request without resumption token\n', log_file)

        first_get = first_request(base_url=base_url, verb=verb, metadata_prefix=metadata_prefix,
                                  wait_time=wait_time, max_tries=max_tries)

        if first_get is None:
            log('First request failed, bailing out.\n', log_file)
            return

        log('First request SUCCESSFUL, using resumption tokens going forward.\n'
            f'Saving current object at {request_num}.xml\n', log_file)
        request_num = save_request(first_get, request_num, save_directory)

        token = _token_text(first_get.find('resumptionToken'))
        if not token:
            log('No resumption token from first request, exiting.\n', log_file)
            return

    #keep following resumption tokens; the list is complete once a
    #response has no token or an empty one
    while token:
        log_time = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        log(f'Time: {log_time}, Resumption Token: {token}\n', log_file)

        time.sleep(wait_time)
        next_request = timed_request(url=base_url,
                                     params={'verb': verb, 'resumptionToken': token},
                                     wait_time=wait_time, max_tries=max_tries)

        if next_request is None:
            # Stop cleanly: the token just logged is the one to resume from.
            log(f'Request FAILED using Resumption Token {token}\n', log_file)
            return

        log(f'Request SUCCESSFUL using Resumption Token {token}\n'
            f'Saving current object at {request_num}.xml\n', log_file)
        request_num = save_request(next_request, request_num, save_directory)

        token = _token_text(next_request.find('resumptionToken'))


I set this up so that if this process broke, I could look at the log file to find the `resumption_token` and `request_num` needed to resume the download without messing with the already downloaded files. None of this is hard coded, so if you run this notebook, and none of the dependencies have changed in any meaningful way, you should end up with a complete copy of the arXiv metadata in the form of XML files.

In [None]:
# Harvest settings: endpoint, OAI verb and the record format to request.
base_url = 'https://export.arxiv.org/oai2'
verb = 'ListRecords'
metadata_prefix = 'arXiv'

# Stamp the output folder with today's date (YYYY_MM_DD).
today = datetime.date.today().strftime('%Y_%m_%d')
save_directory = '../../data/xml/initial_harvest_' + today

copy_oai(
    base_url=base_url,
    verb=verb,
    metadata_prefix=metadata_prefix,
    save_directory=save_directory,
)

At this point, if you find that the above process halted, you can look at the file `../../data/oai_logs/YYYY_MM_DD_HH_MM_SS.log` (year, month, day, hour, minute, second, each zero-padded) to find the `resumption_token` that failed and the `request_num` it would have been saved as. These can then be passed back to `copy_oai` to resume from that point rather than from the beginning! If you resume on a later day, point `save_directory` at the original harvest folder so all of the files end up together.

In [None]:
# request_num = 
# resumption_token = 

# base_url = 'https://export.arxiv.org/oai2'
# verb = 'ListRecords'



# today = str(datetime.date.today()).replace('-','_')
# save_directory = f'../../data/xml/initial_harvest_{today}'



# copy_oai(base_url=base_url, verb=verb, save_directory=save_directory,
#          resumption_token=resumption_token, request_num=request_num)