In [1]:
import requests
import time
import logging
import datetime
from bs4 import BeautifulSoup

# BeautifulSoup is called with the 'lxml-xml' parser below, so the lxml package must be installed to run this notebook.

In [2]:
def timed_request(url, params, wait_time, max_tries):
    """GET ``url`` and return the parsed XML of the first HTTP 200 response.

    The request is attempted at most ``max_tries`` times, sleeping
    ``wait_time`` seconds between failed attempts.

    Args:
        url: Address to request.
        params: Query parameters passed through to ``requests.get``.
        wait_time: Seconds to sleep after a non-200 response before retrying.
        max_tries: Maximum number of attempts.

    Returns:
        A ``BeautifulSoup`` document parsed with the ``'lxml-xml'`` parser, or
        ``None`` if no attempt returned status 200.
    """
    # The original never advanced its retry counter, so a persistent non-200
    # response looped forever; a bounded range enforces max_tries.
    for attempt in range(1, max_tries + 1):
        response = requests.get(url=url, params=params)

        if response.status_code == 200:
            return BeautifulSoup(response.text, 'lxml-xml')

        # Sleeping after the final attempt would only delay the failure.
        if attempt < max_tries:
            time.sleep(wait_time)

    return None

def log_before(metadata_prefix=None, resumption_token=None):
    """Build the log line describing a request that is about to be made.

    Args:
        metadata_prefix: Metadata prefix to report when no token is given.
        resumption_token: Token to report; any falsy value counts as absent.

    Returns:
        A string starting with the current local time, followed by either the
        resumption token or a note that none was provided plus the prefix.
    """
    timestamp = str(datetime.datetime.now())

    if not resumption_token:
        return f'Time: {timestamp}, No Resumption Token provided, metadataPrefix {metadata_prefix}'

    return f'Time: {timestamp}, Resumption Token: {resumption_token}'

    
def save_request(req, number, directory):
    """Write ``req`` to ``{directory}/{number}.xml`` and return the next number.

    Args:
        req: Object to save; it is converted with ``str()`` before writing.
        number: Integer used as the file name (without extension).
        directory: Existing directory in which the file is created.

    Returns:
        ``number + 1``, so the caller can use it for the following file.
    """
    content = str(req)

    # Pin UTF-8 so non-ASCII text does not depend on the platform's default
    # encoding when written.
    with open(f'{directory}/{number}.xml', 'w', encoding='utf-8') as file:
        file.write(content)

    return number + 1
    

In [3]:
def _next_resumption_token(page):
    """Return the text of the page's resumptionToken element, or '' if none."""
    token_tag = page.find('resumptionToken')
    if token_tag is None:
        return ''
    # A present-but-empty element yields '', which the caller treats as
    # "no more pages" (OAI-PMH marks the last page this way).
    return token_tag.text.strip()


def copy_oai(base_url, verb, metadata_prefix=None, wait_time=11, max_tries=10,
             log_directory='../data/oai_logs', save_directory='../data/xml',
             resumption_token=None):
    """Harvest a paged OAI response and save every page as an XML file.

    Without ``resumption_token`` the harvest starts from the beginning using
    ``metadata_prefix``; with one, it continues from that token. Each page is
    written to ``save_directory`` by ``save_request`` and progress is logged
    to a file in ``log_directory``. The harvest stops when a page carries no
    (or an empty) resumption token, or when a request fails after
    ``max_tries`` attempts; in the latter case the failing token is in the log
    so the run can be resumed by passing it as ``resumption_token``.

    Args:
        base_url: OAI endpoint to query.
        verb: Value sent as the ``verb`` query parameter.
        metadata_prefix: Value sent as ``metadataPrefix`` on the first request
            of a fresh harvest.
        wait_time: Seconds to pause before each follow-up request; also passed
            to ``timed_request`` as its retry delay.
        max_tries: Maximum attempts per request, passed to ``timed_request``.
        log_directory: Existing directory for the log file.
        save_directory: Existing directory for the saved XML pages.
        resumption_token: Token string to resume from, or ``None`` to start a
            fresh harvest.

    Returns:
        None.
    """
    # NOTE: numbering always starts at 0, so a resumed run overwrites files
    # with the same numbers left in save_directory by an earlier run.
    request_num = 0

    if resumption_token:
        log_file = f'{log_directory}/oai_{resumption_token}.log'
    else:
        log_file = f'{log_directory}/oai.log'
    logging.basicConfig(filename=log_file, level=logging.INFO)

    # On a fresh run there is no resumption token yet, so the first request
    # carries the metadata prefix and its response supplies the first token.
    if not resumption_token:
        logging.info(log_before(metadata_prefix=metadata_prefix))

        first_request = timed_request(
            url=base_url,
            params={'verb': verb, 'metadataPrefix': metadata_prefix},
            wait_time=wait_time, max_tries=max_tries)

        if first_request is None:
            logging.info(f'Request FAILED using metadataPrefix {metadata_prefix}')
            return

        request_num = save_request(first_request, request_num, save_directory)
        resumption_token = _next_resumption_token(first_request)

    # From here on the token is always a plain string, whether it came from
    # the caller (resume) or from the previous page.
    while resumption_token:
        logging.info(log_before(resumption_token=resumption_token))

        time.sleep(wait_time)
        next_request = timed_request(
            url=base_url,
            params={'verb': verb, 'resumptionToken': resumption_token},
            wait_time=wait_time, max_tries=max_tries)

        if next_request is None:
            # Without a response there is no next token to follow; stop and
            # leave the failing token in the log for a later resume.
            logging.info(f'Request FAILED using Resumption Token {resumption_token}')
            break

        logging.info(f'Request SUCCESSFUL using Resumption Token {resumption_token}')
        request_num = save_request(next_request, request_num, save_directory)

        # An empty string here means the list is complete and ends the loop.
        resumption_token = _next_resumption_token(next_request)
        


In [None]:
# Harvest settings for this run: the arXiv OAI endpoint, the ListRecords verb,
# and the 'arXiv' metadata prefix.
base_url = 'https://export.arxiv.org/oai2'
verb = 'ListRecords'
metadata_prefix = 'arXiv'
# Start a fresh harvest (no resumption token), relying on copy_oai's defaults
# for wait time, retry count, and the log/save directories.
copy_oai(base_url=base_url, verb=verb, metadata_prefix=metadata_prefix)