# Update Harvests

In this notebook I show the process for updating our collection of __arXiv__ data using the OAI API. 

In particular we have two main things going on here:
1. Acquire new XML files to update our data, we can accomplish this by looking to the most recent update to our data and passing that date along in our API call and get every article since then.
1. Process our XML into JSON files, keeping track of exceptional records.

To this end we will reuse most of the code from the other two notebooks in this directory but wrap it all in one function that should figure out the date of the most recent call on it's own. We do this by accessing the file `harvest_info.txt` that I manually created with the date of the initial harvest and will update with the most recent harvest date when we run the code in this notebook. 


So right now the file reads 

>recent harvest  
>18-06-2018

and after running an update, say on June 29th 2018 will change it to read

>recent harvest  
>29-06-2018

In [1]:
import requests
import time
import datetime
import re
from bs4 import BeautifulSoup

#using the lxml library so you'll need that to run this. 

In [2]:
def timed_request(url, params, wait_time, max_tries):
    retry_counter = 0
    while retry_counter < max_tries:
        req = requests.get(url=url, params=params)
    
        if req.status_code == 200:
            req = BeautifulSoup(req.text, 'lxml-xml')
            return req
        
        else:
            retry_counter += 1
            time.sleep(wait_time)


def save_request(req, number, directory):
    req = str(req)
    
    with open(f'{directory}/{number}.xml', 'w') as file:
        file.write(req)
        
    return number + 1
    

In [11]:
def first_request(base_url, verb, metadata_prefix, wait_time, max_tries, from_date='2018-06-19'):
    params = {
        'verb':verb,
        'metadataPrefix':metadata_prefix,
        'from':from_date,
             }
    req = timed_request(url=base_url, params=params, wait_time=wait_time, max_tries=max_tries)
    return req
    

In [12]:
def log(message, log_file):
    with open(log_file, 'a') as file:
        file.write(message)

In [13]:
def copy_oai(base_url, verb, metadata_prefix=None, wait_time=10, max_tries=5,
             log_directory='../../data/oai_logs', save_directory='../../data/xml',
             resumption_token=None, request_num = 0, from_date = '2018-06-19'):
    
        
    log_name = str(datetime.datetime.now())
    log_name = re.sub('[^0-9]', '_', log_name)  
    log_file = f"{log_directory}/{log_name}.log"

    if not resumption_token:
        
        log_str = 'Making first request without resumption token\n'
        log(log_str, log_file)
        
        first_get = first_request(base_url=base_url, verb=verb, metadata_prefix=metadata_prefix, 
                                 wait_time=wait_time, max_tries=max_tries, from_date=from_date)


        if first_get:
            log_str = [f'First request SUCCESSFUL, using resumption tokens going forward.\n',
                       f'Saving current object at {request_num}.xml\n']
            log_str = ''.join(log_str)
            log(log_str, log_file)            

            request_num = save_request(first_get, request_num, save_directory)
            
            resumption_token = first_get.find('resumptionToken')
        
            if not resumption_token:
                log_str = 'No resumption token from first request, exiting.\n'
                log(log_str, log_file)            
                return
            
        else:
            log_str = 'First request failed, bailing out.\n'
            log(log_str, log_file)            
            return
    
    
    my_params = {
        'verb': verb,
        'resumptionToken': resumption_token
    }
    
    while my_params['resumptionToken']:
        
        if type(my_params['resumptionToken']) is not str:
            my_params['resumptionToken'] = my_params['resumptionToken'].text
        
        log_str = f'Time: {str(datetime.datetime.now())}, Resumption Token: {my_params["resumptionToken"]}\n'
        log(log_str, log_file)            

        time.sleep(wait_time)
        next_request =  timed_request(url=base_url, params=my_params,
                                  wait_time=wait_time, max_tries=max_tries)
        

        if next_request:
            log_str = [f'Request SUCCESSFUL using Resumption Token {my_params["resumptionToken"]}\n',
                       f'Saving current object at {request_num}.xml\n']
            log_str = ''.join(log_str)
            
            log(log_str, log_file)
            
            request_num = save_request(next_request, request_num, save_directory)
        else: 
            log_str = f'Request FAILED using Resumption Token {my_params["resumptionToken"]}\n'
            log(log_str, log_file)            

        my_params['resumptionToken'] = next_request.find('resumptionToken')


In [20]:
def update_harvest():
    #first to update our information we'll find the date we need to send as a
    #from paramaeter and to see where we should save everything
    
    today = str(datetime.date.today())
    
    log_directory = f'../../data/oai_logs/{today}'
    save_directory = f'../../data/xml/update_harvests/{today}'
    
    
    base_url = 'https://export.arxiv.org/oai2'
    verb = 'ListRecords'
    metadata_prefix = 'arXiv'
    
    
    copy_oai(base_url=base_url, verb=verb, metadata_prefix=metadata_prefix,
             log_directory=log_directory, save_directory=save_directory, from_date = from_date)
    

SyntaxError: invalid syntax (<ipython-input-20-8b184b3066c6>, line 17)