# Update Harvests

In this notebook I show the process for updating our collection of __arXiv__ data using the OAI API. 

In particular we have two main things going on here:
1. Acquire new XML files to update our data. We accomplish this by looking up the date of the most recent update to our data and passing that date along in our API call, which returns every article since then.
1. Process our XML into JSON files, keeping track of exceptional records.

To this end we will reuse most of the code from the other two notebooks in this directory, but wrap it all in one function that figures out the date of the most recent harvest on its own. We do this by reading the file `harvest_info.txt`, which I manually created with the date of the initial harvest and which gets the most recent harvest date appended to it whenever we run the code in this notebook. 

So right now the file reads 

>first harvest  
>2018-06-18

and after running an update, say on June 29th 2018 will change it to read

>first harvest  
>2018-06-18  
>update harvest  
>2018-06-29 

__You should be able to just run every cell in this notebook to update the XML and JSON files. HOWEVER, the required files are not actually present in the repo because they take up too much space. If you'd like to check this out yourself, feel free to email me.__

In [1]:
import requests
import time
import datetime
import os
import re
from bs4 import BeautifulSoup

from lxml import etree
import json

from pathlib import Path

#using the lxml library so you'll need that to run this. 

In [2]:
def timed_request(url, params, wait_time, max_tries, timeout=60):
    """GET ``url`` with retries and return the response parsed as XML.

    Parameters
    ----------
    url : str
        Address to request.
    params : dict
        Query-string parameters passed to ``requests.get``.
    wait_time : int or float
        Seconds to sleep between failed attempts.
    max_tries : int
        Maximum number of attempts before giving up.
    timeout : int or float, optional
        Seconds to wait on the server before an attempt counts as failed.
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    BeautifulSoup or None
        The body of the first response with status code 200, parsed with the
        ``lxml-xml`` parser, or ``None`` when every attempt failed.
    """
    for attempt in range(1, max_tries + 1):
        try:
            response = requests.get(url=url, params=params, timeout=timeout)
        except requests.RequestException:
            # Connection errors and timeouts are treated like a bad status
            # code: wait and try again instead of aborting the whole harvest.
            response = None

        if response is not None and response.status_code == 200:
            return BeautifulSoup(response.text, 'lxml-xml')

        # No point in sleeping once there are no attempts left.
        if attempt < max_tries:
            time.sleep(wait_time)

    return None


def save_request(req, number, directory,):
    """Write ``req`` to ``{directory}/{number}.xml`` and return the next number.

    Parameters
    ----------
    req : object
        The response to save; it is converted with ``str`` before writing.
    number : int
        Sequence number used as the file name.
    directory : str
        Existing directory the file is written into.

    Returns
    -------
    int
        ``number + 1``, i.e. the sequence number for the next saved request.
    """
    # Write UTF-8 explicitly rather than relying on the platform default, so
    # non-ASCII characters are stored the same way on every machine.
    with open(f'{directory}/{number}.xml', 'w', encoding='utf-8') as file:
        file.write(str(req))

    return number + 1
    

In [3]:
def first_request(base_url, verb, metadata_prefix, wait_time, max_tries, from_date='2018-06-19'):
    """Issue the opening request of a harvest (the one without a resumption token).

    The query is built from ``verb``, ``metadata_prefix`` (sent as
    ``metadataPrefix``) and ``from_date`` (sent as ``from``), then handed to
    ``timed_request`` together with the retry settings.

    Returns whatever ``timed_request`` returns.
    """
    query = dict(verb=verb, metadataPrefix=metadata_prefix)
    # 'from' is a Python keyword, so it cannot be given as a keyword argument above.
    query['from'] = from_date

    return timed_request(url=base_url, params=query, wait_time=wait_time, max_tries=max_tries)
    

In [4]:
def log(message, log_file):
    """Append ``message`` as-is to the file at ``log_file``.

    The file is opened in append mode; no newline is added, so callers
    include their own.
    """
    with open(log_file, mode='a') as handle:
        handle.write(message)

In [5]:
def _token_text(token):
    """Return a resumption token as a stripped string ('' when there is none)."""
    if token is None:
        return ''
    if isinstance(token, str):
        return token.strip()
    # Otherwise this is the tag returned by ``find('resumptionToken')``; its
    # text is the token value, and it is empty on the last page of a list.
    return token.text.strip()


def copy_oai(base_url, verb, metadata_prefix=None, wait_time=10, max_tries=5,
             log_directory='../../data/oai_logs', save_directory='../../data/xml',
             resumption_token=None, request_num=0, from_date='2018-06-19', today=None):
    """Download every page of an OAI list request and save each one as XML.

    When no ``resumption_token`` is given, a first request is made with
    ``metadata_prefix`` and ``from_date``. After that, pages are fetched with
    the resumption token found in the previous page until a page carries no
    (or an empty) token, or a request fails. Progress is appended to a fresh
    log file named after the current timestamp.

    Parameters
    ----------
    base_url : str
        Address of the OAI endpoint.
    verb : str
        OAI verb sent with every request.
    metadata_prefix : str, optional
        Sent as ``metadataPrefix`` with the first request only.
    wait_time : int or float
        Seconds to sleep before each follow-up request and between retries.
    max_tries : int
        Attempts per request before it counts as failed.
    log_directory, save_directory : str
        Where the log file and the numbered XML files are written; both are
        created if missing.
    resumption_token : str, optional
        Token to continue an interrupted harvest from; skips the first request.
    request_num : int
        Number used for the first saved file; incremented for each page.
    from_date : str
        Sent as ``from`` with the first request.
    today : optional
        Accepted for backward compatibility; not used.

    Returns
    -------
    None
    """
    os.makedirs(log_directory, exist_ok=True)
    os.makedirs(save_directory, exist_ok=True)

    log_name = re.sub('[^0-9]', '_', str(datetime.datetime.now()))
    log_file = f"{log_directory}/{log_name}.log"

    token = _token_text(resumption_token)

    if not token:
        log('Making first request without resumption token\n', log_file)

        first_get = first_request(base_url=base_url, verb=verb, metadata_prefix=metadata_prefix,
                                  wait_time=wait_time, max_tries=max_tries, from_date=from_date)

        if first_get is None:
            log('First request failed, bailing out.\n', log_file)
            return

        log('First request SUCCESSFUL, using resumption tokens going forward.\n'
            f'Saving current object at {request_num}.xml\n', log_file)
        request_num = save_request(first_get, request_num, save_directory)

        token = _token_text(first_get.find('resumptionToken'))
        if not token:
            log('No resumption token from first request, exiting.\n', log_file)
            return

    while token:
        log(f'Time: {str(datetime.datetime.now())}, Resumption Token: {token}\n', log_file)

        time.sleep(wait_time)
        next_request = timed_request(url=base_url,
                                     params={'verb': verb, 'resumptionToken': token},
                                     wait_time=wait_time, max_tries=max_tries)

        if next_request is None:
            # Without a response there is no next token to read, so stop here.
            # The failed token is in the log and can be passed back in as
            # ``resumption_token`` to continue the harvest.
            log(f'Request FAILED using Resumption Token {token}\n', log_file)
            return

        log(f'Request SUCCESSFUL using Resumption Token {token}\n'
            f'Saving current object at {request_num}.xml\n', log_file)
        request_num = save_request(next_request, request_num, save_directory)

        # An absent or empty resumptionToken element ends the loop.
        token = _token_text(next_request.find('resumptionToken'))


In [6]:
def harvest_update():
    """Harvest every arXiv record since the last recorded harvest date.

    Reads the most recent date from ``../../data/harvest_info.txt``, passes it
    as the ``from`` date of a ``ListRecords`` request against the arXiv OAI
    endpoint, saves the responses under
    ``../../data/xml/update_harvests/<today>/`` and finally appends today's
    date to ``harvest_info.txt``.

    NOTE: ``copy_oai`` does not report success or failure, so today's date is
    recorded even when the harvest stopped early. Check the OAI log before
    relying on the recorded date.
    """
    today = str(datetime.date.today())

    # Directory that will hold the XML files saved by this update.
    save_directory = f'../../data/xml/update_harvests/{today}'

    # parents/exist_ok make this safe both on a fresh checkout (no
    # update_harvests directory yet) and when the update is run twice in a day.
    Path(save_directory).mkdir(parents=True, exist_ok=True)

    # The most recent harvest date is the last non-blank line of the record.
    with open('../../data/harvest_info.txt') as file:
        recorded_lines = [line.strip() for line in file if line.strip()]
    last_harvest_date = recorded_lines[-1]

    base_url = 'https://export.arxiv.org/oai2'
    verb = 'ListRecords'
    metadata_prefix = 'arXiv'

    copy_oai(base_url=base_url, verb=verb, metadata_prefix=metadata_prefix,
             save_directory=save_directory, today=today, from_date=last_harvest_date)

    # Update the harvest record.
    with open('../../data/harvest_info.txt', 'a') as file:
        file.write(f'\nupdate harvest\n{today}\n')
    
    

In [7]:
# Run the harvest: request everything since the last date in harvest_info.txt.
harvest_update()


These XML files have been saved to `../../data/xml/update_harvests/YYYY-MM-DD/FILE_NUM.xml` where `FILE_NUM` is just the ordering of the requests.

## Now we process the XML files into JSON files.

Just like the XMLs we will keep track of the dates of the updates within the file system.

In [8]:
def header_parse(header):
    """Collect the children of a record header into a dict of lists.

    Each child's tag becomes a key, and the texts of all children sharing
    that tag are gathered, in document order, in the list stored under it.
    """
    parsed = {}
    for element in header.getchildren():
        parsed.setdefault(element.tag, []).append(element.text)

    return parsed

def metadata_parse(metadata):
    """Turn a record's metadata element into a dict.

    The fields are the children of the first child of ``metadata``. A field
    whose tag ends in ``authors`` is handed to ``authors_parse`` and stored as
    returned; every other field's text is collected into a list under its tag.
    """
    parsed = {}
    fields = metadata.getchildren()[0].getchildren()

    for field in fields:
        if field.tag[-7:] == 'authors':
            # Here the field has each author's data as children, so it goes to
            # authors_parse. This can be a little annoying because not
            # everything is standardized.
            parsed[field.tag] = authors_parse(field)
        else:
            parsed.setdefault(field.tag, []).append(field.text)

    return parsed


def authors_parse(authors):
    """Return one dict per author, mapping each child's tag to its text.

    ``authors`` is the element whose children are the individual authors. If
    an author has several children with the same tag, only the text of the
    last one is kept.
    """
    return [
        {field.tag: field.text for field in person.getchildren()}
        for person in authors.getchildren()
    ]
    
    

In [9]:
def convert_arXiv_xml_json(xml_tree, exceptions=None):
    """Convert one saved OAI response into a JSON-serializable dict.

    Parameters
    ----------
    xml_tree : element
        Root of a response; it must have exactly three children, unpacked
        here as the response date, the request and the list of records.
    exceptions : list, optional
        Records that could not be converted are appended to this list. A new
        list is created when none is given.

    Returns
    -------
    tuple
        ``(arxiv_json, exceptions)``. ``arxiv_json`` has the keys
        ``responseDate``, ``request`` and ``ListRecords``; each entry of
        ``ListRecords`` holds the parsed ``header`` and ``metadata`` of one
        record. ``exceptions`` holds the raw record elements that were skipped.
    """
    # A mutable default would be shared between calls, so build it here.
    if exceptions is None:
        exceptions = []

    responseDate, request, ListRecords = xml_tree.getchildren()
    arxiv_json = {
        'responseDate': responseDate.text,
        'request': request.text,
        'ListRecords': [],
    }

    for record in ListRecords:
        # A resumption token is not a real record. It is only present when
        # the list continues, so it is recognised by tag rather than by
        # assuming the last child is always one.
        if isinstance(record.tag, str) and record.tag.endswith('resumptionToken'):
            continue

        # At least one of these records gets passed without any metadata; in
        # that case the record is noted as an exception and skipped.
        try:
            header, metadata = record.getchildren()
        except ValueError:
            exceptions.append(record)
            continue

        # The header is split into its pieces (identifier, datestamp, setSpec;
        # setSpec at least can occur several times). A record that fails to
        # parse is noted and skipped, so no raw elements end up in the output.
        try:
            record_json = {
                'header': header_parse(header),
                'metadata': metadata_parse(metadata),
            }
        except (AttributeError, IndexError, TypeError):
            exceptions.append(record)
            continue

        arxiv_json['ListRecords'].append(record_json)

    return arxiv_json, exceptions



In [10]:
def process_xmls(xml_directory='../../data/xml/initial_harvest_18_06_2018',
                 json_directory='../../data/json/initial_harvest_18_06_2018',):
    """Convert every numbered XML file in ``xml_directory`` to JSON.

    Parameters
    ----------
    xml_directory : str
        Directory holding the harvested ``<number>.xml`` files.
    json_directory : str
        Existing directory that receives one ``<number>.json`` per XML file.

    Returns
    -------
    list
        The exceptional records collected by ``convert_arXiv_xml_json`` across
        all files.
    """
    # Only files named <digits>.xml are harvest output. Anything else in the
    # directory is ignored, e.g. the .ipynb_checkpoints directory that gets
    # created while running this in a Jupyter notebook.
    numbered_xml = re.compile(r'\d+\.xml')
    xmls = sorted(
        (name for name in os.listdir(xml_directory) if numbered_xml.fullmatch(name)),
        # Process in request order so the exception list is reproducible.
        key=lambda name: int(name[:-len('.xml')]),
    )

    exceptions = []

    for xml_file_name in xmls:
        xml_path = f'{xml_directory}/{xml_file_name}'

        # Hand lxml the path so it decodes the file according to the XML
        # declaration instead of the platform's default text encoding.
        xml_tree = etree.parse(xml_path).getroot()

        json_version, exceptions = convert_arXiv_xml_json(xml_tree, exceptions)

        json_file_name = f"{xml_file_name[:-len('.xml')]}.json"
        json_path = f'{json_directory}/{json_file_name}'

        with open(json_path, 'w') as json_file:
            json.dump(json_version, json_file)

    return exceptions

In [11]:
def process_update(harvest_date=None):
    """Convert one update harvest from XML to JSON and report exceptions.

    Parameters
    ----------
    harvest_date : optional
        Name of the harvest sub-directory to process, converted with ``str``.
        Defaults to today's date, so a harvest downloaded on an earlier day
        can still be processed by passing its date.

    The XML files are read from ``../../data/xml/update_harvests/<date>`` and
    the JSON files written to ``../../data/json/update_harvests/<date>``.
    Records that could not be converted are written to
    ``../../data/json/exceptional_records/<date>.json``.
    """
    today = str(datetime.date.today()) if harvest_date is None else str(harvest_date)

    direcs = {
        'xml_directory': f'../../data/xml/update_harvests/{today}',
        'json_directory': f'../../data/json/update_harvests/{today}',
    }

    for direc in direcs.values():
        Path(direc).mkdir(parents=True, exist_ok=True)

    exceptions = process_xmls(xml_directory=direcs['xml_directory'],
                              json_directory=direcs['json_directory'])

    if exceptions:
        exceptions_directory = Path('../../data/json/exceptional_records')
        exceptions_directory.mkdir(parents=True, exist_ok=True)

        # The exceptions are XML elements, which json cannot serialize, so
        # each one is stored as its XML source text.
        serializable = [
            record if isinstance(record, str) else etree.tostring(record, encoding='unicode')
            for record in exceptions
        ]
        with open(f'../../data/json/exceptional_records/{today}.json', 'w') as file:
            json.dump(serializable, file)
        print('Found some exceptions, wild!')
    else:
        print('No exceptions! Seems like the update was fine.')

In [12]:
# Convert today's harvested XML files to JSON and report any exceptional records.
process_update()

No exceptions! Seems like the update was fine.
