In [1]:
from lxml import etree

import os
from pathlib import Path

import json
import re

We'll build a json with the following structure
```
{
    'responseDate':'2018-06-19T02:56:50Z',
    'request': 'http://export.arxiv.org/oai2',
    'ListRecords':
        [
          record_0,
          record_1,
          ...
          record_999
        ],
}
```

Then each `record_i` is itself a json object holding all the information for that record.

In [2]:
def header_parse(header):
    """Group the text of an element's direct children by tag.

    Parameters
    ----------
    header : element
        An element (lxml or ElementTree style) whose direct children expose
        ``tag`` and ``text``.

    Returns
    -------
    dict
        Maps each child tag to the list of ``text`` values found under that
        tag, in document order. A tag that occurs once still maps to a
        one-element list, so repeated tags never overwrite each other.
    """
    new_header = {}
    # Iterating an element yields its direct children; this replaces the
    # deprecated ``getchildren()`` call.
    for child in header:
        new_header.setdefault(child.tag, []).append(child.text)

    return new_header

def metadata_parse(metadata):
    """Flatten the fields of a record's metadata element into a dict.

    The fields are read from the children of the FIRST child of ``metadata``
    (the metadata element wraps a single container element).

    Parameters
    ----------
    metadata : element
        Element whose first child holds the individual metadata fields.

    Returns
    -------
    dict
        Maps each field tag to a list of its ``text`` values, except for tags
        ending in ``'authors'``, which map to the list of per-author dicts
        produced by ``authors_parse``.

    Raises
    ------
    IndexError
        If ``metadata`` has no children.
    """
    new_meta = {}
    # Index/iteration access replaces the deprecated ``getchildren()`` calls.
    for child in metadata[0]:
        if child.tag.endswith('authors'):
            # Each author's data lives in grandchildren, so it needs its own
            # parser. Not everything in there is standardized.
            new_meta[child.tag] = authors_parse(child)
        else:
            new_meta.setdefault(child.tag, []).append(child.text)

    return new_meta


def authors_parse(authors):
    """Turn an authors element into a list of per-author dicts.

    Parameters
    ----------
    authors : element
        Element whose direct children are the individual authors; each
        author's own children carry the data (``tag`` -> ``text``).

    Returns
    -------
    list of dict
        One dict per author, in document order, mapping each datum tag to its
        text. If a tag repeats within one author, the last value wins.
    """
    # Iterating an element yields its direct children; this replaces the
    # deprecated ``getchildren()`` calls and avoids rebinding the parameter.
    return [
        {datum.tag: datum.text for datum in author}
        for author in authors
    ]
    
    

In [4]:
def convert_arXiv_xml_json(xml_tree, exceptions=None):
    """Convert one harvested arXiv OAI-PMH response tree into a json-ready dict.

    Parameters
    ----------
    xml_tree : element
        Root element with exactly three children, in order: the response
        date, the request, and the list of records.
    exceptions : list, optional
        List that collects the records which could not be converted. It is
        appended to in place, so one list can be threaded through several
        calls. A new list is created when omitted.

    Returns
    -------
    (dict, list)
        The dict has keys ``'responseDate'``, ``'request'`` and
        ``'ListRecords'`` (a list of ``{'header': ..., 'metadata': ...}``
        dicts); the list is ``exceptions``.

    Raises
    ------
    ValueError
        If ``xml_tree`` does not have exactly three children.
    """
    # A mutable default argument is shared between calls and would keep
    # accumulating records from earlier, unrelated invocations.
    if exceptions is None:
        exceptions = []

    responseDate, request, ListRecords = list(xml_tree)
    arxiv_json = {
        'responseDate': responseDate.text,
        'request': request.text,
        'ListRecords': []
    }

    for record in ListRecords:
        # The resumption token is not a real record. Skip it by tag rather
        # than by position so a response without a token keeps its last record.
        tag = record.tag
        if isinstance(tag, str) and tag.endswith('resumptionToken'):
            continue

        # At least one record arrives without any metadata. Set such records
        # aside and move on, instead of falling through with the header and
        # metadata left over from the previous iteration.
        children = list(record)
        if len(children) != 2:
            exceptions.append(record)
            continue
        header, metadata = children

        # The header pieces (identifier, datestamp, setSpec) have no children;
        # setSpec can repeat, so header_parse keeps every value in a list.
        try:
            record_json = {
                'header': header_parse(header),
                'metadata': metadata_parse(metadata),
            }
        except (AttributeError, IndexError, TypeError):
            # A record the parsers cannot handle is set aside as well; keeping
            # the raw elements in the output would make json.dump fail later.
            exceptions.append(record)
            continue

        arxiv_json['ListRecords'].append(record_json)

    return arxiv_json, exceptions



1. Open each XML file and parse it into a tree object (the file is closed once the indented `with` block ends)
1. Create the json from the tree
1. Save the json to disk

In [5]:
def process_xmls(xml_directory='../../data/xml/initial_harvest_18_06_2018',
                 json_directory='../../data/json/initial_harvest_18_06_2018',
                 log_directory='../../data/'
                ):
    """Convert every harvested XML file in a directory to a json file.

    Parameters
    ----------
    xml_directory : str or path-like
        Directory holding the harvested files; only names made of digits
        followed by ``.xml`` are processed.
    json_directory : str or path-like
        Directory that receives one ``<digits>.json`` per input file. It is
        created if it does not exist.
    log_directory : str or path-like
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    list
        The records that ``convert_arXiv_xml_json`` set aside, accumulated
        over all files.
    """
    xml_dir = Path(xml_directory)
    json_dir = Path(json_directory)
    json_dir.mkdir(parents=True, exist_ok=True)

    # Hidden files can live in xml_directory; in particular an
    # .ipynb_checkpoints directory gets created while this runs in a notebook.
    # The original download names every file as digits + '.xml', so match
    # exactly that. The raw string and escaped dot matter: the unescaped
    # pattern also accepted names such as '.xml' or '12xml'.
    # Sorting makes the processing order (and the order of the returned
    # exceptions) independent of the file system.
    xml_names = sorted(
        file_name for file_name in os.listdir(xml_dir)
        if re.fullmatch(r'\d+\.xml', file_name)
    )

    exceptions = []

    for xml_file_name in xml_names:
        # Binary mode lets the XML parser handle the encoding declared in the
        # document itself instead of the platform's default text encoding.
        with open(xml_dir / xml_file_name, 'rb') as xml_file:
            xml_tree = etree.parse(xml_file).getroot()

        json_version, exceptions = convert_arXiv_xml_json(xml_tree, exceptions)

        json_path = json_dir / f'{Path(xml_file_name).stem}.json'

        with open(json_path, 'w') as json_file:
            json.dump(json_version, json_file)

    return exceptions

In [6]:
# Run the conversion with the default directories. The return value is the
# list of records that were set aside instead of being written to json.
exceptions = process_xmls()

The `exceptions` list should contain the records that didn't have both header data and metadata. There were only 11 such records, and all of them are deleted papers (each was removed because it duplicated another paper in the arXiv). For example, look at the `id` number below, and then follow [this link](https://arxiv.org/abs/1105.2364) to look up that paper.

In [7]:
# Number of records that were set aside during the conversion.
len(exceptions)

11

In [90]:
# Text of the first grandchild of the first exceptional record (its identifier,
# per the output below). Index access replaces the deprecated lxml
# `getchildren()` calls.
exceptions[0][0][0].text

'oai:arXiv.org:1105.2364'

I'll keep all of the record json files that don't match the style we expect in `../../data/json/exceptional_records` as a file `date_of_acquisition.json`. When I set up a harvester program to do this for the incremental updates I'll store the future records of this type (I don't think there ever will be any) in there in the same way. 

In [81]:
# Serialise each exceptional record back to XML text so it can be stored as json.
# encoding='unicode' makes lxml return `str` directly; calling str() on the
# default bytes result would store the "b'...'" repr rather than the XML itself.
exception_strs = [etree.tostring(record, encoding='unicode') for record in exceptions]
exc_json = '../../data/json/exceptional_records/18_06_2018.json'

# Make sure the target directory exists before writing into it.
os.makedirs(os.path.dirname(exc_json), exist_ok=True)

with open(exc_json, 'w') as file:
    # The original dumped `exception_strs_`, a name that is never defined.
    json.dump(exception_strs, file)