In [None]:
# This script outputs a CSV of selected fields taken from an XML file.
# It builds one list per field, combines the lists into a DataFrame, and saves the result as CSV.
# To refine this script, I would recommend creating a single function that takes the desired field names.
# Most of the inefficiency of this script comes from repeating the search for the selected fields within the XML.
# Because the process has not been made into a function, it is not scalable at the moment.

In [None]:
import xml.etree.ElementTree as ET
import re
import numpy
import pandas as pd

In [None]:
# Load the harvested XML file and keep both the parsed tree and its root element.
sourcepath = 'ZenodoSoftwareGithub4-7-2015.xml'
tree = ET.parse(sourcepath)
root = tree.getroot()

In [None]:
# title
# Path from the document root to every <record> inside any <ListRecords> element.
recordpath = ('{http://www.openarchives.org/OAI/2.0/}ListRecords/'
              '{http://www.openarchives.org/OAI/2.0/}record')

# The list is created before the loop so it always exists (even when nothing
# matches) and is never reset part-way through the document.
titles = []
for record in root.findall(recordpath):
    titleelement = record.find(".//{http://datacite.org/schema/kernel-3}title")
    # A record without a <title> gets a placeholder so the list keeps one entry
    # per record instead of the cell stopping with AttributeError.
    titles.append(titleelement.text if titleelement is not None else 'no title')

In [None]:
# record number
# Report how many titles were collected, then number them sequentially from 1.
recordcount = len(titles)
print(recordcount)
recordnum = range(1, recordcount + 1)

In [None]:
# number of collaborators
# Path from the document root to every <record> inside any <ListRecords> element.
recordpath = ('{http://www.openarchives.org/OAI/2.0/}ListRecords/'
              '{http://www.openarchives.org/OAI/2.0/}record')

# One count per record: the number of <creatorName> elements found anywhere
# below it (0 when there are none). Building the list in a single expression
# means it is defined even if the document contains no records.
numofcreators = [
    len(record.findall(".//{http://datacite.org/schema/kernel-3}creatorName"))
    for record in root.findall(recordpath)
]

In [None]:
# date issued
# Path from the document root to every <record> inside any <ListRecords> element.
recordpath = ('{http://www.openarchives.org/OAI/2.0/}ListRecords/'
              '{http://www.openarchives.org/OAI/2.0/}record')

# Created before the loop so the list always exists and is never reset mid-document.
issuedates = []
for record in root.findall(recordpath):
    # NOTE(review): this takes the first <date> element below the record without
    # checking any attribute; presumably that is the issue date -- confirm
    # against the data.
    dateelement = record.find(".//{http://datacite.org/schema/kernel-3}date")
    if dateelement is None:
        # Placeholder keeps one entry per record instead of raising AttributeError.
        issuedates.append('no date')
    else:
        issuedates.append(dateelement.text)

In [None]:
# creator names
# Path from the document root to every <record> inside any <ListRecords> element.
recordpath = ('{http://www.openarchives.org/OAI/2.0/}ListRecords/'
              '{http://www.openarchives.org/OAI/2.0/}record')

# Created before the loop so the list always exists and is never reset mid-document.
creatornames = []
for record in root.findall(recordpath):
    # Elements with no text are skipped: ', '.join() raises TypeError on None.
    recordcreators = [
        creator.text
        for creator in record.findall(".//{http://datacite.org/schema/kernel-3}creatorName")
        if creator.text
    ]
    # One comma-separated string per record ('' when the record has no creators).
    creatornames.append(', '.join(recordcreators))

In [None]:
# subject
# Path from the document root to every <record> inside any <ListRecords> element.
recordpath = ('{http://www.openarchives.org/OAI/2.0/}ListRecords/'
              '{http://www.openarchives.org/OAI/2.0/}record')

# Created before the loop so the list always exists and is never reset mid-document.
subjects = []
for record in root.findall(recordpath):
    # Search once per record; elements with no text are skipped because
    # ', '.join() raises TypeError on None.
    recordsubjects = [
        subject.text
        for subject in record.findall(".//{http://datacite.org/schema/kernel-3}subject")
        if subject.text
    ]
    if recordsubjects:
        subjects.append(', '.join(recordsubjects))
    else:
        # Explicit marker for records that list no subject at all.
        subjects.append('no subject')

In [None]:
# rights
# Path from the document root to every <record> inside any <ListRecords> element.
recordpath = ('{http://www.openarchives.org/OAI/2.0/}ListRecords/'
              '{http://www.openarchives.org/OAI/2.0/}record')

# Both lists are created before the loop so they always exist and are never
# reset mid-document.
rights1 = []
rights2 = []
for record in root.findall(recordpath):
    recordrights = [
        right.text
        for right in record.findall(".//{http://datacite.org/schema/kernel-3}rights")
    ]
    # Each list receives exactly one entry per record so the two columns stay
    # aligned with each other and with the other per-record lists:
    #   - no rights statement  -> '-' in both columns (previously an IndexError)
    #   - one statement        -> statement, '-'
    #   - two or more          -> first, second (any further statements are not
    #                             exported; previously rights2 got no entry at
    #                             all for such a record)
    rights1.append(recordrights[0] if len(recordrights) >= 1 else '-')
    rights2.append(recordrights[1] if len(recordrights) >= 2 else '-')

In [None]:
# url
# Path from the document root to every <record> inside any <ListRecords> element.
recordpath = ('{http://www.openarchives.org/OAI/2.0/}ListRecords/'
              '{http://www.openarchives.org/OAI/2.0/}record')

# Created before the loop so the list always exists and is never reset mid-document.
urls = []
for record in root.findall(recordpath):
    # Keep only the URL-type related identifiers whose text contains 'github'
    # (case-sensitive substring match). Identifiers with no text are skipped
    # rather than raising TypeError during the match.
    recordurls = [
        relatedidentifier.text
        for relatedidentifier in record.findall(".//{http://datacite.org/schema/kernel-3}relatedIdentifier[@relatedIdentifierType='URL']")
        if relatedidentifier.text and 'github' in relatedidentifier.text
    ]
    # One comma-separated string per record ('' when nothing matched).
    urls.append(', '.join(recordurls))

In [None]:
# zenodo url
# Path from the document root to every <record> inside any <ListRecords> element.
recordpath = ('{http://www.openarchives.org/OAI/2.0/}ListRecords/'
              '{http://www.openarchives.org/OAI/2.0/}record')

# Created before the loop so the list always exists and is never reset mid-document.
zenodo = []
for record in root.findall(recordpath):
    recordidentifiers = [
        alternateidentifier.text
        for alternateidentifier in record.findall(".//{http://datacite.org/schema/kernel-3}alternateIdentifier")
        if alternateidentifier.text
    ]
    # Exactly one entry per record: appending once per identifier would make this
    # list longer or shorter than the other per-record lists whenever a record
    # has zero or several alternate identifiers. A record with a single
    # identifier yields that identifier unchanged.
    zenodo.append(', '.join(recordidentifiers))

In [None]:
# Place the per-record lists side by side as the columns of one 2-D array.
fieldcolumns = (
    recordnum,
    titles,
    numofcreators,
    creatornames,
    issuedates,
    subjects,
    rights1,
    rights2,
    urls,
    zenodo,
)
outputarray = numpy.column_stack(fieldcolumns)

In [None]:
# Wrap the stacked array in a DataFrame. No column names are supplied, so pandas
# assigns default integer column labels.
df = pd.DataFrame(data=outputarray)

In [None]:
# Save the table as a UTF-8 encoded CSV file; all other to_csv options keep
# their defaults.
df.to_csv(path_or_buf='ZenodoSoftwareGithubData4-7-2015.csv', encoding='utf-8')