In [69]:
import ConfigParser
import os
import re
import shutil
from datetime import datetime

import better_exceptions
import pandas as pd
from lxml import etree

In [70]:
# Load the runtime settings. `read` returns the list of files it parsed
# successfully, which is what this cell displays.
config = ConfigParser.SafeConfigParser()
config.read('config.ini')

['config.ini']

In [71]:
def prepare(config):
    """Create the working directories named in the configuration.

    Looks up the ``sandbox`` and ``output`` options of the ``DEFAULT``
    section and creates each directory (parents included) when its path
    does not exist yet. Existing paths are left untouched.
    """
    for option in ('sandbox', 'output'):
        directory = config.get('DEFAULT', option)
        if os.path.exists(directory):
            continue
        os.makedirs(directory)

In [72]:
# Make sure the sandbox and output directories exist before any file is moved.
prepare(config)

In [73]:
def is_email_address_known(config, address):
    """Tell whether ``address`` has a section of its own in the configuration.

    Returns:
        bool: the result of ``config.has_section(address)``.
    """
    known = config.has_section(address)
    return known

In [74]:
# Sanity check: only an address that has its own section in the loaded
# configuration is reported as known.
print(is_email_address_known(config, 'test@test.me'))
print(is_email_address_known(config, 'email@archive1.org'))

False
True


In [75]:
def save_attachment(config, address, attachment):
    """Move an attachment into the sandbox under a sequenced file name.

    The target name is ``<code>_<seq>.xlsx``, where ``code`` is the ``code``
    option of the sender's section and ``seq`` is one more than the sender's
    stored ``sequence`` option (or 1 when no sequence is stored yet).

    Args:
        config: parser providing the ``DEFAULT`` ``sandbox`` option and a
            section named after ``address``.
        address: section name identifying the sender.
        attachment: path of the file to move.

    Returns:
        tuple: ``(seq, filename)`` - the sequence number used and the new
        path of the file. The configuration is not modified here.
    """
    sandbox = config.get('DEFAULT', 'sandbox')
    code = config.get(address, 'code')

    if config.has_option(address, 'sequence'):
        seq = config.getint(address, 'sequence') + 1
    else:
        seq = 1

    filename = os.path.join(sandbox, '{}_{}.xlsx'.format(code, seq))

    # shutil.move renames when it can and falls back to copy-and-delete when
    # the sandbox is on another filesystem, where os.rename raises OSError.
    shutil.move(attachment, filename)

    return seq, filename

In [76]:
def update_config(config, filename, address, seq):
    """Store the sequence number for ``address`` and write the config to disk.

    Args:
        config: parser to update; it must already contain a section named
            after ``address``.
        filename: path of the configuration file, which is overwritten.
        address: section whose ``sequence`` option is set.
        seq: sequence number to record; stored as its string form.
    """
    config.set(address, 'sequence', str(seq))

    # The parser emits text, so the handle must be opened in text mode; a
    # binary handle makes ``config.write`` fail on Python 3.
    with open(filename, 'w') as f:
        config.write(f)

In [77]:
# Sender and attachment used for this walkthrough.
address = 'email@archive1.org'
attachment = 'draft_01.xlsx'

# Moves the attachment into the sandbox; `data` is its new path and is what
# the spreadsheet-reading cells below open.
seq, data = save_attachment(config, address, attachment)

In [78]:
# Persist the sequence number just used so the next attachment from this
# sender is numbered after it.
update_config(config, 'config.ini', address, seq)

In [79]:
# Load the 'collection' sheet unchanged. The title column and the first row
# are kept on purpose: the transpose step that follows promotes that column
# to the header, so dropping either here would break it. (The previous
# `drop(...)` call discarded its result and therefore had no effect.)
collection = pd.read_excel(data, 'collection')
collection.head()

Unnamed: 0,ARCHIVES AFRICA: COLLECTION DATA,Unnamed: 1,Unnamed: 2,Unnamed: 3,Examples:,For more information:
0,Institution identifier*,<institution_id>,Example identifier,</institution_id>,To be supplied,https://www.accesstomemory.org/en/docs/2.4/use...
1,Repository*,<repository_name>,Example repository,</repository_name>,To be supplied,https://www.ica.org/en/isdiah-international-st...
2,Collection identifier*,<collection_id>,Example collection ID,</collection_id>,To be supplied,https://www.accesstomemory.org/en/docs/2.4/use...
3,Title*,<title>,Example title,</title>,To be supplied,https://www.accesstomemory.org/en/docs/2.4/use...
4,Collection creation date*,<date_creation_collection>,Example collection creation date,</date_creation_collection>,To be supplied,https://www.accesstomemory.org/en/docs/2.4/use...


In [15]:
# Pivot the sheet so every field becomes a column. The first transposed row
# holds the field labels and is promoted to the header.
transposed = collection.transpose()
new_header = transposed.iloc[0].str.strip()
# Take an explicit copy of the remaining rows so the edits below never act on
# a slice of `transposed` (pandas may flag that with SettingWithCopyWarning).
collection = transposed.iloc[1:].copy()
collection.columns = new_header
# '* Required' is presumably the sheet's legend rather than a field - confirm
# against the spreadsheet template.
collection = collection.drop(['* Required'], axis=1)
collection.head()

ARCHIVES AFRICA: COLLECTION DATA,Institution identifier*,Repository*,Collection identifier*,Title*,Collection creation date*,Record creation date,Record revision date,Record deletion date,Level of description*,Extent and medium*,...,Scope and content*,Accruals,System of arrangement*,Conditions governing access*,Conditions governing reproduction,Language of material*,Finding aids,Related units of description,Notes (PUBLIC): Please note: These notes WILL be publicaly viewable.,Archivist's notes (PRIVATE): Please include name of archivist. These notes will NOT be publicaly viewable.
Unnamed: 1,<institution_id>,<repository_name>,<collection_id>,<title>,<date_creation_collection>,<date_creation_record>,<date_revision_record>,<date_deletion_record>,<level_of_description>,<extent_medium>,...,<scope_content>,<accruals>,<arrangement>,<conditions_access>,<conditions_reproduction>,<language_material>,<finding_aids>,<related_descriptions>,<notes_public>,<notes_private>
Unnamed: 2,Example identifier,Example repository,Example collection ID,Example title,Example collection creation date,2018-03-19 00:00:00,2018-03-20 00:00:00,2018-03-21 00:00:00,Example level of description,Example extent,...,Example scope and content. Multi-line data - m...,Example accruals. Multi-line data - multi-para...,Example system of arrangement,Example conditions governing access,Example conditions governing reproduction,Example language of materials,Example finding aids,Example related units of description,Example public notes. Multi-line data - multi-...,Example private notes. Multi-line data - multi...
Unnamed: 3,</institution_id>,</repository_name>,</collection_id>,</title>,</date_creation_collection>,</date_creation_record>,</date_revision_record>,</date_deletion_record>,</level_of_description>,</extent_medium>,...,</scope_content>,</accruals>,</arrangement>,</conditions_access>,</conditions_reproduction>,</language_material>,</finding_aids>,</related_descriptions>,</notes_public>,</notes_private>
Examples:,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,...,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied
For more information:,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.ica.org/en/isdiah-international-st...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,,,,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,


In [16]:
# Spot-check two required fields. Position 1 is the row of entered values;
# `.iloc` makes the positional lookup explicit because the index holds string
# labels, and integer keys on such an index are deprecated in recent pandas.
collection['Institution identifier*'].iloc[1], collection['Repository*'].iloc[1]

('Example identifier', 'Example repository')

In [17]:
def get_missing_fields(collection):
    """List the required fields that have no value.

    A column counts as required when its label contains ``*``. Its value is
    the cell at position 1 of the column (the second row of the frame).

    Args:
        collection: DataFrame with one column per field and the entered
            values in its second row.

    Returns:
        list: labels of required columns whose value is missing, in column
        order. Empty when every required field is filled in.
    """
    # `.iloc[1]` is an explicit positional lookup. The frame's index holds
    # string labels, and plain `[1]` on such a Series relies on a positional
    # fallback that recent pandas versions deprecate.
    return [
        column
        for column in collection.columns
        if '*' in column and pd.isna(collection[column].iloc[1])
    ]

In [18]:
# Report required fields that were left blank; nothing is printed when every
# required field has a value.
missing_fields = get_missing_fields(collection)
if missing_fields:
    print(missing_fields)

In [19]:
def collection_to_xml(collection):
    """Build an XML tree with one element per filled-in collection field.

    For every column, the cell at position 0 supplies the element name (all
    non-word characters are stripped, so ``<title>`` becomes ``title``) and
    the cell at position 1 supplies the value. Columns whose value is missing
    are skipped.

    Values are written as follows:
      * text: split on newlines; each non-blank line becomes a ``<p>`` child,
        wrapped in CDATA when it contains ``<`` or ``>``;
      * datetimes: the ISO date (``YYYY-MM-DD``) as the element text;
      * anything else: its ``str()`` form as the element text.

    Args:
        collection: DataFrame with one column per field, tag names in its
            first row and values in its second row.

    Returns:
        lxml ``ElementTree`` rooted at a ``collection`` element.
    """
    # Spreadsheet text arrives as ``unicode`` on Python 2, which a plain
    # ``str`` check would miss; ``basestring`` covers both there.
    try:
        text_type = basestring
    except NameError:
        text_type = str

    root = etree.Element('collection')

    for column in collection.columns:
        cells = collection[column]
        # Explicit positional lookups: the index holds string labels.
        value = cells.iloc[1]
        if pd.isna(value):
            continue

        tag = re.sub(r'\W', '', cells.iloc[0])
        el = etree.SubElement(root, tag)

        if isinstance(value, text_type):
            for para in value.split('\n'):
                para = para.strip()
                if not para:
                    continue
                p = etree.SubElement(el, 'p')
                # Keep markup-like text verbatim instead of entity-escaping it.
                if '<' in para or '>' in para:
                    p.text = etree.CDATA(para)
                else:
                    p.text = para
        elif isinstance(value, datetime):
            el.text = value.date().isoformat()
        else:
            # lxml accepts only text for ``.text``; numbers and other scalars
            # must be converted or the assignment raises TypeError.
            el.text = str(value)

    return etree.ElementTree(root)

In [20]:
def terms_to_xml(terms, root):
    """Build an XML tree holding one ``<p>`` element per term.

    Args:
        terms: iterable of terms; missing values are skipped.
        root: name of the root element.

    Returns:
        lxml ``ElementTree`` rooted at an element named ``root``.
    """
    # ``basestring`` exists on Python 2 only and covers ``str`` and ``unicode``.
    try:
        text_types = (basestring,)
    except NameError:
        text_types = (str, bytes)

    xml = etree.Element(root)

    for term in terms:
        if pd.isna(term):
            continue
        p = etree.SubElement(xml, 'p')
        # lxml accepts only text for ``.text``; convert numbers and other
        # scalars, but leave real text untouched.
        p.text = term if isinstance(term, text_types) else str(term)

    return etree.ElementTree(xml)

In [21]:
# Serialise the collection fields; the file name is relative, so it lands in
# the current working directory.
# NOTE(review): the configured `output` directory is not used here - confirm
# whether that is intended.
xml = collection_to_xml(collection)
# Debug aid: etree.dump(xml.getroot())
xml.write('collection.xml', encoding='utf-8', method='xml', pretty_print=True)

In [22]:
# Export every sheet after the first to its own XML file. The header of the
# sheet's first column names both the root element and the file.
# Sheets are parsed from the already-open workbook instead of re-reading the
# file once per sheet, and the workbook is closed when the loop finishes.
with pd.ExcelFile(data) as excel:
    for sn in excel.sheet_names[1:]:
        df = pd.read_excel(excel, sn)
        term_name = df.columns[0]
        el_name = term_name.lower().replace(' ', '_')

        xml = terms_to_xml(df[term_name], el_name)
        xml.write('{}.xml'.format(el_name), encoding='utf-8', method='xml', pretty_print=True)