In [102]:
from datetime import datetime

from lxml import etree
import pandas as pd
import re

In [103]:
data = 'draft_01.xlsx'

In [104]:
collection = pd.read_excel(data, 'collection')
collection.drop('ARCHIVES AFRICA: COLLECTION DATA', axis=1).drop(0)
collection.head()

Unnamed: 0,ARCHIVES AFRICA: COLLECTION DATA,Unnamed: 1,Unnamed: 2,Unnamed: 3,Examples:,For more information:
0,Institution identifier*,<institution_id>,Example identifier,</institution_id>,To be supplied,https://www.accesstomemory.org/en/docs/2.4/use...
1,Repository*,<repository_name>,Example repository,</repository_name>,To be supplied,https://www.ica.org/en/isdiah-international-st...
2,Collection identifier*,<collection_id>,Example collection ID,</collection_id>,To be supplied,https://www.accesstomemory.org/en/docs/2.4/use...
3,Title*,<title>,Example title,</title>,To be supplied,https://www.accesstomemory.org/en/docs/2.4/use...
4,Collection creation date*,<date_creation_collection>,Example collection creation date,</date_creation_collection>,To be supplied,https://www.accesstomemory.org/en/docs/2.4/use...


In [105]:
collection = collection.transpose()
new_header = collection.iloc[0].str.strip()
collection = collection[1:]
collection.columns = new_header
# collection.columns = collection.columns.str.strip()
collection.head()

ARCHIVES AFRICA: COLLECTION DATA,Institution identifier*,Repository*,Collection identifier*,Title*,Collection creation date*,Record creation date,Record revision date,Record deletion date,Level of description*,Extent and medium*,...,Accruals,System of arrangement*,Conditions governing access*,Conditions governing reproduction,Language of material*,Finding aids,Related units of description,Notes (PUBLIC): Please note: These notes WILL be publicaly viewable.,Archivist's notes (PRIVATE): Please include name of archivist. These notes will NOT be publicaly viewable.,* Required
Unnamed: 1,<institution_id>,<repository_name>,<collection_id>,<title>,<date_creation_collection>,<date_creation_record>,<date_revision_record>,<date_deletion_record>,<level_of_description>,<extent_medium>,...,<accruals>,<arrangement>,<conditions_access>,<conditions_reproduction>,<language_material>,<finding_aids>,<related_descriptions>,<notes_public>,<notes_private>,
Unnamed: 2,Example identifier,Example repository,Example collection ID,Example title,Example collection creation date,2018-03-19 00:00:00,2018-03-20 00:00:00,2018-03-21 00:00:00,Example level of description,Example extent,...,Example accruals. Multi-line data - multi-para...,Example system of arrangement,Example conditions governing access,Example conditions governing reproduction,Example language of materials,Example finding aids,Example related units of description,Example public notes. Multi-line data - multi-...,Example private notes. Multi-line data - multi...,
Unnamed: 3,</institution_id>,</repository_name>,</collection_id>,</title>,</date_creation_collection>,</date_creation_record>,</date_revision_record>,</date_deletion_record>,</level_of_description>,</extent_medium>,...,</accruals>,</arrangement>,</conditions_access>,</conditions_reproduction>,</language_material>,</finding_aids>,</related_descriptions>,</notes_public>,</notes_private>,
Examples:,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,...,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,To be supplied,
For more information:,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.ica.org/en/isdiah-international-st...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,,,,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,https://www.accesstomemory.org/en/docs/2.4/use...,,


In [106]:
collection['Institution identifier*'][1], collection['Repository*'][1]

('Example identifier', 'Example repository')

In [107]:
def get_missing_fields(collection):
    missing_fields = []

    for c in collection.columns:
        # required fields
        if '*' in c:
            data = collection[c][1]
            if pd.isna(data) or pd.isnull(data):
                missing_fields.append(c)
    
    return missing_fields

In [108]:
missing_fields = get_missing_fields(collection)
if missing_fields:
    print(missing_fields)

['* Required']


In [128]:
def collection_to_xml(collection):
    xml = etree.Element('collection')
    
    for c in collection.columns:
        data = collection[c][1]
        
        if not pd.isna(data) and not pd.isnull(data):
            name = collection[c][0]
            name = re.sub(r'\W', '', name)
            el = etree.SubElement(xml, name)
            
            if isinstance(data, str):
                paras = data.split('\n')
                
                for para in paras:
                    para = para.strip()
                    if para:
                        p = etree.SubElement(el, 'p')
                        if '<' in para or '>' in para:
                            p.text = etree.CDATA(para)
                        else:
                            p.text = para
            elif isinstance(data, datetime):
                data = data.date().isoformat()
                el.text = data
            else:
                el.text = data
    
    return etree.ElementTree(xml)

In [143]:
def terms_to_xml(terms, root):
    xml = etree.Element(root)
    
    for term in terms:
        if not pd.isna(term) and not pd.isnull(term):
            p = etree.SubElement(xml, 'p')
            p.text = term
            
    return etree.ElementTree(xml)

In [141]:
xml = collection_to_xml(collection)
etree.dump(xml.getroot())
xml.write('collection.xml', encoding='utf-8', method='xml', pretty_print=True)

<collection>
  <institution_id>
    <p>Example identifier</p>
  </institution_id>
  <repository_name>
    <p>Example repository</p>
  </repository_name>
  <collection_id>
    <p>Example collection ID</p>
  </collection_id>
  <title>
    <p>Example title</p>
  </title>
  <date_creation_collection>
    <p>Example collection creation date</p>
  </date_creation_collection>
  <date_creation_record>2018-03-19</date_creation_record>
  <date_revision_record>2018-03-20</date_revision_record>
  <date_deletion_record>2018-03-21</date_deletion_record>
  <level_of_description>
    <p>Example level of description</p>
  </level_of_description>
  <extent_medium>
    <p>Example extent</p>
  </extent_medium>
  <name_creator>
    <p>Example name of creator</p>
  </name_creator>
  <archival_history>
    <p>Example archival history. Multi-line data - multi-paragraph.</p>
    <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Etiam et nibh nec nulla vulputate ullamcorper at vitae massa. Aliquam hen

In [164]:
excel = pd.ExcelFile(data)
for sn in excel.sheet_names[1:]:
    df = pd.read_excel(data, sn)
    term_name = df.columns[0].lower()
    
    xml = terms_to_xml(df[df.columns[0]], term_name.replace(' ', '_'))
    etree.dump(xml.getroot())
    xml.write('{}.xml'.format(term_name), encoding='utf-8', method='xml', pretty_print=True)

<subjects>
  <p>Example subject 1</p>
  <p>Example subject 2</p>
  <p>Example subject 3</p>
  <p>Example subject 4</p>
</subjects>
<places>
  <p>Example place 1</p>
  <p>Example place 2</p>
  <p>Example place 3</p>
  <p>Example place 4</p>
</places>
<personal_names>
  <p>Example name 1</p>
  <p>Example name 2</p>
  <p>Example name 3</p>
  <p>Example name 4</p>
</personal_names>
<corporate_names>
  <p>Example corporate 1</p>
  <p>Example corporate 2</p>
  <p>Example corporate 3</p>
  <p>Example corporate 4</p>
</corporate_names>
