In [None]:
import pandas as pd
import xml.etree.ElementTree as et
import codecs

In [None]:
def parseTypes(xml):
    """Collect the attribute names used by BDNTERM resources.

    Scans every direct ``Resource`` child of ``xml`` whose ``type`` is
    "BDNTERM" and appends, without duplicates, to the module-level lists:

    * ``StdAttributes``    - the ``name`` of each ``Attributes/attribute``
      entry of the term (a standard attribute called "Type" is stored as
      "Type_" so it cannot clash with a non-standard attribute of that name).
    * ``Types``            - the ``type`` of every resource nested under the
      term's dependency of type "I" (kept for debugging purposes).
    * ``NonStdAttributes`` - the ``label`` of every nested BDNATTRIB resource
      that is not already a standard attribute, plus the fixed names "Notes",
      "Tags" and "Related Terms" when a BDNNOTE, BDNTAG or BDNTERMREF
      resource is found.

    If a term has several dependencies of the same type, only the last one
    is inspected.

    Args:
        xml: an ElementTree / Element whose ``findall("Resource")`` yields the
            exported resources.

    Returns:
        None. The three module-level lists are modified in place.
    """
    for Resource in xml.findall("Resource"):
        if Resource.attrib['type'] != "BDNTERM":
            continue

        # Standard Attributes
        attributes = Resource.find("Attributes")
        if attributes is not None:
            for attribute in attributes.findall("attribute"):
                name = attribute.attrib['name']
                # Handle the case when "Type" appears in Standard AND Non-Standard Attributes
                if name == "Type":
                    name = "Type_"
                if name not in StdAttributes:
                    StdAttributes.append(name)

        # Compare against None explicitly: the truth value of an Element is
        # deprecated and must not be used to test whether find() matched.
        dependencies = Resource.find("Dependencies")
        if dependencies is None:
            continue

        dependency = None  # dependency of type "I": non-standard attributes and notes
        tagORref = None    # dependency of type "A": tags and related terms
        for d in dependencies.findall("dependency"):
            if d.attrib['type'] == "I":
                dependency = d
            elif d.attrib['type'] == "A":
                tagORref = d

        # Non-Standard Attributes
        if dependency is not None:
            for r in dependency.findall("Resource"):
                rtype = r.attrib['type']
                if rtype not in Types:
                    Types.append(rtype)
                if rtype == "BDNATTRIB":
                    label = r.attrib['label']
                    if label not in NonStdAttributes and label not in StdAttributes:
                        NonStdAttributes.append(label)
                elif rtype == "BDNNOTE" and "Notes" not in NonStdAttributes:
                    NonStdAttributes.append("Notes")

        # Tags and Related Terms
        if tagORref is not None:
            for r in tagORref.findall("Resource"):
                rtype = r.attrib['type']
                if rtype == "BDNTAG" and "Tags" not in NonStdAttributes:
                    NonStdAttributes.append("Tags")
                if rtype == "BDNTERMREF" and "Related Terms" not in NonStdAttributes:
                    NonStdAttributes.append("Related Terms")

In [None]:
def _attributesOf(element):
    """Return the ``Attributes/attribute`` children of element, or [] when it has no ``Attributes`` child."""
    container = element.find("Attributes")
    if container is None:
        return []
    return container.findall("attribute")


def _appendValue(row, column, value):
    """Add value to row[column], comma-separating it from any value already stored there."""
    if len(row[column]) > 0:
        row[column] += ","
    row[column] += value


def parseValues(xml):
    """Fill the module-level ``BDN`` dictionary with one value per term and column.

    ``BDN`` gets one list per name in ``StdAttributes + NonStdAttributes``.
    For every direct ``Resource`` child of ``xml`` whose ``type`` is "BDNTERM"
    exactly one entry is appended to each of those lists, so all lists end up
    with the same length:

    * "Name" / "Path" come from the term's ``label`` / ``identity``.
    * Standard attributes come from the term's ``Attributes/attribute``
      entries; "Description" values are wrapped in double quotes and "Type"
      is stored under "Type_".
    * Non-standard attributes come from the BDNATTRIB ("Value") and BDNNOTE
      ("Content") resources of the dependency of type "I", and from the
      BDNTAG / BDNTERMREF identities of the dependency of type "A".
    * A column the term has no value for receives "".
    * Several values for the same column are joined with ",".

    A BDNATTRIB whose label is not listed in ``NonStdAttributes`` is skipped.
    If a term has several dependencies of the same type, only the last one
    is read.

    Args:
        xml: an ElementTree / Element whose ``findall("Resource")`` yields the
            exported resources.

    Returns:
        None. ``BDN`` is modified in place.
    """
    # Initialize Attribute value lists in the BDN dictionary
    columns = StdAttributes + NonStdAttributes
    for a in columns:
        BDN[a] = []
    nonStandard = set(NonStdAttributes)

    # Parse Attribute values
    for Resource in xml.findall("Resource"):
        if Resource.attrib['type'] != "BDNTERM":
            continue

        # One cell per column, defaulting to "": a term that lacks an
        # attribute still contributes a value, keeping all columns aligned.
        row = dict.fromkeys(columns, "")

        # Parse Standard Attribute values
        for attribute in _attributesOf(Resource):
            name = attribute.attrib['name']
            value = attribute.attrib['value']
            if name == "Description":
                value = "\"" + value + "\""
            elif name == "Type":
                # Handle the case when "Type" appears in Standard AND Non-Standard Attributes
                name = "Type_"
            _appendValue(row, name, value)
        # Assigned last so the term's own label / identity always fill these columns.
        row['Name'] = Resource.attrib['label']
        row['Path'] = Resource.attrib['identity']

        # Parse Non Standard Attribute values
        dependency = None  # dependency of type "I": non-standard attributes and notes
        tagORref = None    # dependency of type "A": tags and related terms
        # Compare against None explicitly: the truth value of an Element is
        # deprecated and must not be used to test whether find() matched.
        dependencies = Resource.find("Dependencies")
        if dependencies is not None:
            for d in dependencies.findall("dependency"):
                if d.attrib['type'] == "I":
                    dependency = d
                elif d.attrib['type'] == "A":
                    tagORref = d

        if dependency is not None:
            for r in dependency.findall("Resource"):
                if r.attrib['type'] == "BDNATTRIB":
                    label = r.attrib['label']
                    if label not in nonStandard:
                        # No non-standard column exists for this label
                        # (e.g. it matches a standard attribute name).
                        continue
                    for a in _attributesOf(r):
                        if a.attrib['name'] == "Value":
                            _appendValue(row, label, a.attrib['value'])
                elif r.attrib['type'] == "BDNNOTE": # Notes
                    for a in _attributesOf(r):
                        if a.attrib['name'] == "Content":
                            _appendValue(row, 'Notes', a.attrib['value'])

        if tagORref is not None:
            for r in tagORref.findall("Resource"):
                if r.attrib['type'] == "BDNTAG": # Tags
                    _appendValue(row, 'Tags', r.attrib['identity'])
                elif r.attrib['type'] == "BDNTERMREF": # Related Terms
                    _appendValue(row, 'Related Terms', r.attrib['identity'])

        for column, value in row.items():
            BDN[column].append(value)

In [None]:
def bdncsv(source="/csv/Import - temp.csv", target="/csv/Import.csv"):
    """Rewrite the temporary CSV into the final BDN CSV import file.

    Copies ``source`` to ``target`` line by line (both UTF-8) with two
    adjustments:

    * every run of three double quotes is replaced by a single double quote
      (presumably undoing the CSV escaping of the quote-wrapped Description
      values - confirm against the import format);
    * in the header (first) line only, a column named exactly "Type_" is
      renamed to "Type". Data lines are never renamed, so values that happen
      to contain "Type_" are preserved.

    Args:
        source: path of the temporary CSV to read.
        target: path of the BDN CSV import file to write.

    Returns:
        None.
    """
    # Context managers close both files even if reading or writing fails.
    with codecs.open(source, 'r', encoding='utf-8') as rf:
        with codecs.open(target, 'w', encoding='utf-8') as wf: # BDN CSV import file
            for index, line in enumerate(rf):
                line = line.replace('"""', '"') # Replace (""") with (")
                if index == 0:
                    # Handle the case when "Type" appears in Standard AND Non-Standard Attributes
                    body = line.rstrip("\r\n")
                    ending = line[len(body):]  # keep the original line terminator
                    cells = ["Type" if cell == "Type_" else cell for cell in body.split(",")]
                    line = ",".join(cells) + ending
                wf.write(line)

In [None]:
# Input / intermediate file locations used by this cell.
XML_EXPORT_PATH = "/xml/Export.xml"
TEMP_CSV_PATH = "/csv/Import - temp.csv"

# Module-level state filled in by parseTypes() and parseValues().
# (No `global` statement is needed: names assigned at the top level of a
# cell are already module-level.)
xml = et.parse(XML_EXPORT_PATH)
Types = [] # list of different Attribute types (for debugging purposes)
StdAttributes = ["Name", "Path"] # list of different Attributes (Standard)
NonStdAttributes = [] # list of different Attributes (Non-Standard)
BDN = {} # BDN structure as a dictionary of lists

# First pass collects the column names, second pass the values per term.
parseTypes(xml)
parseValues(xml)

# One row per term, ordered by Path, written to the temporary CSV.
df = pd.DataFrame(BDN).sort_values(by=['Path'])
df.to_csv(TEMP_CSV_PATH, index=False, encoding='utf-8')

# bdncsv() reads "/csv/Import - temp.csv" (the file written above) and
# produces the final "/csv/Import.csv".
bdncsv()

In [None]:
# Preview the first rows of the Path-sorted DataFrame as a quick sanity check.
df.head()