In [None]:
import pandas as pd
import xml.etree.ElementTree as et
import codecs

In [None]:
class BDN:
    """Flatten the BDNTERM resources of an XML export into a CSV import file.

    Intended call order: ``parse_types()`` discovers the column names,
    ``parse_values()`` fills one row per BDNTERM resource, and ``bdn_csv()``
    writes the result to disk.
    """

    def __init__(self, xml_path="/xml/Export.xml"):
        """Parse the XML export and create empty result containers.

        Args:
            xml_path: Path of the XML export to read. Defaults to the
                location this notebook has always used.
        """
        self.xml = et.parse(xml_path)
        self.types = []  # list of different Attribute types (for debugging purposes)
        self.std_attrs = ["Name", "Path"]  # list of different Attributes (Standard)
        self.non_std_attrs = []  # list of different Attributes (Non-Standard)
        self.bdn = {}  # BDN structure as a dictionary of lists
        self.df = pd.DataFrame()

    @staticmethod
    def _attributes(element):
        """Return the ``attribute`` children of *element*'s ``Attributes`` child ([] if absent)."""
        holder = element.find("Attributes")
        return [] if holder is None else holder.findall("attribute")

    @staticmethod
    def _dependencies(resource):
        """Return the last type "I" and the last type "A" ``dependency`` of *resource*.

        Either item is ``None`` when no such dependency exists. The checks use
        ``is None`` on purpose: testing an Element's truth value is deprecated
        and reports "has children", not "was found".
        """
        attr_dep = None  # carries BDNATTRIB / BDNNOTE resources
        link_dep = None  # carries BDNTAG / BDNTERMREF resources
        holder = resource.find("Dependencies")
        if holder is not None:
            for dep in holder.findall("dependency"):
                if dep.attrib['type'] == "I":
                    attr_dep = dep
                elif dep.attrib['type'] == "A":  # Tag or Related Term
                    link_dep = dep
        return attr_dep, link_dep

    def _terms(self):
        """Return the top-level ``Resource`` elements whose type is BDNTERM."""
        return [r for r in self.xml.findall("Resource") if r.attrib['type'] == "BDNTERM"]

    def _register(self, label):
        """Record *label* as a non-standard column unless it is already a known column."""
        if label not in self.std_attrs and label not in self.non_std_attrs:
            self.non_std_attrs.append(label)

    @staticmethod
    def _store(row, column, value):
        """Set a standard value in *row*; raise KeyError for a column parse_types never saw."""
        if column not in row:
            raise KeyError(column)
        row[column] = value

    @staticmethod
    def _collect(row, label, text, std_names):
        """Append *text* to the comma-separated non-standard value *label* in *row*.

        Labels that clash with a standard column have no column of their own
        (parse_types leaves them out), so they are skipped instead of raising.
        """
        if label in std_names:
            return
        current = row[label]
        # A separator is only inserted once something has been accumulated.
        row[label] = current + "," + text if current else text

    def parse_types(self):
        """Discover the standard and non-standard column names.

        Fills ``self.std_attrs``, ``self.non_std_attrs`` and ``self.types`` in
        place. All standard names are collected first so that the "already a
        standard attribute" check does not depend on the order of the terms.
        """
        std_attrs = self.std_attrs
        types = self.types
        terms = self._terms()

        # Pass 1: standard attributes.
        for resource in terms:
            for attribute in self._attributes(resource):
                name = attribute.attrib['name']
                # Handle the case when "Type" appears in Standard AND
                # Non-Standard Attributes: the standard one is kept as "Type_".
                column = "Type_" if name == "Type" else name
                if column not in std_attrs:
                    std_attrs.append(column)

        # Pass 2: non-standard attributes, notes, tags and related terms.
        for resource in terms:
            attr_dep, link_dep = self._dependencies(resource)

            if attr_dep is not None:
                for r in attr_dep.findall("Resource"):
                    kind = r.attrib['type']
                    if kind not in types:
                        types.append(kind)
                    if kind == "BDNATTRIB":
                        self._register(r.attrib['label'])
                    elif kind == "BDNNOTE":  # Notes
                        self._register("Notes")

            if link_dep is not None:
                for r in link_dep.findall("Resource"):
                    if r.attrib['type'] == "BDNTAG":  # Tags
                        self._register("Tags")
                    if r.attrib['type'] == "BDNTERMREF":  # Related Terms
                        self._register("Related Terms")

    def parse_values(self):
        """Fill ``self.bdn`` with one value per column for every BDNTERM resource.

        Each term is assembled as a complete row (missing values become "")
        before being appended, so every column list has the same length even
        when a term lacks a standard attribute. ``parse_types()`` must have
        run first.

        Raises:
            KeyError: If a value belongs to a column that ``parse_types()``
                did not register.
        """
        std_names = set(self.std_attrs)
        columns = list(dict.fromkeys(self.std_attrs + self.non_std_attrs))
        bdn = self.bdn

        # Initialize Attribute value lists in the BDN dictionary
        for column in columns:
            bdn[column] = []

        for resource in self._terms():
            row = dict.fromkeys(columns, "")

            # Standard Attribute values
            self._store(row, 'Name', resource.attrib['label'])
            self._store(row, 'Path', resource.attrib['identity'])
            for attribute in self._attributes(resource):
                name = attribute.attrib['name']
                value = attribute.attrib['value']
                if name == "Description":
                    # Literal quotes; bdn_csv turns them into CSV quoting.
                    self._store(row, name, "\"" + value + "\"")
                elif name == "Type":
                    self._store(row, "Type_", value)
                else:
                    self._store(row, name, value)

            # Non-Standard Attribute values (multiple values are comma-joined)
            attr_dep, link_dep = self._dependencies(resource)

            if attr_dep is not None:
                for r in attr_dep.findall("Resource"):
                    if r.attrib['type'] == "BDNATTRIB":
                        for a in self._attributes(r):
                            if a.attrib['name'] == "Value":
                                self._collect(row, r.attrib['label'], a.attrib['value'], std_names)
                    elif r.attrib['type'] == "BDNNOTE":  # Notes
                        for a in self._attributes(r):
                            if a.attrib['name'] == "Content":
                                self._collect(row, 'Notes', a.attrib['value'], std_names)

            if link_dep is not None:
                for r in link_dep.findall("Resource"):
                    if r.attrib['type'] == "BDNTAG":  # Tags
                        self._collect(row, 'Tags', r.attrib['identity'], std_names)
                    elif r.attrib['type'] == "BDNTERMREF":  # Related Terms
                        self._collect(row, 'Related Terms', r.attrib['identity'], std_names)

            for column in columns:
                bdn[column].append(row[column])

    def bdn_csv(self, csv_path="/csv/Import.csv", temp_path="/csv/Import - temp.csv"):
        """Build ``self.df`` from ``self.bdn`` and write the CSV import file.

        The frame is sorted by ``Path`` and first written to *temp_path*; that
        file is then copied to *csv_path* with two textual adjustments.

        Args:
            csv_path: Destination of the final BDN CSV import file.
            temp_path: Destination of the intermediate pandas output (it is
                left on disk).
        """
        self.df = pd.DataFrame(self.bdn)
        self.df = self.df.sort_values(by=['Path'])
        self.df.to_csv(temp_path, index=False, encoding='utf-8')

        # Context managers close both files even if the copy fails midway.
        with codecs.open(temp_path, 'r', encoding='utf-8') as rf, \
                codecs.open(csv_path, 'w', encoding='utf-8') as wf:
            for number, line in enumerate(rf):
                # parse_values wrapped Description in literal quotes, which
                # to_csv escaped as ("""); collapse them back to (").
                line = line.replace('"""', '"')
                if number == 0:
                    # Only the header carries the "Type_" placeholder; data
                    # rows are left alone so values containing it survive.
                    line = line.replace("Type_", "Type")
                wf.write(line)

In [None]:
def parseTypes(xml):
    """Discover the column names used by the BDNTERM resources of *xml*.

    Appends, in place, to the module-level lists ``std_attrs`` (standard
    attribute names), ``non_std_attrs`` (BDNATTRIB labels plus "Notes",
    "Tags" and "Related Terms" when present) and ``types`` (resource types
    seen under a type "I" dependency).

    All standard names are collected first so that the "already a standard
    attribute" check does not depend on the order of the terms.

    Args:
        xml: Parsed export; anything offering ``findall("Resource")``.
    """
    def child_attributes(element):
        # A term without an <Attributes> child simply has no attributes.
        holder = element.find("Attributes")
        return [] if holder is None else holder.findall("attribute")

    def register(label):
        # A label already known as a column must not be listed a second time.
        if label not in std_attrs and label not in non_std_attrs:
            non_std_attrs.append(label)

    terms = [r for r in xml.findall("Resource") if r.attrib['type'] == "BDNTERM"]

    # Pass 1: Standard Attributes
    for resource in terms:
        for attribute in child_attributes(resource):
            name = attribute.attrib['name']
            # Handle the case when "Type" appears in Standard AND Non-Standard
            # Attributes: the standard one is kept as "Type_".
            column = "Type_" if name == "Type" else name
            if column not in std_attrs:
                std_attrs.append(column)

    # Pass 2: Non-Standard Attributes, Notes, Tags and Related Terms
    for resource in terms:
        # Compare with None: testing an Element's truth value is deprecated
        # and reports "has children", not "was found".
        holder = resource.find("Dependencies")
        if holder is None:
            continue

        dependency = None
        tag_or_ref = None
        for d in holder.findall("dependency"):
            if d.attrib['type'] == "I":
                dependency = d
            elif d.attrib['type'] == "A":  # Tag or Related Term
                tag_or_ref = d

        if dependency is not None:
            for r in dependency.findall("Resource"):
                kind = r.attrib['type']
                if kind not in types:
                    types.append(kind)
                if kind == "BDNATTRIB":
                    register(r.attrib['label'])
                elif kind == "BDNNOTE":  # Notes
                    register("Notes")

        if tag_or_ref is not None:
            for r in tag_or_ref.findall("Resource"):
                if r.attrib['type'] == "BDNTAG":  # Tags
                    register("Tags")
                if r.attrib['type'] == "BDNTERMREF":  # Related Terms
                    register("Related Terms")

In [None]:
def parseValues(xml):
    """Fill the module-level ``bdn`` dict with one value per column and term.

    Reads the module-level ``std_attrs`` and ``non_std_attrs`` lists (as
    produced by ``parseTypes``) and resets ``bdn[column]`` to a fresh list for
    each of them. Every BDNTERM resource is assembled as a complete row
    (missing values become "") before being appended, so all column lists end
    up with the same length even when a term lacks a standard attribute.

    Args:
        xml: Parsed export; anything offering ``findall("Resource")``.

    Raises:
        KeyError: If a value belongs to a column that is in neither list.
    """
    def child_attributes(element):
        # A resource without an <Attributes> child simply has no attributes.
        holder = element.find("Attributes")
        return [] if holder is None else holder.findall("attribute")

    def store(row, column, value):
        # Standard value; an unknown column is an error rather than a silent drop.
        if column not in row:
            raise KeyError(column)
        row[column] = value

    def collect(row, label, text):
        # Non-standard value. A label that clashes with a standard column has
        # no column of its own, so it is skipped instead of raising KeyError.
        if label in std_names:
            return
        current = row[label]
        # A separator is only inserted once something has been accumulated.
        row[label] = current + "," + text if current else text

    std_names = set(std_attrs)
    columns = list(dict.fromkeys(std_attrs + non_std_attrs))

    # Initialize Attribute value lists in the BDN dictionary
    for column in columns:
        bdn[column] = []

    # Parse Attribute values
    for resource in xml.findall("Resource"):
        if resource.attrib['type'] != "BDNTERM":
            continue

        row = dict.fromkeys(columns, "")

        # Parse Standard Attribute values
        store(row, 'Name', resource.attrib['label'])
        store(row, 'Path', resource.attrib['identity'])
        for attribute in child_attributes(resource):
            name = attribute.attrib['name']
            value = attribute.attrib['value']
            if name == "Description":
                # Literal quotes; bdncsv later turns them into CSV quoting.
                store(row, name, "\"" + value + "\"")
            elif name == "Type":
                # Handle the case when "Type" appears in Standard AND Non-Standard Attributes
                store(row, "Type_", value)
            else:
                store(row, name, value)

        # Parse Non Standard Attribute values
        dependency = None
        tag_or_ref = None
        # Compare with None: testing an Element's truth value is deprecated
        # and reports "has children", not "was found".
        holder = resource.find("Dependencies")
        if holder is not None:
            for d in holder.findall("dependency"):
                if d.attrib['type'] == "I":
                    dependency = d
                elif d.attrib['type'] == "A":  # Tag or Related Term
                    tag_or_ref = d

        if dependency is not None:
            for r in dependency.findall("Resource"):
                if r.attrib['type'] == "BDNATTRIB":
                    for a in child_attributes(r):
                        if a.attrib['name'] == "Value":
                            collect(row, r.attrib['label'], a.attrib['value'])
                elif r.attrib['type'] == "BDNNOTE":  # Notes
                    for a in child_attributes(r):
                        if a.attrib['name'] == "Content":
                            collect(row, 'Notes', a.attrib['value'])

        if tag_or_ref is not None:
            for r in tag_or_ref.findall("Resource"):
                if r.attrib['type'] == "BDNTAG":  # Tags
                    collect(row, 'Tags', r.attrib['identity'])
                elif r.attrib['type'] == "BDNTERMREF":  # Related Terms
                    collect(row, 'Related Terms', r.attrib['identity'])

        for column in columns:
            bdn[column].append(row[column])

In [None]:
def bdncsv(temp_path="/csv/Import - temp.csv", out_path="/csv/Import.csv"):
    """Copy the intermediate CSV to the final BDN CSV import file.

    Two textual adjustments are made while copying:

    * every (\"\"\") becomes ("), on all lines;
    * "Type_" becomes "Type" on the first (header) line only, so data values
      that happen to contain "Type_" are left intact.

    Args:
        temp_path: CSV file to read (UTF-8).
        out_path: CSV file to write (UTF-8).
    """
    # Context managers close both files even if the copy fails midway.
    with codecs.open(temp_path, 'r', encoding='utf-8') as rf, \
            codecs.open(out_path, 'w', encoding='utf-8') as wf:
        for number, line in enumerate(rf):
            line = line.replace('"""', '"')  # Replace (""") with (")
            if number == 0:
                # Handle the case when "Type" appears in Standard AND Non-Standard Attributes
                line = line.replace("Type_", "Type")
            wf.write(line)

In [None]:
# Module-level working state. parseTypes and parseValues read and fill these
# names directly; a `global` statement is unnecessary (and a no-op) here
# because this cell already runs at module scope.
xml = et.parse("/xml/Export.xml")
types = [] # list of different Attribute types (for debugging purposes)
std_attrs = ["Name", "Path"] # list of different Attributes (Standard)
non_std_attrs = [] # list of different Attributes (Non-Standard)
bdn = {} # BDN structure as a dictionary of lists

# Discover the columns first, then fill one value per column for every term.
parseTypes(xml)
parseValues(xml)

# Write the intermediate CSV, sorted by Path, that bdncsv() post-processes.
df = pd.DataFrame(bdn)
df = df.sort_values(by=['Path'])
df.to_csv("/csv/Import - temp.csv", index=False, encoding='utf-8')

bdncsv()

In [None]:
# Display the first rows of the sorted DataFrame.
df.head()

In [None]:
# Same export, driven through the BDN class.
# NOTE: this rebinds the module-level name `bdn` (previously the dict filled
# by parseValues) to a BDN instance.
bdn = BDN()

# Column discovery must run before the values are parsed.
bdn.parse_types()
bdn.parse_values()

# Builds bdn.df and writes the temp and final CSV files.
bdn.bdn_csv()

In [None]:
# Display the first rows of the DataFrame built by bdn_csv().
bdn.df.head()