# XML Parsing

- The ElementTree XML API Docs: https://docs.python.org/3/library/xml.etree.elementtree.html
- Python XML with ElementTree: Beginner's Guide: https://www.datacamp.com/community/tutorials/python-xml-elementtree

In [5]:
import xml.etree.ElementTree as ET

# Load the XML document from disk. ET.parse() returns an ElementTree
# wrapper; getroot() hands back its top-level element.
tree = ET.parse('./data/country_data.xml')
root = tree.getroot()

# Show the root element's tag name and its attribute dict, one per line.
print(root.tag, root.attrib, sep='\n')

# Iterating an element yields its direct children in document order.
for child_elem in root:
    print(child_elem.tag, child_elem.attrib)

# Elements can be indexed like nested lists: this is the text of the
# second child of the root's first child.
root[0][1].text

data
{}
country {'name': 'Liechtenstein'}
country {'name': 'Singapore'}
country {'name': 'Panama'}


'2008'

In [6]:
# Read XML from an in-memory string and get the root node.
country_data_as_string = """
<data>
    <country name="Liechtenstein">
        <rank>1</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor name="Austria" direction="E"/>
        <neighbor name="Switzerland" direction="W"/>
    </country>
    <country name="Singapore">
        <rank>4</rank>
        <year>2011</year>
        <gdppc>59900</gdppc>
        <neighbor name="Malaysia" direction="N"/>
    </country>
    <country name="Panama">
        <rank>68</rank>
        <year>2011</year>
        <gdppc>13600</gdppc>
        <neighbor name="Costa Rica" direction="W"/>
        <neighbor name="Colombia" direction="E"/>
    </country>
</data>
"""
# Unlike ET.parse(), ET.fromstring() returns the root Element directly,
# so there is no ElementTree wrapper and no getroot() call.
root = ET.fromstring(country_data_as_string)

# Show the root element's tag name and its attribute dict, one per line.
print(root.tag, root.attrib, sep='\n')

# Each direct child of <data> is one <country> element.
for child_elem in root:
    print(child_elem.tag, child_elem.attrib)

# Index access: root[0] is the first <country> (Liechtenstein) and
# root[0][1] is its second child, the <year> element.
root[0][1].text

data
{}
country {'name': 'Liechtenstein'}
country {'name': 'Singapore'}
country {'name': 'Panama'}


'2008'

In [7]:
# Pull API for non-blocking parsing: the caller pushes text into the
# parser with feed() (it may be supplied in several chunks) and then
# pulls whatever (event, element) pairs are ready via read_events().
# Here the whole document is fed in a single call.
parser = ET.XMLPullParser(events=('start', 'end'))
parser.feed(country_data_as_string)
# read_events() is an iterator; unpack it into a list so the cell shows it.
[*parser.read_events()]

[('start', <Element 'data' at 0x000001181178EC70>),
 ('start', <Element 'country' at 0x000001181178EDB0>),
 ('start', <Element 'rank' at 0x000001181178EF90>),
 ('end', <Element 'rank' at 0x000001181178EF90>),
 ('start', <Element 'year' at 0x000001181178EC20>),
 ('end', <Element 'year' at 0x000001181178EC20>),
 ('start', <Element 'gdppc' at 0x000001181178EE50>),
 ('end', <Element 'gdppc' at 0x000001181178EE50>),
 ('start', <Element 'neighbor' at 0x000001181178ECC0>),
 ('end', <Element 'neighbor' at 0x000001181178ECC0>),
 ('start', <Element 'neighbor' at 0x000001181178EF40>),
 ('end', <Element 'neighbor' at 0x000001181178EF40>),
 ('end', <Element 'country' at 0x000001181178EDB0>),
 ('start', <Element 'country' at 0x000001181178E040>),
 ('start', <Element 'rank' at 0x00000118117AD2C0>),
 ('end', <Element 'rank' at 0x00000118117AD2C0>),
 ('start', <Element 'year' at 0x00000118117AD270>),
 ('end', <Element 'year' at 0x00000118117AD270>),
 ('start', <Element 'gdppc' at 0x00000118117AD360>),


In [8]:
# Walk the pull-parser event stream and print, for each 'start'/'end'
# event, the event name followed by the element's tag and current text.
# NOTE: the ElementTree docs only guarantee a fully populated element on
# 'end' events; on 'start' the contents of .text are undefined, so the
# values printed for 'start' should not be relied upon.
parser = ET.XMLPullParser(events=('start', 'end'))
parser.feed(country_data_as_string)
for event, elem in parser.read_events():
    print(event, f"{elem.tag} text= {elem.text}", sep='\n')

start
data text= 
    
start
country text= 
        
start
rank text= 1
end
rank text= 1
start
year text= 2008
end
year text= 2008
start
gdppc text= 141100
end
gdppc text= 141100
start
neighbor text= None
end
neighbor text= None
start
neighbor text= None
end
neighbor text= None
end
country text= 
        
start
country text= 
        
start
rank text= 4
end
rank text= 4
start
year text= 2011
end
year text= 2011
start
gdppc text= 59900
end
gdppc text= 59900
start
neighbor text= None
end
neighbor text= None
end
country text= 
        
start
country text= 
        
start
rank text= 68
end
rank text= 68
start
year text= 2011
end
year text= 2011
start
gdppc text= 13600
end
gdppc text= 13600
start
neighbor text= None
end
neighbor text= None
start
neighbor text= None
end
neighbor text= None
end
country text= 
        
end
data text= 
    


In [9]:
# Finding interesting elements: Element.iter(tag) walks the entire
# subtree below root in document order, so it reaches every <neighbor>
# regardless of which <country> contains it.
for neighbor_elem in root.iter('neighbor'):
    print(neighbor_elem.attrib)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}
{'name': 'Malaysia', 'direction': 'N'}
{'name': 'Costa Rica', 'direction': 'W'}
{'name': 'Colombia', 'direction': 'E'}


In [10]:
# findall() matches only direct children of root. For each <country>,
# the name comes from an attribute and the rank from a child element's text.
for country_elem in root.findall('country'):
    print(country_elem.get('name'), country_elem.find('rank').text)

Liechtenstein 1
Singapore 4
Panama 68


In [12]:
# Modifying an XML file: re-read the document, add one to every <rank>,
# mark each changed element with updated="yes", and write the result to
# a separate output file so the input file is left as it was.
tree = ET.parse('./data/country_data.xml')
root = tree.getroot()
for rank_elem in root.iter('rank'):
    # Element text is a string, so convert to int and back around the increment.
    rank_elem.text = str(int(rank_elem.text) + 1)
    rank_elem.set('updated', 'yes')
tree.write('./data/output.xml')

## Try It Out: XML Parsing in Python

- https://www.geeksforgeeks.org/xml-parsing-python