# Parsing XML documents

## Example XML

The first examples below parse this OPML document, saved as `podcasts.opml`:

<?xml version="1.0" encoding="UTF-8"?>
<opml version="1.0">
<head>
	<title>My Podcasts</title>
	<dateCreated>Sun, 07 Mar 2010 15:53:26 GMT</dateCreated>
	<dateModified>Sun, 07 Mar 2010 15:53:26 GMT</dateModified>
</head>
<body>
  <outline text="Science and Tech">
    <outline text="APM: Future Tense" type="rss" 
             xmlUrl="http://www.publicradio.org/columns/futuretense/podcast.xml" 
             htmlUrl="http://www.publicradio.org/columns/futuretense/" />
	<outline text="Engines Of Our Ingenuity Podcast" type="rss" 
             xmlUrl="http://www.npr.org/rss/podcast.php?id=510030" 
             htmlUrl="http://www.uh.edu/engines/engines.htm" />
	<outline text="Science &#38; the City" type="rss" 
             xmlUrl="http://www.nyas.org/Podcasts/Atom.axd" 
             htmlUrl="http://www.nyas.org/WhatWeDo/SciencetheCity.aspx" />
  </outline>
  <outline text="Books and Fiction">
	<outline text="Podiobooker" type="rss" 
             xmlUrl="http://feeds.feedburner.com/podiobooks" 
             htmlUrl="http://www.podiobooks.com/blog" />
	<outline text="The Drabblecast" type="rss" 
             xmlUrl="http://web.me.com/normsherman/Site/Podcast/rss.xml" 
             htmlUrl="http://web.me.com/normsherman/Site/Podcast/Podcast.html" />
	<outline text="tor.com / category / tordotstories" type="rss" 
             xmlUrl="http://www.tor.com/rss/category/TorDotStories" 
             htmlUrl="http://www.tor.com/" />
  </outline>
  <outline text="Computers and Programming">
	<outline text="MacBreak Weekly" type="rss" 
             xmlUrl="http://leo.am/podcasts/mbw" 
             htmlUrl="http://twit.tv/mbw" />
	<outline text="FLOSS Weekly" type="rss" 
             xmlUrl="http://leo.am/podcasts/floss" 
             htmlUrl="http://twit.tv" />
	<outline text="Core Intuition" type="rss" 
             xmlUrl="http://www.coreint.org/podcast.xml" 
             htmlUrl="http://www.coreint.org/" />
  </outline>
  <outline text="Python">
    <outline text="PyCon Podcast" type="rss" 
             xmlUrl="http://advocacy.python.org/podcasts/pycon.rss" 
             htmlUrl="http://advocacy.python.org/podcasts/" />
	<outline text="A Little Bit of Python" type="rss" 
             xmlUrl="http://advocacy.python.org/podcasts/littlebit.rss" 
             htmlUrl="http://advocacy.python.org/podcasts/" />
	<outline text="Django Dose Everything Feed" type="rss" 
             xmlUrl="http://djangodose.com/everything/feed/" />
  </outline>
  <outline text="Miscelaneous">
	<outline text="dhellmann's CastSampler Feed" type="rss" 
             xmlUrl="http://www.castsampler.com/cast/feed/rss/dhellmann/" 
             htmlUrl="http://www.castsampler.com/users/dhellmann/" />
  </outline>
</body>
</opml>

In [7]:
from xml.etree import ElementTree

# Open in binary mode so the XML parser decodes the bytes itself, using the
# encoding named in the document's XML declaration, rather than relying on the
# platform's default text encoding.
with open('podcasts.opml', 'rb') as f:
    tree = ElementTree.parse(f)

# Walk every <outline> element in document order. Elements that have both a
# 'text' and an 'xmlUrl' attribute are printed as "text :: url"; any other
# outline is printed on its own line, preceded by a blank line.
for node in tree.iter('outline'):
    name = node.attrib.get('text')
    url = node.attrib.get('xmlUrl')
    if name and url:
        print("%s :: %s" % (name, url))
    else:
        print("\n%s" % name)


Science and Tech
APM: Future Tense :: http://www.publicradio.org/columns/futuretense/podcast.xml
Engines Of Our Ingenuity Podcast :: http://www.npr.org/rss/podcast.php?id=510030
Science & the City :: http://www.nyas.org/Podcasts/Atom.axd

Books and Fiction
Podiobooker :: http://feeds.feedburner.com/podiobooks
The Drabblecast :: http://web.me.com/normsherman/Site/Podcast/rss.xml
tor.com / category / tordotstories :: http://www.tor.com/rss/category/TorDotStories

Computers and Programming
MacBreak Weekly :: http://leo.am/podcasts/mbw
FLOSS Weekly :: http://leo.am/podcasts/floss
Core Intuition :: http://www.coreint.org/podcast.xml

Python
PyCon Podcast :: http://advocacy.python.org/podcasts/pycon.rss
A Little Bit of Python :: http://advocacy.python.org/podcasts/littlebit.rss
Django Dose Everything Feed :: http://djangodose.com/everything/feed/

Miscelaneous
dhellmann's CastSampler Feed :: http://www.castsampler.com/cast/feed/rss/dhellmann/


# Finding Nodes in a document

In [20]:
# Select only <outline> elements nested directly inside another <outline>
# and print the value of each one's 'xmlUrl' attribute.
for feed in tree.findall(".//outline/outline"):
    print(feed.get("xmlUrl"))

http://www.publicradio.org/columns/futuretense/podcast.xml
http://www.npr.org/rss/podcast.php?id=510030
http://www.nyas.org/Podcasts/Atom.axd
http://feeds.feedburner.com/podiobooks
http://web.me.com/normsherman/Site/Podcast/rss.xml
http://www.tor.com/rss/category/TorDotStories
http://leo.am/podcasts/mbw
http://leo.am/podcasts/floss
http://www.coreint.org/podcast.xml
http://advocacy.python.org/podcasts/pycon.rss
http://advocacy.python.org/podcasts/littlebit.rss
http://djangodose.com/everything/feed/
http://www.castsampler.com/cast/feed/rss/dhellmann/


# Parse Node Attributes

Contents of `data.xml`, the document parsed by the following cells:

<?xml version="1.0" encoding="UTF-8"?>
<top>
  <child>This child contains text.</child>
  <child_with_tail>This child has regular text.</child_with_tail>And "tail" text.
  <with_attributes name="value" foo="bar" />
  <entity_expansion attribute="This &#38; That">That &#38; This</entity_expansion>
</top>

In [39]:
from xml.etree import ElementTree

# Open in binary mode so the XML parser decodes the bytes itself, using the
# encoding named in the document's XML declaration, rather than relying on the
# platform's default text encoding.
with open("data.xml", 'rb') as f:
    tree = ElementTree.parse(f)

# Locate the <with_attributes> child of the root and show its tag and attributes.
node = tree.find("./with_attributes")
print(node.tag)
print(sorted(node.attrib.items()))

# node.attrib is dictionary-like; sort the items so the display order is stable.
for name, value in sorted(node.attrib.items()):
    print("%-4s => %s" % (name, value))

# Another way: look up specific attribute values by name.
for key in ["name", "foo"]:
    print("\nPrint '%s' attribute" % key)
    print("%s => %s" % (key, node.attrib[key]))
    

with_attributes
[('foo', 'bar'), ('name', 'value')]
foo  => bar
name => value

Print 'name' attribute
name => value

Print 'foo' attribute
foo => bar


In [34]:
# For each element, print its tag, its text, and its tail on separate lines.
for xpath in ("./child", "./child_with_tail"):
    element = tree.find(xpath)
    print(element.tag, element.text, element.tail, sep="\n")

child
This child contains text.

  
child_with_tail
This child has regular text.
And "tail" text.
  


In [49]:
from xml.etree import ElementTree

# Build an element tree directly from an XML string literal.
xml_content = ElementTree.XML('''
<top>
  <parent id="A">
    <child id="4300110224" num="0"/>
    <child id="4300110288" num="1"/>
    <child id="4300110480" num="2"/>
  </parent>
  <parent id="B">
    <!-- <child id="4300110224" num="0"/> -->
    <child id="4300110288" num="1"/>
    <child id="4300110480" num="2"/>
  </parent>
</top>
''')

# Collect the 'id' of every <child> found at any depth, then print one per line.
# The commented-out <child> under parent "B" is not part of the parsed tree.
child_ids = [child.attrib["id"] for child in xml_content.findall(".//child")]
for child_id in child_ids:
    print(child_id)

4300110224
4300110288
4300110480
4300110288
4300110480
