# JSON / XML Parsing

## JSON Parsing

In [1]:
# importing the Python json library
import json

### Importing JSON

**From string**

In [2]:
# Example string: a JSON object with two string-valued fields
json_string = '{"first_name": "Guido", "last_name":"Rossum"}'
json_string  # bare last expression so the notebook echoes the value

'{"first_name": "Guido", "last_name":"Rossum"}'

In [3]:
# json.loads decodes a JSON document held in a string into Python objects
parsed_json = json.loads(json_string)
parsed_json

{u'first_name': u'Guido', u'last_name': u'Rossum'}

**From file**

In [4]:
# %load files/example.json

In [5]:
# Decode the JSON document straight from the open file handle: json.load
# reads and parses in one step, so no intermediate string is needed.
with open('files/example.json', 'r') as f:
    parsed_json = json.load(f)

parsed_json

{u'glossary': {u'GlossDiv': {u'GlossList': {u'GlossEntry': {u'Abbrev': u'ISO 8879:1986',
     u'Acronym': u'SGML',
     u'GlossDef': {u'GlossSeeAlso': [u'GML', u'XML'],
      u'para': u'A meta-markup language, used to create markup languages such as DocBook.'},
     u'GlossSee': u'markup',
     u'GlossTerm': u'Standard Generalized Markup Language',
     u'ID': u'SGML',
     u'SortAs': u'SGML'}},
   u'title': u'S'},
  u'title': u'example glossary'}}

### Exporting to JSON

In [6]:
# Example dict mixing plain string values with a list value
d = { 'first_name': 'Guido',
      'second_name': 'Rossum',
      'titles': ['BDFL', 'Developer'],
    }
d

{'first_name': 'Guido',
 'second_name': 'Rossum',
 'titles': ['BDFL', 'Developer']}

In [7]:
# json.dumps serializes the dict to a JSON string (the list becomes a JSON array).
# The key order of the result is not guaranteed to match the literal above.
json_string = json.dumps(d)
json_string

'{"first_name": "Guido", "titles": ["BDFL", "Developer"], "second_name": "Rossum"}'

## XML Parsing

In [8]:
import xml.etree.ElementTree as ET

### Importing XML

**From string**

In [9]:
# Example string: a catalog holding one <book>, which carries an 'id' attribute
xml_string = """<catalog>
   <book id="bk101">
      <author>Gambardella, Matthew</author>
      <title>XML Developer's Guide</title>
      <genre>Computer</genre>
   </book>
</catalog>"""
xml_string

'<catalog>\n   <book id="bk101">\n      <author>Gambardella, Matthew</author>\n      <title>XML Developer\'s Guide</title>\n      <genre>Computer</genre>\n   </book>\n</catalog>'

In [10]:
# ET.fromstring parses XML text and returns the root element directly
root = ET.fromstring(xml_string)
print root.tag     # tag name of the root element
print root.attrib  # attribute dict of the root (empty for <catalog>)

catalog
{}


**From file**

In [11]:
# %load files/example.xml

In [12]:
# ET.parse reads the file into an ElementTree; getroot() gives its top-level element
tree = ET.parse('files/example.xml')
root = tree.getroot()  # rebinds `root`, replacing the element parsed from the string
print root.tag
print root.attrib

catalog
{}


### Getting items

In [13]:
# Each XML element has a tag, optional attributes, a text and a list of children
# XML Element: <tag attr="attr_value"> text </tag>

print root.tag            # tag    : name of XML element
print root.attrib         # attrib : dict of the element's attributes
print root.text           # text   : content of XML element (text before the first child)
for child in root: # iterating an element yields its direct children
    print "\t", child.tag, child.attrib, child.text
    for subchild in child: # second level: the children of each child
        print "\t\t", subchild.tag, subchild.attrib, subchild.text
    print  # blank line between top-level children

catalog
{}

   
	book {'id': 'bk101'} 
      
		author {} Gambardella, Matthew
		title {} XML Developer's Guide
		genre {} Computer
		price {} 44.95
		publish_date {} 2000-10-01
		description {} An in-depth look at creating applications
      with XML.

	book {'id': 'bk103'} 
      
		author {} Corets, Eva
		title {} Maeve Ascendant
		genre {} Fantasy
		price {} 5.95
		publish_date {} 2000-11-17
		description {} After the collapse of a nanotechnology
      society in England, the young survivors lay the
      foundation for a new society.

	book {'id': 'bk109'} 
      
		author {} Kress, Peter
		title {} Paradox Lost
		genre {} Science Fiction
		price {} 6.95
		publish_date {} 2000-11-02
		description {} After an inadvertant trip through a Heisenberg
      Uncertainty Device, James Salway discovers the problems
      of being quantum.



In [14]:
# findall('element_tag') - get the direct children of the parent that have this tag
for book in root.findall('book'):
    # Get items
    book_id = book.get('id')          # Get attribute 'id' of 'book' element
    title = book.find('title')        # Get first child named 'title'
    author = book.find('author')
    genre = book.find('genre')
    price = book.find('price')
    pdate = book.find('publish_date')
    descr = book.find('description')
    
    # Print items: each found child is an element, so show its tag and text
    print "ID ", book_id
    print title.tag, title.text
    print author.tag, author.text
    print genre.tag, genre.text
    print price.tag, price.text
    print pdate.tag, pdate.text
    print descr.tag, descr.text
    print  # blank line between books

ID  bk101
title XML Developer's Guide
author Gambardella, Matthew
genre Computer
price 44.95
publish_date 2000-10-01
description An in-depth look at creating applications
      with XML.

ID  bk103
title Maeve Ascendant
author Corets, Eva
genre Fantasy
price 5.95
publish_date 2000-11-17
description After the collapse of a nanotechnology
      society in England, the young survivors lay the
      foundation for a new society.

ID  bk109
title Paradox Lost
author Kress, Peter
genre Science Fiction
price 6.95
publish_date 2000-11-02
description After an inadvertant trip through a Heisenberg
      Uncertainty Device, James Salway discovers the problems
      of being quantum.



In [15]:
# iter('element_tag') - walk the whole subtree and yield every element with this tag
for title in root.iter('title'): 
    print title.text

XML Developer's Guide
Maeve Ascendant
Paradox Lost


In [16]:
# find('element_tag') - returns the first direct child with this tag;
# looping over that element then yields its own children
for book_content in root.find('book'):
    print book_content

<Element 'author' at 0x108283cd0>
<Element 'title' at 0x108283d50>
<Element 'genre' at 0x108283dd0>
<Element 'price' at 0x108283e10>
<Element 'publish_date' at 0x108283e50>
<Element 'description' at 0x108283e90>


### Modifying items

In [17]:
# Raise every price by 1 and flag it; set('attribute', 'attribute_value') writes an attribute
# NOTE: this mutates the tree in place, so re-running the cell adds 1 again
for price in root.iter('price'):
    price.text = str(float(price.text) + 1)  # text is a string: convert, add, convert back
    price.set('increased', 'yes')
    print price.text, price.attrib

45.95 {'increased': 'yes'}
6.95 {'increased': 'yes'}
7.95 {'increased': 'yes'}


In [18]:
# Build a detached <book> element; keyword arguments become XML attributes
new_book = ET.Element('book', id='bk113')
new_book.text = "\n"

# Create each child and fill in its text right away.
# Children are appended in creation order, which fixes their document order.
author = ET.SubElement(new_book, 'author')
author.text = "J.K Rowlings"

title = ET.SubElement(new_book, 'title')
title.text = "Harry Potter and the Sorcerer's Stone"

genre = ET.SubElement(new_book, 'genre')
genre.text = "Fantasy"

price = ET.SubElement(new_book, 'price')
price.text = "31.50"

pdate = ET.SubElement(new_book, 'publish_date')
pdate.text = "2001-10-16"

descr = ET.SubElement(new_book, 'description')
descr.text = "A very nice fantasy book."

# Attach the finished element to the existing tree
root.append(new_book)

# Show the serialized form of just the new element
ET.dump(new_book)

<book id="bk113">
<author>J.K Rowlings</author><title>Harry Potter and the Sorcerer's Stone</title><genre>Fantasy</genre><price>31.50</price><publish_date>2001-10-16</publish_date><description>A very nice fantasy book.</description></book>


In [19]:
# A ten times nicer way of doing it (focus on what matters)

def create_new_element(element_name, attributes=None, elements=None):
    """Build a new, unattached XML element with text-only children.

    element_name -- tag of the element to create
    attributes   -- optional dict of XML attributes for the new element
    elements     -- optional dict mapping child tag -> child text; children
                    are appended in the dict's iteration order

    Returns the newly created ET.Element.
    """
    # None defaults avoid sharing one mutable dict object between calls
    new_element = ET.Element(element_name, dict(attributes or {}))
    for child_tag, child_text in (elements or {}).items():
        ET.SubElement(new_element, child_tag).text = child_text
    return new_element

# Create new element
name     = 'book'
attr     = {'id': "bk114"}
# child tag -> text of that child
elements = {'author': "J.K Rowlings",
            'title': "Harry Potter and the Chamber of Secrets",
            'genre': "Fantasy",
            'price': "35.50",
            'publish_date': "2002-10-15",
            'description': "The second volume of a very nice fantasy book."}
new_book = create_new_element(name, attr, elements)

# Append to existing tree (re-running this cell appends another copy)
root.append(new_book)

# Dump the whole tree, including the appended books
ET.dump(root)

<catalog>
   <book id="bk101">
      <author>Gambardella, Matthew</author>
      <title>XML Developer's Guide</title>
      <genre>Computer</genre>
      <price increased="yes">45.95</price>
      <publish_date>2000-10-01</publish_date>
      <description>An in-depth look at creating applications
      with XML.</description>
   </book>
   <book id="bk103">
      <author>Corets, Eva</author>
      <title>Maeve Ascendant</title>
      <genre>Fantasy</genre>
      <price increased="yes">6.95</price>
      <publish_date>2000-11-17</publish_date>
      <description>After the collapse of a nanotechnology
      society in England, the young survivors lay the
      foundation for a new society.</description>
   </book>
   <book id="bk109">
      <author>Kress, Peter</author>
      <title>Paradox Lost</title>
      <genre>Science Fiction</genre>
      <price increased="yes">7.95</price>
      <publish_date>2000-11-02</publish_date>
      <description>After an inadvertant trip through a Heisenb

### Exporting to XML

In [20]:
# Serialize the modified tree (root plus the appended books) to a file
tree.write("files/example_output.xml")

In [21]:
# %load files/example_output.xml

## Bonus: Pretty Print and Conversion

### Pretty Print

In [22]:
def indent(elem, level=0):
    """Re-indent an element tree in place for pretty printing.

    Only text/tail values that are missing or whitespace-only are rewritten,
    so real text content is left alone. Each nesting level adds two spaces.

    elem  -- element whose subtree is re-indented
    level -- nesting depth of elem (0 for the element passed by the caller)
    """
    newline_pad = "\n" + "  " * level

    def is_blank(value):
        # None, empty and whitespace-only strings may be overwritten
        return not value or not value.strip()

    if len(elem) == 0:
        # Leaf: only the tail is adjusted, and never for the top-level call
        if level and is_blank(elem.tail):
            elem.tail = newline_pad
        return

    # Text before the first child sits one level deeper than elem itself
    if is_blank(elem.text):
        elem.text = newline_pad + "  "
    if is_blank(elem.tail):
        elem.tail = newline_pad

    last_child = None
    for child in elem:
        indent(child, level + 1)
        last_child = child

    # The last child's tail precedes elem's closing tag, so dedent it
    if is_blank(last_child.tail):
        last_child.tail = newline_pad
            
indent(root)                            # re-indent the whole tree in place
tree.write("files/example_output.xml")  # write the indented tree to the same output path as before
ET.dump(tree)                           # echo the serialized tree to the cell output

<catalog>
  <book id="bk101">
    <author>Gambardella, Matthew</author>
    <title>XML Developer's Guide</title>
    <genre>Computer</genre>
    <price increased="yes">45.95</price>
    <publish_date>2000-10-01</publish_date>
    <description>An in-depth look at creating applications
      with XML.</description>
  </book>
  <book id="bk103">
    <author>Corets, Eva</author>
    <title>Maeve Ascendant</title>
    <genre>Fantasy</genre>
    <price increased="yes">6.95</price>
    <publish_date>2000-11-17</publish_date>
    <description>After the collapse of a nanotechnology
      society in England, the young survivors lay the
      foundation for a new society.</description>
  </book>
  <book id="bk109">
    <author>Kress, Peter</author>
    <title>Paradox Lost</title>
    <genre>Science Fiction</genre>
    <price increased="yes">7.95</price>
    <publish_date>2000-11-02</publish_date>
    <description>After an inadvertant trip through a Heisenberg
      Uncertainty Device, James Salwa

In [23]:
# %load files/example_output.xml

### Etree to Dict { }

In [24]:
from collections import defaultdict

def etree_to_dict(t):
    """Convert an element (and its subtree) into a nested dict.

    Returns a single-key dict {tag: value} where value is:
      * None for an element with no children, attributes or text,
      * the stripped text for a leaf element without attributes,
      * otherwise a dict holding
          - one entry per child tag (a list when the tag repeats),
          - '@name' entries for attributes,
          - a '#text' entry for non-empty stripped text.
    """
    d = {t.tag: {} if t.attrib else None}
    children = list(t)
    if children:
        # Group converted children by tag so repeated tags collapse into lists
        grouped = defaultdict(list)
        for child_dict in map(etree_to_dict, children):
            # .items() works on both Python 2 and Python 3
            for tag, value in child_dict.items():
                grouped[tag].append(value)
        d = {t.tag: {tag: values[0] if len(values) == 1 else values
                     for tag, values in grouped.items()}}
    if t.attrib:
        # Attributes are prefixed with '@' to keep them apart from child tags
        d[t.tag].update(('@' + name, value) for name, value in t.attrib.items())
    if t.text:
        text = t.text.strip()
        if children or t.attrib:
            # Value is already a dict: store text under '#text', skipping
            # pure-whitespace (indentation) text
            if text:
                d[t.tag]['#text'] = text
        else:
            d[t.tag] = text
    return d

In [25]:
# Convert the whole catalog tree; this rebinds `d`, replacing the example dict from the JSON section
d = etree_to_dict(root)
d

{'catalog': {'book': [{'@id': 'bk101',
    'author': 'Gambardella, Matthew',
    'description': 'An in-depth look at creating applications\n      with XML.',
    'genre': 'Computer',
    'price': {'#text': '45.95', '@increased': 'yes'},
    'publish_date': '2000-10-01',
    'title': "XML Developer's Guide"},
   {'@id': 'bk103',
    'author': 'Corets, Eva',
    'description': 'After the collapse of a nanotechnology\n      society in England, the young survivors lay the\n      foundation for a new society.',
    'genre': 'Fantasy',
    'price': {'#text': '6.95', '@increased': 'yes'},
    'publish_date': '2000-11-17',
    'title': 'Maeve Ascendant'},
   {'@id': 'bk109',
    'author': 'Kress, Peter',
    'description': 'After an inadvertant trip through a Heisenberg\n      Uncertainty Device, James Salway discovers the problems\n      of being quantum.',
    'genre': 'Science Fiction',
    'price': {'#text': '7.95', '@increased': 'yes'},
    'publish_date': '2000-11-02',
    'title': 'Para

### Dict { } to Etree

In [26]:
def dict_to_etree(d):
    """Build an element tree from a nested dict.

    d must be a dict with exactly one key (the root tag). Inside the body:
      * a string becomes the element's text,
      * a '#text' key sets the element's text,
      * '@name' keys become attributes,
      * a list value creates one child per item under the same tag,
      * any other key creates a single child element.
    Falsy bodies (None, '', {}) produce an empty element.

    Returns the root ET.Element. Raises AssertionError on malformed input.
    """
    try:
        string_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        string_types = str         # Python 3 has no basestring

    def _to_etree(d, root):
        if not d:
            pass
        elif isinstance(d, string_types):
            root.text = d
        elif isinstance(d, dict):
            for k, v in d.items():
                assert isinstance(k, string_types)
                if k.startswith('#'):
                    # '#text' is the only '#'-prefixed key that is understood
                    assert k == '#text' and isinstance(v, string_types)
                    root.text = v
                elif k.startswith('@'):
                    assert isinstance(v, string_types)
                    root.set(k[1:], v)
                elif isinstance(v, list):
                    # Repeated tag: one child element per list item
                    for e in v:
                        _to_etree(e, ET.SubElement(root, k))
                else:
                    _to_etree(v, ET.SubElement(root, k))
        else:
            # Unsupported value type; report the offending type and value
            raise AssertionError((type(d), d))

    assert isinstance(d, dict) and len(d) == 1
    tag, body = next(iter(d.items()))
    node = ET.Element(tag)
    _to_etree(body, node)
    return node

In [28]:
t = dict_to_etree(d)  # rebuild an element tree from the dict form of the catalog
indent(t)             # re-indent in place so the dump is readable
ET.dump(t)            # child order follows dict iteration order, so it can differ from the source XML

<catalog>
  <book id="bk101">
    <description>An in-depth look at creating applications
      with XML.</description>
    <author>Gambardella, Matthew</author>
    <price increased="yes">45.95</price>
    <title>XML Developer's Guide</title>
    <publish_date>2000-10-01</publish_date>
    <genre>Computer</genre>
  </book>
  <book id="bk103">
    <description>After the collapse of a nanotechnology
      society in England, the young survivors lay the
      foundation for a new society.</description>
    <author>Corets, Eva</author>
    <price increased="yes">6.95</price>
    <title>Maeve Ascendant</title>
    <publish_date>2000-11-17</publish_date>
    <genre>Fantasy</genre>
  </book>
  <book id="bk109">
    <description>After an inadvertant trip through a Heisenberg
      Uncertainty Device, James Salway discovers the problems
      of being quantum.</description>
    <author>Kress, Peter</author>
    <price increased="yes">7.95</price>
    <title>Paradox Lost</title>
    <publish_dat

# REST API (Django)

# SQL Data Access

# NoSQL Data Access (DynamoDB)

# Exercise: Parse a switch record and output to MySQL