# XML

In [None]:
# import XML package
import xml.etree.ElementTree as ET

# create the XML ElementTree object
# NOTE: the file name is spelled with the same casing as `article_file` further
# down in this notebook; the all-lowercase spelling only resolves on
# case-insensitive filesystems.
tree = ET.parse('supporting-files/exampleResearchArticle.xml')
type(tree)

In [None]:
# find the element that I want
root = tree.getroot()
type(root)

In [None]:
print("\nChildren of root")

# iterating an Element yields its direct children; show each child's tag name
for tag_name in (node.tag for node in root):
    print(tag_name)

##### Use XPATH to find specific elements

In [None]:
# find the first element matching the xpath request
title = root.find('./fm/bibl/title')
title

Pull out the title:

In [None]:
# concatenate the text of every direct child of the title element;
# a child without text has .text == None, so it contributes an empty string
# instead of raising TypeError on string concatenation
title_text = "".join(part.text or "" for part in title)
title_text

Pull out the Author's email address:

In [None]:
# element.findall() method finds all matching subelements by tag name or path
# loops and prints the email for each author that has one
for author in root.findall('./fm/bibl/aug/au'):
    # findtext() returns None when there is no <email> child (find().text would
    # raise AttributeError) and '' when the child exists but has no text
    email = author.findtext('email')
    if email:
        print(email)

##### Focus in on more specific data scraping

In [None]:
article_file = "supporting-files/exampleResearchArticle.xml"

def get_root(fname):
    """Parse the XML source ``fname`` and return the root Element of its tree."""
    return ET.parse(fname).getroot()


def get_authors(root):
    """Collect one record per <au> element found under ./fm/bibl/aug.

    Args:
        root: root Element of the parsed article.

    Returns:
        A list of dicts with keys "fnm", "snm", "email" (the text of the
        matching child element, or None when that child is absent or empty)
        and "insr" (list of the ``iid`` attribute of every <insr> child).

    Raises:
        KeyError: if an <insr> child has no ``iid`` attribute.
    """
    def child_text(parent, tag):
        # find() returns None for a missing child; guard before reading .text
        node = parent.find(tag)
        return node.text if node is not None else None

    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        authors.append({
            "fnm": child_text(author, 'fnm'),
            "snm": child_text(author, 'snm'),
            "email": child_text(author, 'email'),
            "insr": [insr.attrib['iid'] for insr in author.findall('./insr')],
        })

    return authors
root = get_root(article_file)
data = get_authors(root)
data

##### Use parsing to count number of tags in an XML file

In [None]:
import xml.etree.ElementTree as ET

# REQS:
    # make dict
# EX:
    # REQS:
        # make dict
    # RULES:
        # node must be uniquely identified
        # node must be added to a dict and not repeated
        # use xml package to make root
    # COMP:
        # iterate over each line
            # if does not contain '/'
                # add to dict
                    # if exists already, add and increment value by 1
                    # else, just add
    # DECOMP:
        # Iterate over each line, add node name to dict with value 1
            # if item is not in dict already, add and make value 1
            # if in dict already, increment that key's value by 1
    
def count_tags(filename):
    """Count how many times each tag name occurs in an XML document.

    The whole document is parsed into memory with ET.parse() and then walked
    with ElementTree.iter(), which visits the root and every descendant.

    Args:
        filename: path (or file object) accepted by ET.parse().

    Returns:
        A dict mapping tag name -> number of elements with that tag.
    """
    # holds all tags (keys) and counts for each instance of key (value)
    tags_dict = {}

    # dict.get() supplies 0 the first time a tag is seen
    for element in ET.parse(filename).iter():
        tags_dict[element.tag] = tags_dict.get(element.tag, 0) + 1

    return tags_dict

file = 'sample-data/example.osm'
count_tags(file)

##### Use ITERATIVE parsing to count number of tags in an XML file

In [None]:
import xml.etree.ElementTree as ET

# REQS:
    # make dict
# EX:
    # REQS:
        # make dict
    # RULES:
        # node must be uniquely identified
        # node must be added to a dict and not repeated
        # use xml package to make root
    # COMP:
        # iterate over each line
            # if does not contain '/'
                # add to dict
                    # if exists already, add and increment value by 1
                    # else, just add
    # DECOMP:
        # Iterate over each line, add node name to dict with value 1
            # if item is not in dict already, add and make value 1
            # if in dict already, increment that key's value by 1
    
def count_tags(filename):
    """Count tag occurrences while parsing incrementally with ET.iterparse().

    Only "start" events are requested, so every element is seen exactly once.
    For demonstration the function also prints, per element, either the event
    name (first time a tag is seen) or the element itself (tag seen before).

    Returns:
        A dict mapping tag name -> number of occurrences.
    """
    # tag name -> running count
    tags_dict = {}

    for event, element in ET.iterparse(filename, events=('start',)):
        tag = element.tag
        seen_before = tag in tags_dict
        tags_dict[tag] = tags_dict.get(tag, 0) + 1
        # just to show that .iterparse() returns both the event and the element
        print(element if seen_before else event)

    return tags_dict

file = 'sample-data/example.osm'
count_tags(file)

In [None]:
import xml.etree.cElementTree as ET
import pprint
import re

# REQS
    # return a count of each of four tag types in a dict
# EXEC
    # key_type
        # REQS
            # input: element tag and keys dict
            # output: an updated keys dict
        # RULES
            # k value of "tag" element must be in one of the four categories
        # COMP
            # if not lower, then if not lower_colon, then if not problemchars, then other
        # DECOMP
            # EXPERIMENT: how to compare a string against a regex statement
            # TRY: Set up regex comp and implement if/else
                # EXPERIMENT: pull out the k value from <tag> elements
                    # element.attrib['k']
                # INVESTIGATE: why dict has wrong counts?
                    # needed to use .search instead of .match


lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the "k" attribute of a <tag> element and bump its counter.

    The key is tested against the module-level patterns in the order
    lower -> lower_colon -> problemchars; the first pattern that matches wins
    and anything matching none of them is counted as 'other'. Elements whose
    tag is not "tag" leave ``keys`` untouched.

    Args:
        element: an ElementTree Element.
        keys: dict with integer counters 'lower', 'lower_colon',
            'problemchars' and 'other'; it is updated in place.

    Returns:
        The same ``keys`` dict that was passed in.
    """
    if element.tag != "tag":
        return keys

    # find the k value for the element
    k_value = element.attrib['k']

    # ordered (category, pattern) pairs; .search() is required because
    # problemchars is not anchored to the start of the string
    checks = (
        ('lower', lower),
        ('lower_colon', lower_colon),
        ('problemchars', problemchars),
    )
    for category, pattern in checks:
        if pattern.search(k_value):
            keys[category] += 1
            break
    else:
        # no pattern described the key
        keys['other'] += 1

    return keys



def process_map(filename):
    """Tally the key categories of every <tag> element in an OSM file.

    Each element produced by ET.iterparse() is handed to key_type(), which
    updates and returns the counter dict.

    Returns:
        dict with counters 'lower', 'lower_colon', 'problemchars', 'other'.
    """
    keys = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)

    # with each iteration of .iterparse(), key_type is called on the element
    for _event, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys



def test():
    """Run process_map on the sample file and check the category counts."""
    # You can use another testfile 'map.osm' to look at your solution
    # Note that the assertion below will be incorrect then.
    # Note as well that the test function here is only used in the Test Run;
    # when you submit, your code will be checked against a different dataset.
    expected_counts = {'lower': 5, 'lower_colon': 0, 'other': 1, 'problemchars': 1}

    # a dict of counters keyed by category
    keys = process_map('sample-data/example2.osm')
    pprint.pprint(keys)
    assert keys == expected_counts


if __name__ == "__main__":
    test()

In [None]:
import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
The first task is a fun one - find out how many unique users
have contributed to the map in this particular area!

The function process_map should return a set of unique user IDs ("uid")
"""

# REQS
    # return a set of unique user IDs
# EXEC
    # get_user
        # REQS: 
            # in: an element from XML
            # out: a user id?
        # RULES
            # must pull value from 'uid' attribute
        # COMP
            # query the element for 'uid'
        # DECOMP
            # TRY: .attrib, print
            # NEED: tell difference between elements with a uid, elements without
    # process_map
        # REQS: 
            # in: a filename/path
            # out: a set of users
        # RULES
            # must call get_user()
            # 

# returns a user id only if element contains one
def get_user(element):
    """Return the element's 'uid' attribute, or None when it has none.

    Uses Element.get() instead of a bare ``except``, so only the "attribute is
    absent" case is mapped to None and unrelated errors are no longer hidden.
    """
    return element.get('uid')


# creates and returns a set of unique user ids
def process_map(filename):
    """Return the set of unique user ids ("uid") found in an OSM file.

    Every element produced by ET.iterparse() is passed to get_user(); elements
    for which it returns None (no uid) are skipped. Skipping them up front
    avoids the KeyError that ``set.remove(None)`` raises when every element in
    the file happens to carry a uid.
    """
    users = set()
    for _, element in ET.iterparse(filename):
        uid = get_user(element)
        if uid is not None:
            users.add(uid)

    return users


def test():
    """Check that the sample file yields exactly six distinct user ids."""
    users = process_map('sample-data/example3.osm')
    pprint.pprint(users)

    unique_user_count = len(users)
    assert unique_user_count == 6



if __name__ == "__main__":
    test()

In [None]:
"""
Your task in this exercise has two steps:

- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix 
    the unexpected street types to the appropriate ones in the expected list.
    You have to add mappings only for the actual problems you find in this OSMFILE,
    not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function, to actually fix the street name.
    The function takes a string with street name as an argument and should return the fixed name
    We have provided a simple test so that you see what exactly is expected
"""
'''
NOTES:
- Node: consists of a single point in space defined by its latitude, longitude and node id. It can have one or more tags.
- Way: an ordered list of nodes which normally also has at least one tag or is included within a Relation. Can
use nodes to make shapes and/or lines.
- Relation: consists of one or more tags and also an ordered list of one or more nodes, ways and/or relations 
as members which is used to define logical or geographic relationships between other elements.
'''

# REQS
    # populate mapping dict with all unexpected values mapped to what they should be
    # complete update_name function to fix any incorrect values
# EXEC
    # mapping dict
        # REQS
            # manually fill all unexpected values found in OSM file into the mapping variable
            # print out each last-line from address for each relevant element and inspect visually
        # RULES
            # unexpected value must be from OSM file, corrected value
        # COMP
            # EXPERIMENT: map out control flow to know where to print out
        # DECOM
            # done
    # update_name to fix incorrect values
        # REQS
            # input: an incorrect street name and a mapping dict
            # output: a corrected street name
        # RULES
            # the incorrect street name must match a key from the mapping dict
        # COMP
            # if incorrect street name is a key in mapping dict:
                # set name variable to the corresponding key's value
        # DECOMP
            # EXP: what is control flow leading my update_name?
                # is called and supplied incorrect name and mapping
            # TEST: print mapping dict
            # EXP: can a string be aligned to a key in a dict?
            # TEST: reassign name to output from regex query, then try is in mapping.
            # TRY: Need to return the full street
                # append corrected name from mapping to previous part of street
                    # TRY: pulling only first part of street 
        

import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

OSMFILE = "sample-data/example4.osm"

# compile object for making comparisons against regex
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

# UPDATE THIS VARIABLE
mapping = { "St.": "Street",
           "Ave": "Avenue",
           "Rd.": "Road"
          }


# input: empty dict-like and string w/ name of street
def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its street type if that type is unexpected.

    The street type is whatever street_type_re matches (the last word of the
    name). Names with no match, or whose type is in ``expected``, are ignored.

    Args:
        street_types: mapping of street type -> set of street names; indexed
            with a possibly-new key, so it is expected to create the set on
            demand (the caller passes a defaultdict(set)).
        street_name: full street name string.
    """
    # search() gives a match object, or None when nothing matched
    found = street_type_re.search(street_name)
    if found is None:
        return

    # .group() pulls out the matched word itself
    street_type = found.group()
    if street_type in expected:
        return

    street_types[street_type].add(street_name)


# quick test returns True or False depending on k value of an element
def is_street_name(elem):
    """Return True when the element's "k" attribute is exactly "addr:street".

    Raises:
        KeyError: if the element has no "k" attribute.
    """
    key = elem.attrib['k']
    return key == "addr:street"


def audit(osmfile):
    """Collect the unexpected street types used in an OSM file.

    Args:
        osmfile: path of the OSM XML file.

    Returns:
        A defaultdict(set) mapping each unexpected street type to the set of
        street names that end with it (filled in by audit_street_type()).
    """
    # initialize a dict-like using defaultdict and a set
    street_types = defaultdict(set)

    # `with` closes the file even if parsing raises; binary mode leaves the
    # decoding to the XML parser
    with open(osmfile, "rb") as osm_file:
        # "end" events are used because an element's children are only
        # guaranteed to be present once its closing tag has been seen; on
        # "start" events the <tag> children may not have been parsed yet
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag in ("node", "way"):
                # .iter("tag") walks the element and its sub elements,
                # yielding only those whose tag name is "tag"
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        # the v value of the tag is the name of the street
                        audit_street_type(street_types, tag.attrib['v'])

    # returns a dict-like
    return street_types


# input: incorrect name from st_types and the mapping dict
def update_name(name, mapping):
    """Replace the street type at the end of ``name`` using ``mapping``.

    Args:
        name: street name, e.g. "West Lexington St.".
        mapping: dict of unexpected street type -> corrected street type.

    Returns:
        The name with its trailing street type replaced. The name is returned
        unchanged when street_type_re finds no street type in it or when the
        type has no entry in ``mapping``.
    """
    # match object for the trailing street type (None when nothing matches)
    match = street_type_re.search(name)
    if match is None:
        return name

    street_type = match.group()
    if street_type not in mapping:
        return name

    # Slice at the position of the match rather than splitting on the text:
    # splitting cuts at the FIRST occurrence, which mangles names such as
    # "St. Charles St." where the street type also appears earlier.
    return name[:match.start()] + mapping[street_type]


def test():
    """Audit OSMFILE, print each proposed fix and check two known names."""
    # expecting a dict-like
    st_types = audit(OSMFILE)
    assert len(st_types) == 3
    pprint.pprint(dict(st_types))

    # names whose corrected form is known in advance
    known_fixes = {
        "West Lexington St.": "West Lexington Street",
        "Baldwin Rd.": "Baldwin Road",
    }

    # each value is a set with one or more street names
    for ways in st_types.values():
        for name in ways:
            # input: incorrect name from st_types and the mapping dict
            better_name = update_name(name, mapping)
            # prints incorrect name and what it will be mapped to
            print(name, "=>", better_name)
            if name in known_fixes:
                assert better_name == known_fixes[name]


if __name__ == '__main__':
    test()

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
After auditing is complete the next step is to prepare the data to be inserted into a SQL database.
To do so you will parse the elements in the OSM XML file, transforming them from document format to
tabular format, thus making it possible to write to .csv files.  These csv files can then easily be
imported to a SQL database as tables.

The process for this transformation is as follows:
- Use iterparse to iteratively step through each top level element in the XML
- Shape each element into several data structures using a custom function
- Utilize a schema and validation library to ensure the transformed data is in the correct format
- Write each data structure to the appropriate .csv files

We've already provided the code needed to load the data, perform iterative parsing and write the
output to csv files. Your task is to complete the shape_element function that will transform each
element into the correct format. To make this process easier we've already defined a schema (see
the schema.py file in the last code tab) for the .csv files and the eventual tables. Using the 
cerberus library we can validate the output against this schema to ensure it is correct.

## Shape Element Function
The function should take as input an iterparse Element object and return a dictionary.

### If the element top level tag is "node":
The dictionary returned should have the format {"node": .., "node_tags": ...}

The "node" field should hold a dictionary of the following top level node attributes:
- id
- user
- uid
- version
- lat
- lon
- timestamp
- changeset
All other attributes can be ignored

The "node_tags" field should hold a list of dictionaries, one per secondary tag. Secondary tags are
child tags of node which have the tag name/type: "tag". Each dictionary should have the following
fields from the secondary tag attributes:
- id: the top level node id attribute value
- key: the full tag "k" attribute value if no colon is present or the characters after the colon if one is.
- value: the tag "v" attribute value
- type: either the characters before the colon in the tag "k" value or "regular" if a colon
        is not present.

Additionally,

- if the tag "k" value contains problematic characters, the tag should be ignored
- if the tag "k" value contains a ":" the characters before the ":" should be set as the tag type
  and characters after the ":" should be set as the tag key
- if there are additional ":" in the "k" value they should be ignored and kept as part of
  the tag key. For example:

  <tag k="addr:street:name" v="Lincoln"/>
  should be turned into
  {'id': 12345, 'key': 'street:name', 'value': 'Lincoln', 'type': 'addr'}

- If a node has no secondary tags then the "node_tags" field should just contain an empty list.

The final return value for a "node" element should look something like:

{'node': {'id': 757860928,
          'user': 'uboot',
          'uid': 26299,
       'version': '2',
          'lat': 41.9747374,
          'lon': -87.6920102,
          'timestamp': '2010-07-22T16:16:51Z',
      'changeset': 5288876},
 'node_tags': [{'id': 757860928,
                'key': 'amenity',
                'value': 'fast_food',
                'type': 'regular'},
               {'id': 757860928,
                'key': 'cuisine',
                'value': 'sausage',
                'type': 'regular'},
               {'id': 757860928,
                'key': 'name',
                'value': "Shelly's Tasty Freeze",
                'type': 'regular'}]}

### If the element top level tag is "way":
The dictionary should have the format {"way": ..., "way_tags": ..., "way_nodes": ...}

The "way" field should hold a dictionary of the following top level way attributes:
- id
-  user
- uid
- version
- timestamp
- changeset

All other attributes can be ignored

The "way_tags" field should again hold a list of dictionaries, following the exact same rules as
for "node_tags".

Additionally, the dictionary should have a field "way_nodes". "way_nodes" should hold a list of
dictionaries, one for each nd child tag.  Each dictionary should have the fields:
- id: the top level element (way) id
- node_id: the ref attribute value of the nd tag
- position: the index starting at 0 of the nd tag i.e. what order the nd tag appears within
            the way element

The final return value for a "way" element should look something like:

{'way': {'id': 209809850,
         'user': 'chicago-buildings',
         'uid': 674454,
         'version': '1',
         'timestamp': '2013-03-13T15:58:04Z',
         'changeset': 15353317},
 'way_nodes': [{'id': 209809850, 'node_id': 2199822281, 'position': 0},
               {'id': 209809850, 'node_id': 2199822390, 'position': 1},
               {'id': 209809850, 'node_id': 2199822392, 'position': 2},
               {'id': 209809850, 'node_id': 2199822369, 'position': 3},
               {'id': 209809850, 'node_id': 2199822370, 'position': 4},
               {'id': 209809850, 'node_id': 2199822284, 'position': 5},
               {'id': 209809850, 'node_id': 2199822281, 'position': 6}],
 'way_tags': [{'id': 209809850,
               'key': 'housenumber',
               'type': 'addr',
               'value': '1412'},
              {'id': 209809850,
               'key': 'street',
               'type': 'addr',
               'value': 'West Lexington St.'},
              {'id': 209809850,
               'key': 'street:name',
               'type': 'addr',
               'value': 'Lexington'},
              {'id': '209809850',
               'key': 'street:prefix',
               'type': 'addr',
               'value': 'West'},
              {'id': 209809850,
               'key': 'street:type',
               'type': 'addr',
               'value': 'Street'},
              {'id': 209809850,
               'key': 'building',
               'type': 'regular',
               'value': 'yes'},
              {'id': 209809850,
               'key': 'levels',
               'type': 'building',
               'value': '1'},
              {'id': 209809850,
               'key': 'building_id',
               'type': 'chicago',
               'value': '366409'}]}
"""

# REQS
    # investigate initial code
    # complete the shape_element function to transform each "node" and "ways" element into the correct format
# EXEC 
    # investigate initial code
        # Algo:
            # process_map
                # open files in write mode
                # create writer objects
                # write headers
                # create validation class object
                # call get_element
                    # create iterator object of (event, elem) pair
                    
    
    
    
import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus

from sample_data import schema

OSM_PATH = "sample_data/example5.osm"

NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict

    Implements the rules spelled out in this cell's introductory notes.

    Args:
        element: a fully parsed "node" or "way" Element.
        node_attr_fields: attribute names copied from a node element.
        way_attr_fields: attribute names copied from a way element.
        problem_chars: compiled regex; a secondary tag whose "k" value matches
            it is dropped.
        default_tag_type: tag type used when the "k" value contains no colon.

    Returns:
        {'node': ..., 'node_tags': [...]} for a node,
        {'way': ..., 'way_nodes': [...], 'way_tags': [...]} for a way,
        and None for any other element.

    Raises:
        KeyError: if the element has no "id" attribute, a <tag> child lacks
            "k" or "v", or an <nd> child lacks "ref".
        ValueError: if a numeric attribute cannot be converted.
    """
    if element.tag == 'node':
        attr_fields = node_attr_fields
    elif element.tag == 'way':
        attr_fields = way_attr_fields
    else:
        return None

    # Numeric conversions mirror the example return values in the notes
    # (id/uid/changeset as int, lat/lon as float, everything else as text).
    # NOTE(review): confirm these types against schema.py, which is not
    # visible from this notebook.
    int_fields = ('id', 'uid', 'changeset')
    float_fields = ('lat', 'lon')

    def convert(field, raw):
        if field in int_fields:
            return int(raw)
        if field in float_fields:
            return float(raw)
        return raw

    # Attributes missing from the element are left out rather than invented,
    # so schema validation can report them as missing.
    attribs = {field: convert(field, element.attrib[field])
               for field in attr_fields if field in element.attrib}
    element_id = int(element.attrib['id'])

    # Secondary tags are handled the same way for both node and way elements
    tags = []
    for tag in element.findall('tag'):
        key = tag.attrib['k']
        if problem_chars.search(key):
            continue
        if ':' in key:
            # split on the first colon only; further colons stay in the key
            tag_type, _, tag_key = key.partition(':')
        else:
            tag_type, tag_key = default_tag_type, key
        tags.append({'id': element_id,
                     'key': tag_key,
                     'value': tag.attrib['v'],
                     'type': tag_type})

    if element.tag == 'node':
        return {'node': attribs, 'node_tags': tags}

    # position is the 0-based order of the nd child within the way
    way_nodes = [{'id': element_id,
                  'node_id': int(nd.attrib['ref']),
                  'position': position}
                 for position, nd in enumerate(element.findall('nd'))]
    return {'way': attribs, 'way_nodes': way_nodes, 'way_tags': tags}


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Elements are yielded on their "end" event, i.e. once fully parsed, and
    only when their tag name is in ``tags``.
    """
    # iterparse builds the tree incrementally and hands back
    # (event, elem) pairs as it goes
    events = ET.iterparse(osm_file, events=('start', 'end'))

    # the very first pair is the "start" event of the document root (<osm>)
    _, document_root = next(events)

    for kind, node in events:
        finished_and_wanted = kind == 'end' and node.tag in tags
        if finished_and_wanted:
            yield node
        # runs after every event (not only after a yield): drops everything
        # hanging off the root so the tree does not keep growing
        document_root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise an Exception if element does not match schema

    Args:
        element: the shaped dict to check.
        validator: object exposing ``validate(document, schema)`` and an
            ``errors`` mapping (a cerberus.Validator in this notebook).
        schema: validation schema passed through to ``validator.validate``.

    Raises:
        Exception: with a message naming the first failing field and its
            errors when validation does not return True.
    """
    if validator.validate(element, schema) is not True:
        # dict.iteritems() only exists on Python 2; items() works on 2 and 3
        field, errors = next(iter(validator.errors.items()))
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_string = pprint.pformat(errors)

        raise Exception(message_string.format(field, error_string))


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input

    On Python 3 the csv module writes text natively, so rows are passed
    through untouched. On Python 2 unicode values are encoded to UTF-8 byte
    strings first, because that csv module only handles byte strings.
    """

    def writerow(self, row):
        """Write one row dict, encoding unicode values only on Python 2."""
        if str is bytes:  # true only on Python 2
            row = {
                k: (v.encode('utf-8') if isinstance(v, unicode) else v)  # noqa: F821
                for k, v in row.items()
            }
        return super(UnicodeDictWriter, self).writerow(row)

    def writerows(self, rows):
        """Write every row through writerow() so the same handling applies."""
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #

def _open_csv(path):
    """Open ``path`` for csv output without newline translation."""
    if str is bytes:
        # Python 2: the csv module writes byte strings, so use a binary file
        return open(path, 'wb')
    # Python 3: csv wants a text file opened with newline=''; UTF-8 is set
    # explicitly so the output does not depend on the platform's locale
    return open(path, 'w', encoding='utf-8', newline='')


# takes in osm file and validate as "True" or "False"
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)

    Args:
        file_in: path (or file object) of the OSM XML input, handed to
            get_element().
        validate: when exactly True, each shaped element is checked with
            validate_element() before being written.
    """

    # open files in write mode
    with _open_csv(NODES_PATH) as nodes_file, \
        _open_csv(NODE_TAGS_PATH) as nodes_tags_file, \
        _open_csv(WAYS_PATH) as ways_file, \
        _open_csv(WAY_NODES_PATH) as way_nodes_file, \
        _open_csv(WAY_TAGS_PATH) as way_tags_file:

        # create writer objects: constructor specifications, (file_name, header_row_fields)
        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        # write headers using field names specified in DictWriter constructor
        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        # the Validator class normalizes and/or validates any mapping against
        # a validation schema which is provided as an argument at class instantiation
        # or upon calling the validate(), validated() or normalized() methods
        validator = cerberus.Validator()

        # get_element called
            # uses same osm file input into process_map, specifies tags as tuple
        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            # shape_element returns None for elements it does not shape
            if el:
                if validate is True:
                    validate_element(el, validator)
                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])


if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_PATH, validate=True)


##### Parsing an XML file in its entirety

In [None]:
# import the ElementTree XML API
import xml.etree.ElementTree as ET

# path to the XML file
OSM_PATH = "sample_data/example5.osm"

# create the ElementTree object
tree_obj = ET.parse(OSM_PATH)
tree_obj

The ElementTree object itself is not iterable. Its root element (parent to all other elements and of type Element), however, is a reference to the tree and is iterable:

In [None]:
# return the root element of the tree
root = tree_obj.getroot()
root

In [None]:
# for every direct <node> child of the root, print each of that node's children
for node in root.findall("node"):
    for grandchild in node:
        print(grandchild)

For debugging purposes, dump() writes a complete representation of ALL elements of an Element object to standard output (it does not return anything):

In [None]:
ET.dump(root)

##### Parsing an XML file incrementally

In [None]:
import xml.etree.ElementTree as ET

OSM_PATH = "sample_data/example5.osm"

# create the IterParseIterator object
iter_obj = ET.iterparse(OSM_PATH, events=('start','end'))
iter_obj

The IterParseIterator object is directly iterable, returning (event, elem) tuple pairs:

In [None]:
event, root = next(iter_obj)
root

The remainder of the object (all elements and sub-elements) can be iterated over in a for loop:

In [None]:
for event, elem in iter_obj:
    print(elem)

In [None]:
ET.dump(root)

clear() removes all the insides of any given element - so here clearing the root removes everything inside of it including its children.

In [None]:
root.clear()

In [None]:
ET.dump(root)

##### The problem with memory and incremental parsing

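Incremental parsing alone does not solve the memory problem for very large XML files: iterparse() still builds the complete tree as it goes, so unless elements are cleared it can create a very large tree that gobbles up loads of computer memory.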

In [1]:
import xml.etree.ElementTree as ET
from sys import getsizeof

OSM_PATH = "sample_data/example5.osm"

# create the IterParseIterator object
iterobj = ET.iterparse(OSM_PATH, events=('start','end'))
iterobj

<xml.etree.ElementTree._IterParseIterator at 0x104845710>

In [2]:
# pull out the reference to the root element
event, root = next(iterobj)
root

<Element 'osm' at 0x104874868>

To avoid making a very large tree when parsing a very large XML file, the clear() method of an Element can be called on each element as it is returned by the loop:

In [3]:
# consume the remaining (event, elem) pairs, emptying every element whose
# tag is one of the top-level OSM element types
for event, elem in iterobj:
    if elem.tag in ("bounds", "node", "way", "relation"):
        elem.clear()

In [4]:
ET.dump(root)

<osm attribution="http://www.openstreetmap.org/copyright" copyright="OpenStreetMap and contributors" generator="CGImap 0.3.3 (28791 thorn-03.openstreetmap.org)" license="http://opendatacommons.org/licenses/odbl/1-0/" version="0.6">
 <bounds /><node /><node /><node /><node /><node /><node /><node /><node /><node /><node /><node /><node /><node /><node /><node /><node /><node /><node /><node /><node /><node /><node /><way /><node /><way /><relation /></osm>


Note that the root reference was returned first. Then references to all the other elements were returned in the for loop, but when dumping the root those empty tags (the elements that were cleared but their tags left as artifacts) are still there. This demonstrates the fact that each unpacked element is a REFERENCE to the tree!

Still, clearing elements like this is only slightly less problematic because we're now left with the parent element and a bunch of child elements with their internals cleared out.

To fix this, we can clear everything within the root element with each iteration of the for loop. This clears out the sub-element that has just been added to the tree.

In [5]:
import xml.etree.ElementTree as ET
from sys import getsizeof

OSM_PATH = "sample_data/example5.osm"

# create the IterParseIterator object
iterobj = ET.iterparse(OSM_PATH, events=('start','end'))
iterobj

<xml.etree.ElementTree._IterParseIterator at 0x10486f8d0>

In [6]:
event, root = next(iterobj)
root

<Element 'osm' at 0x10487cf48>

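Without clearing anything, dumping the root right after the first next() call shows that the tree is already populated: the parser reads the file in chunks, so elements can be built before their events are consumed in the loop.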

In [7]:
ET.dump(root)

<osm attribution="http://www.openstreetmap.org/copyright" copyright="OpenStreetMap and contributors" generator="CGImap 0.3.3 (28791 thorn-03.openstreetmap.org)" license="http://opendatacommons.org/licenses/odbl/1-0/" version="0.6">
 <bounds maxlat="41.9758200" maxlon="-87.6894800" minlat="41.9704500" minlon="-87.6928300" />
 <node changeset="11129782" id="261114295" lat="41.9730791" lon="-87.6866303" timestamp="2012-03-28T18:31:23Z" uid="451048" user="bbmiller" version="7" visible="true" />
 <node changeset="8448766" id="261114296" lat="41.9730416" lon="-87.6878512" timestamp="2011-06-15T17:04:54Z" uid="451048" user="bbmiller" version="6" visible="true" />
 <node changeset="8581395" id="261114299" lat="41.9729565" lon="-87.6939548" timestamp="2011-06-29T14:14:14Z" uid="451048" user="bbmiller" version="5" visible="true" />
 <node changeset="8581395" id="261146436" lat="41.9707380" lon="-87.6976025" timestamp="2011-06-29T14:14:14Z" uid="451048" user="bbmiller" version="5" visible="true" 

Clearing the root with each iteration of the below for loop will remove each element just as it's added to the tree. So the tree never has an opportunity to be built.

In [8]:
for event, elem in iterobj:
    print("Element was added to the tree")
    print("Element was: ", elem)
    root.clear()
    print("\nAll elements cleared via clear()\n")

Element was added to the tree
Element was:  <Element 'bounds' at 0x10487cdb8>

All elements cleared via clear()

Element was added to the tree
Element was:  <Element 'bounds' at 0x10487cdb8>

All elements cleared via clear()

Element was added to the tree
Element was:  <Element 'node' at 0x10487c9a8>

All elements cleared via clear()

Element was added to the tree
Element was:  <Element 'node' at 0x10487c9a8>

All elements cleared via clear()

Element was added to the tree
Element was:  <Element 'node' at 0x10487c958>

All elements cleared via clear()

Element was added to the tree
Element was:  <Element 'node' at 0x10487c958>

All elements cleared via clear()

Element was added to the tree
Element was:  <Element 'node' at 0x10487c908>

All elements cleared via clear()

Element was added to the tree
Element was:  <Element 'node' at 0x10487c908>

All elements cleared via clear()

Element was added to the tree
Element was:  <Element 'node' at 0x10487c8b8>

All elements cleared via clear(

Now we can see that dumping the root after the iteration process leaves us with an empty root:

In [9]:
ET.dump(root)

<osm />


No more memory being gummed up!!!