# XML

In [1]:
# import XML package
import xml.etree.ElementTree as ET

# Parse the article XML from disk into an ElementTree (the whole-document wrapper)
tree = ET.parse('supporting-files/exampleresearcharticle.xml')
# Last expression of the cell: displays the type of the parsed object
type(tree)

xml.etree.ElementTree.ElementTree

In [2]:
# Get the document's root Element; the following cells search relative to `root`
root = tree.getroot()
# Last expression of the cell: displays the type of the root object
type(root)

xml.etree.ElementTree.Element

In [9]:
# heading for the listing that follows
print("\nChildren of root")

# iterating over an Element yields its direct children;
# show the tag name of each one
for node in root:
    print(node.tag)


Children of root
ui
ji
fm
bdy
bm


##### Use XPath to find specific elements

In [31]:
# .find() returns only the first element matching the path (None if nothing matches)
title = root.find('./fm/bibl/title')
# Last expression of the cell: displays the Element that was found
title

<Element 'title' at 0x10486f048>

Pull out the title:

In [32]:
# Concatenate the text of every direct child of the <title> element.
# A child without leading text has .text == None, and adding None to a str
# raises TypeError, so such children contribute an empty string instead.
title_text = "".join(p.text or "" for p in title)
title_text

'Standardization of the functional syndesmosis widening by dynamic U.S examination'

Pull out the authors' email addresses:

In [35]:
# element.findall() returns every subelement matching the tag name or path;
# print the email address of each author that has one
for a in root.findall('./fm/bibl/aug/au'):
    # .find() returns None when the author has no <email> child, so the element
    # must be checked before reading .text (itself None for an empty element)
    email_element = a.find('email')
    email = email_element.text if email_element is not None else None
    if email is not None:
        print(email)

omer@extremegate.com
mcarmont@hotmail.com
laver17@gmail.com
nyska@internet-zahav.net
kammarh@gmail.com
gideon.mann.md@gmail.com
barns.nz@gmail.com
eukots@gmail.com


##### Focus in on more specific data scraping

In [39]:
# Path of the article XML that is parsed by the helper functions below.
# NOTE(review): the first cell opens 'supporting-files/exampleresearcharticle.xml'
# (all lowercase); the two spellings only name the same file on a
# case-insensitive filesystem — confirm the real filename.
article_file = "supporting-files/exampleResearchArticle.xml"

def get_root(fname):
    """Parse the XML document at `fname` and return its root element."""
    return ET.parse(fname).getroot()


def get_authors(root):
    """Collect the author records found under ./fm/bibl/aug/au of `root`.

    Args:
        root: the root Element of the article document.

    Returns:
        A list with one dict per <au> element, in document order. Each dict
        has the keys "fnm", "snm" and "email" (the text of the child element
        with that tag, or None when the child is missing or has no text) and
        "insr" (a list of the "iid" attribute values of the <insr> children).
    """
    def _child_text(parent, tag):
        # .find() returns None for a missing child; reading .text on that
        # would raise AttributeError, so report a missing child as None.
        child = parent.find(tag)
        return child.text if child is not None else None

    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
                "fnm": _child_text(author, 'fnm'),
                "snm": _child_text(author, 'snm'),
                "email": _child_text(author, 'email'),
                # an <insr> without an "iid" attribute is skipped rather than
                # raising KeyError
                "insr": [ref.attrib['iid']
                         for ref in author.findall('./insr')
                         if 'iid' in ref.attrib]
        }
        authors.append(data)

    return authors
# Re-parse the article through the helper (rebinds the notebook-level `root`)
root = get_root(article_file)
# Build the list of author dicts
data = get_authors(root)
# Last expression of the cell: displays the list
data

[{'email': 'omer@extremegate.com',
  'fnm': 'Omer',
  'insr': ['I1'],
  'snm': 'Mei-Dan'},
 {'email': 'mcarmont@hotmail.com',
  'fnm': 'Mike',
  'insr': ['I2'],
  'snm': 'Carmont'},
 {'email': 'laver17@gmail.com',
  'fnm': 'Lior',
  'insr': ['I3', 'I4'],
  'snm': 'Laver'},
 {'email': 'nyska@internet-zahav.net',
  'fnm': 'Meir',
  'insr': ['I3'],
  'snm': 'Nyska'},
 {'email': 'kammarh@gmail.com',
  'fnm': 'Hagay',
  'insr': ['I8'],
  'snm': 'Kammar'},
 {'email': 'gideon.mann.md@gmail.com',
  'fnm': 'Gideon',
  'insr': ['I3', 'I5'],
  'snm': 'Mann'},
 {'email': 'barns.nz@gmail.com',
  'fnm': 'Barnaby',
  'insr': ['I6'],
  'snm': 'Clarck'},
 {'email': 'eukots@gmail.com', 'fnm': 'Eugene', 'insr': ['I7'], 'snm': 'Kots'}]

##### Use parsing to count number of tags in an XML file

In [25]:
import xml.etree.ElementTree as ET

# REQS:
    # make dict
# EX:
    # REQS:
        # make dict
    # RULES:
        # node must be uniquely identified
        # node must be added to a dict and not repeated
        # use xml package to make root
    # COMP:
        # iterate over each line
            # if does not contain '/'
                # add to dict
                    # if exists already, add and increment value by 1
                    # else, just add
    # DECOMP:
        # Iterate over each line, add node name to dict with value 1
            # if item is not in dict already, add and make value 1
            # if in dict already, increment that key's value by 1
    
def count_tags(filename):
    """Count how many times each tag name occurs in an XML file.

    The whole document is parsed into memory first, then every element
    (the root included) is visited once.

    Args:
        filename: path of the XML file to parse.

    Returns:
        A dict mapping each tag name to its number of occurrences.
    """
    # create XML tree object
    tree = ET.parse(filename)

    # holds all tags (keys) and counts for each instance of key (value)
    tags_dict = {}

    # tree.iter() walks every element of the document, starting at the root
    for element in tree.iter():
        # .get() supplies 0 the first time a tag is seen
        tags_dict[element.tag] = tags_dict.get(element.tag, 0) + 1

    return tags_dict

# Sample OSM extract used for the demonstration
file = 'sample-data/example.osm'
# Last expression of the cell: displays the tag -> count dict
count_tags(file)

{'bounds': 1,
 'member': 3,
 'nd': 4,
 'node': 20,
 'osm': 1,
 'relation': 1,
 'tag': 7,
 'way': 1}

##### Use ITERATIVE parsing to count number of tags in an XML file

In [30]:
import xml.etree.ElementTree as ET

# REQS:
    # make dict
# EX:
    # REQS:
        # make dict
    # RULES:
        # node must be uniquely identified
        # node must be added to a dict and not repeated
        # use xml package to make root
    # COMP:
        # iterate over each line
            # if does not contain '/'
                # add to dict
                    # if exists already, add and increment value by 1
                    # else, just add
    # DECOMP:
        # Iterate over each line, add node name to dict with value 1
            # if item is not in dict already, add and make value 1
            # if in dict already, increment that key's value by 1
    
def count_tags(filename):
    """Count tag occurrences while streaming the file with ET.iterparse().

    Only 'start' events are requested, so each element is seen once, when its
    opening tag is read. For demonstration the function also prints, per
    element, either the event name (first time a tag name is seen) or the
    element itself (every later time).

    Returns a dict mapping tag name -> number of occurrences.
    """
    counts = {}

    # iterparse() yields (event, element) pairs as the file is being read
    for evt, elem in ET.iterparse(filename, events=('start',)):
        name = elem.tag
        seen_before = name in counts
        counts[name] = counts.get(name, 0) + 1

        if seen_before:
            # just to show that .iterparse() returns the element
            print(elem)
        else:
            # just to show that .iterparse() returns the event
            print(evt)

    return counts

# Same sample OSM extract as in the previous section
file = 'sample-data/example.osm'
# Prints the demo lines from inside count_tags, then displays the returned dict
count_tags(file)

start
start
start
<Element 'node' at 0x104935a98>
<Element 'node' at 0x104871b88>
<Element 'node' at 0x104871868>
<Element 'node' at 0x1048faef8>
<Element 'node' at 0x1048fa458>
<Element 'node' at 0x1048fa728>
<Element 'node' at 0x1048fa818>
<Element 'node' at 0x1048fa7c8>
<Element 'node' at 0x1049837c8>
<Element 'node' at 0x104983ae8>
<Element 'node' at 0x104983ea8>
<Element 'node' at 0x1049bd9a8>
<Element 'node' at 0x1049bd9f8>
<Element 'node' at 0x1049bd728>
<Element 'node' at 0x1049bd048>
<Element 'node' at 0x1049bd868>
<Element 'node' at 0x1049bd7c8>
start
<Element 'node' at 0x1049bdea8>
<Element 'node' at 0x1049bdf48>
<Element 'tag' at 0x1049bdf98>
<Element 'tag' at 0x1049bd318>
<Element 'tag' at 0x1049bd228>
start
start
<Element 'nd' at 0x1049bdd18>
<Element 'nd' at 0x1049bdb88>
<Element 'nd' at 0x1049bddb8>
<Element 'tag' at 0x1049bdc28>
start
start
<Element 'member' at 0x104989048>
<Element 'member' at 0x104989098>
<Element 'tag' at 0x1049890e8>
<Element 'tag' at 0x104989138>


{'bounds': 1,
 'member': 3,
 'nd': 4,
 'node': 20,
 'osm': 1,
 'relation': 1,
 'tag': 7,
 'way': 1}

In [31]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
Before you process the data and add it into your database, you should check the
"k" value for each "<tag>" and see if there are any potential problems.
    # the k value is the key for a tag of something

We have provided you with 3 regular expressions to check for certain patterns
in the tags. As we saw in the quiz earlier, we would like to change the data
model and expand the "addr:street" type of keys to a dictionary like this:
{"address": {"street": "Some value"}}
So, we have to see if we have such tags, and if we have any tags with
problematic characters.

Please complete the function 'key_type', such that we have a count of each of
four tag categories in a dictionary:
  "lower", for tags that contain only lowercase letters and are valid,
  "lower_colon", for otherwise valid tags with a colon in their names,
  "problemchars", for tags with problematic characters, and
  "other", for other tags that do not fall into the other three categories.
See the 'process_map' and 'test' functions for examples of the expected format.
"""

# REQS
    # return a count of each of four tag types in a dict
# EXEC
    # key_type
        # REQS
            # input: element tag and keys dict
            # output: an updated keys dict
        # RULES
            # k value of "tag" element must in one of the four categories
        # COMP
            # if not lower, then if not lower_colon, then if not problemchars, then other
        # DECOMP
            # EXPERIMENT: how to compare a string against a regex statement
            # TRY: Set up regex comp and implement if/else
                # EXPERIMENT: pull out the k value from <tag> elements
                    # element.attrib['k']
                # INVESTIGATE: why dict has wrong counts?
                    # needed to use .search instead of .match


# Whole string is made only of lowercase ASCII letters and underscores
# (the empty string matches as well).
lower = re.compile(r'^([a-z]|_)*$')
# Two such runs (either may be empty) separated by exactly one colon.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Unanchored: matches if any one of the listed punctuation or whitespace
# characters occurs in the string.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the "k" attribute of a <tag> element and bump its counter.

    Args:
        element: an XML element; anything whose tag is not "tag" is ignored.
        keys: dict with the counters 'lower', 'lower_colon', 'problemchars'
            and 'other'. It is updated in place.

    Returns:
        The same `keys` dict that was passed in.
    """
    # only <tag> elements are classified
    if element.tag != "tag":
        return keys

    k_value = element.attrib['k']

    # patterns are tried in this order and the first hit decides the category
    categories = (
        ('lower', lower),
        ('lower_colon', lower_colon),
        ('problemchars', problemchars),
    )
    for category, pattern in categories:
        if pattern.search(k_value):
            keys[category] += 1
            break
    else:
        # none of the three patterns matched
        keys['other'] += 1

    return keys



def process_map(filename):
    """Stream through the XML file and tally the key category of its elements.

    Every element yielded by ET.iterparse() is handed to key_type(), which
    updates the four counters.

    Returns:
        dict with the counters 'lower', 'lower_colon', 'problemchars', 'other'.
    """
    tally = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)

    # the event name is not needed, only the element
    for _event, node in ET.iterparse(filename):
        tally = key_type(node, tally)

    return tally



def test():
    """Run process_map on the bundled sample file and check the counts.

    Notes carried over from the exercise:
      * another file such as 'map.osm' can be used to look at the solution,
        but the assertion below will then be incorrect;
      * this function is only used in the Test Run; on submission the code
        is checked against a different dataset.
    """
    expected = {'lower': 5, 'lower_colon': 0, 'other': 1, 'problemchars': 1}

    # dict of category -> count for the sample file
    keys = process_map('sample-data/example2.osm')
    pprint.pprint(keys)
    assert keys == expected


# Run the check when this code is executed directly (not imported); a notebook
# cell qualifies, as the printed dict right after this cell shows.
if __name__ == "__main__":
    test()

{'lower': 5, 'lower_colon': 0, 'other': 1, 'problemchars': 1}
