# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [2]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
# Parse the smaller example file into an ElementTree used by the examples below.
document_tree = ET.parse('./data/mondial_database_less.xml')

In [4]:
# List the name of every direct child of the root element (one per country).
countries = document_tree.getroot()
for country in countries:
    print(country.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for country in document_tree.iterfind('country'):
    # Element.iter() is the supported replacement for getiterator(), which is
    # deprecated since Python 2.7; it yields every descendant <city> element
    # at any depth below the country.
    city_names = [city.find('name').text for city in country.iter('city')]
    # Emit the whole line in one go; joining avoids the trailing ', ' that
    # previously had to be sliced off.
    print('* ' + country.find('name').text + ': ' + ', '.join(city_names))

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [6]:
# Parse the full database file used by the exercise cells.
document = ET.parse('./data/mondial_database.xml')



In [7]:
# 10 countries with the lowest infant mortality rates
import operator

root = document.getroot()

# Map country name -> infant mortality rate.  Countries without an
# <infant_mortality> child are simply absent from the mapping.
c_inf_mort = {}
for country in root.iterfind('country'):
    # Reset per country so a rate can never be filed under the name of the
    # previously visited country.
    country_name = None
    for field in country:
        if field.tag == 'name':
            country_name = field.text
        elif field.tag == 'infant_mortality' and country_name is not None:
            c_inf_mort[country_name] = float(field.text)

# Ascending sort puts the lowest rates first.  Slicing (instead of indexing
# positions 0..9) also works when fewer than ten countries report a rate.
c_i_m_s = sorted(c_inf_mort.items(), key=operator.itemgetter(1))
for entry in c_i_m_s[:10]:
    print(entry)

('Monaco', 1.81)
('Japan', 2.13)
('Bermuda', 2.48)
('Norway', 2.48)
('Singapore', 2.53)
('Sweden', 2.6)
('Czech Republic', 2.63)
('Hong Kong', 2.73)
('Macao', 3.13)
('Iceland', 3.15)


In [8]:
# 10 cities with the largest population
# Map city name -> population.  The value for one <city> element is its last
# <population> child in document order.
city_pop = {}
for city_element in root.iter('city'):
    # Reset per element so a population is never attributed to the name of
    # the previously visited city.
    city_name = None
    latest_population = None
    for field in city_element:
        if field.tag == 'name':
            city_name = field.text
        elif field.tag == 'population':
            latest_population = int(field.text)
    if city_name is None or latest_population is None:
        continue
    # Different cities can share a name.  Keep the larger figure so that a
    # big city is not silently replaced by a smaller namesake that happens
    # to appear later in the file.
    city_pop[city_name] = max(latest_population,
                              city_pop.get(city_name, latest_population))

# Largest first; slicing also copes with fewer than ten entries.
ci_pop = sorted(city_pop.items(), key=operator.itemgetter(1), reverse=True)
for entry in ci_pop[:10]:
    print(entry)

('Shanghai', 22315474)
('Istanbul', 13710512)
('Mumbai', 12442373)
('Moscow', 11979529)
('Beijing', 11716620)
(u'S\xe3o Paulo', 11152344)
('Tianjin', 11090314)
('Guangzhou', 11071424)
('Delhi', 11034555)
('Shenzhen', 10358381)


In [9]:
# 10 ethnic groups with the largest overall population (sum of latest estimates over each country)
# Map ethnic group name -> estimated head count summed over all countries.
ethnic_group = {}
for country in root.iterfind('country'):
    # Reset per country: a country without any <population> child must not
    # inherit the population of the country visited before it.
    country_population = None
    for field in country:
        if field.tag == 'population':
            # Each later <population> child overwrites the previous one, so
            # the last one seen before an <ethnicgroup> is the one used.
            country_population = float(field.text)
        elif field.tag == 'ethnicgroup' and country_population is not None:
            group = field.text
            # The 'percentage' attribute is the group's share of the
            # country's population.
            gr_pop = (float(field.get('percentage')) / 100) * country_population
            ethnic_group[group] = ethnic_group.get(group, 0.0) + gr_pop

# Largest first; slicing also copes with fewer than ten groups.
eth_gr = sorted(ethnic_group.items(), key=operator.itemgetter(1), reverse=True)
for entry in eth_gr[:10]:
    print(entry)

('Han Chinese', 1245058800.0)
('Indo-Aryan', 871815583.4399999)
('European', 494872219.7195999)
('African', 318325120.36899996)
('Dravidian', 302713744.25)
('Mestizo', 157734354.93699998)
('Bengali', 146776916.72)
('Russian', 131856996.077)
('Japanese', 126534212.00000001)
('Malay', 121993550.374)


In [24]:
# Name and country of a) longest river, b) largest lake and c) airport at highest elevation


def _collect_measure(parent, tag, measure_tag, convert, skip_invalid=False):
    """Collect one numeric measurement per named feature.

    Looks at every direct ``tag`` child of *parent* and, inside it, at the
    ``name`` and ``measure_tag`` children.

    parent        -- element whose direct children are searched
    tag           -- tag of the feature elements (e.g. 'river')
    measure_tag   -- tag of the child holding the measurement (e.g. 'length')
    convert       -- callable turning the measurement text into a number
    skip_invalid  -- when true, measurements that *convert* rejects with
                     TypeError or ValueError are skipped; otherwise the
                     exception propagates

    Returns a pair of dicts keyed by feature name: the converted values and
    the features' 'country' attribute.  Features sharing a name overwrite
    each other (the last one wins).
    """
    values = {}
    countries = {}
    for feature in parent.iterfind(tag):
        # Reset per feature so a measurement is never filed under the name
        # of the previously visited feature.
        feature_name = None
        for field in feature:
            if field.tag == 'name':
                feature_name = field.text
            elif field.tag == measure_tag and feature_name is not None:
                try:
                    value = convert(field.text)
                except (TypeError, ValueError):
                    # Missing text (None) or non-numeric text.
                    if skip_invalid:
                        continue
                    raise
                values[feature_name] = value
                countries[feature_name] = feature.get('country')
    return values, countries


# a) longest river
river_l, river_c = _collect_measure(root, 'river', 'length', float)
ri_l_s = sorted(river_l.items(), key=operator.itemgetter(1))
print('%s %s %s' % (ri_l_s[-1][0], river_c[ri_l_s[-1][0]], ri_l_s[-1][1]))

# b) largest lake
lake_a, lake_c = _collect_measure(root, 'lake', 'area', float)
la_l_s = sorted(lake_a.items(), key=operator.itemgetter(1))
print('%s %s %s' % (la_l_s[-1][0], lake_c[la_l_s[-1][0]], la_l_s[-1][1]))

# c) airport at highest elevation; elevations that are not valid integers
# are ignored rather than aborting the scan.
airport_e, airport_c = _collect_measure(root, 'airport', 'elevation', int,
                                        skip_invalid=True)
ap_l_s = sorted(airport_e.items(), key=operator.itemgetter(1))
print('%s %s %s' % (ap_l_s[-1][0], airport_c[ap_l_s[-1][0]], ap_l_s[-1][1]))


Amazonas CO BR PE 6448.0
Caspian Sea R AZ KAZ IR TM 386400.0
El Alto Intl BOL 4063
