# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [8]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':', )
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [27]:
from operator import itemgetter, attrgetter

In [31]:
document = ET.parse( './data/mondial_database.xml' )
document_tree = document#ET.parse("data.xml")
# this element holds the phonebook entries

data={}

cities={}

etnic_groups={}

for element in document_tree.iterfind('country'):
    #print '* ' + element.find('name').text + ':',
    #print ("%s" % (element.findtext(prop)))
    key=element.find('name').text    
    try: val=float(element.findtext("infant_mortality"))
    except: continue
    data[key]=val
    
    for subelement in element.getiterator('city'):
        ckey=subelement.find('name').text
        try: cval=float(subelement.findtext("population"))
        except: continue
        cities[ckey]=cval

    #find latest population
    # First, extract all populations
    year_popul={}
    for subelement in element.getiterator('population'):
        #print subelement.attrib        
        #print int(subelement.attrib['year']), int(subelement.text)
        #print subelement.tag        
        try: 
            key=int(subelement.attrib['year'])
            val=int(subelement.text)
            #print key, val
        except: continue
        year_popul[key]=val
    
      #extract the latest population
    ptemp=sorted( zip(  year_popul.keys(), year_popul.values() ) , key = itemgetter(0) , reverse=True ) 
    try: 
        #print ptemp[0]
        latest_population=float(ptemp[0][1])
    except: continue
    
    
    #add etnic group percentage * latest population    
    for subelement in element.getiterator('ethnicgroup'):
        #print subelement.text, subelement.attrib["percentage"]
        key=subelement.text
        try: 
            perc=float(subelement.attrib["percentage"])
            val=0.01*perc*latest_population
        except: continue
            
        try: 
            curr_val=etnic_groups[key]
            etnic_groups[key]=curr_val+val
        except: etnic_groups[key]=val
        #print key, etnic_groups[key]      
    #ex3=sorted( zip(  etnic_groups.keys(), etnic_groups.values() ) , key = operator.itemgetter(1) , reverse=True )
    #print ex3[0]


#print data
ex1=sorted( zip(  data.keys(), data.values() ) , key = itemgetter(1) , reverse=False ) 
print ('''\n 10 countries with the lowest infant mortality rates''')
print (ex1[0:9])

ex2=sorted( zip(  cities.keys(), cities.values() ) , key = itemgetter(1) , reverse=True ) 
print ('''\n 10 cities with the largest population''')
print (ex2[0:9])

ex3=sorted( zip(  etnic_groups.keys(), etnic_groups.values() ) , key = itemgetter(1) , reverse=True ) 
print ('''\n 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries''')
print (ex3[0:9])


 10 countries with the lowest infant mortality rates
[('Monaco', 1.81), ('Japan', 2.13), ('Norway', 2.48), ('Bermuda', 2.48), ('Singapore', 2.53), ('Sweden', 2.6), ('Czech Republic', 2.63), ('Hong Kong', 2.73), ('Macao', 3.13)]

 10 cities with the largest population
[('Seoul', 10229262.0), ('Mumbai', 9925891.0), ('São Paulo', 9412894.0), ('Jakarta', 8259266.0), ('Shanghai', 8205598.0), ('Ciudad de México', 8092449.0), ('Moskva', 8010954.0), ('Tokyo', 7843000.0), ('Beijing', 7362426.0)]

 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries
[('Malay', 89413964.576), ('Eastern Hamitic', 82830376.53), ('Viet/Kinh', 76078375.30000001), ('Thai', 51084156.45), ('Arab-Berber', 50583952.045), ('Arab', 42402739.5), ('African', 40986981.3645), ('Mangbetu-Azande', 27986022.45), ('Han Chinese', 27175500.0)]
