# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [25]:
from xml.etree import ElementTree as ET
import operator

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [7]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [13]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [14]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':',)
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [17]:
document = ET.parse( './data/mondial_database.xml' )

In [15]:
for childs in document_tree.getroot():
    print (childs.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


# 10 countries with the lowest infant mortality rates

In [27]:
infant_data = {}
for country in document.iterfind('country'):
    if country.find('infant_mortality') is None:
        pass
    else: 
        name = country.find('name').text
        im = country.find('infant_mortality').text
        infant_data[name] = float(im)
sorted_dict = sorted(infant_data.items(), key=operator.itemgetter(1))
print (sorted_dict[:10])

[('Monaco', 1.81), ('Japan', 2.13), ('Norway', 2.48), ('Bermuda', 2.48), ('Singapore', 2.53), ('Sweden', 2.6), ('Czech Republic', 2.63), ('Hong Kong', 2.73), ('Macao', 3.13), ('Iceland', 3.15)]


In [42]:
popdata = {}
for country in document.iterfind('country'):
    if country.find("population[@year='2000']") is None:
        pass
    else:
        name = country.find('name').text
        pop = country.find("population[@year='2000']").text
        popdata[name] = int(pop)
sorted_dict = sorted(popdata.items(), key=operator.itemgetter(1), reverse=True)
print(sorted_dict[:10])

[('China', 1242612226), ('United States', 281414181), ('Indonesia', 205132458), ('Brazil', 169799170), ('Russia', 146762881), ('Pakistan', 143832014), ('Japan', 125714674), ('Mexico', 97483412), ('Philippines', 76506928), ('Turkey', 67808719)]


# 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [49]:
ethinicdata = {}
for country in document.iterfind('country'):
    if country.find('./ethnicgroup[1][@percentage]') is None:
        pass
    else:
        lastpop = int(country.find('./population[last()]').text)
        ethnic = country.find('./ethnicgroup[1]')
        ethnicname = ethnic.text
        ethicperc = float(ethnic.get('percentage'))/100
        ethinicdata[country.find('name').text, ethnicname] = lastpop * ethicperc
  
sorted_dict = sorted(ethinicdata.items(), key=operator.itemgetter(1), reverse=True)
sorted_d[:10]

[(('China', 'Han Chinese'), 1245058800.0),
 (('India', 'Dravidian'), 302713744.25),
 (('United States', 'European'), 254958101.97759998),
 (('Nigeria', 'African'), 162651570.84),
 (('Bangladesh', 'Bengali'), 146776916.72),
 (('Japan', 'Japanese'), 126534212.00000001),
 (('Russia', 'Russian'), 114646210.938),
 (('Indonesia', 'Javanese'), 113456006.10000001),
 (('Brazil', 'European'), 108886717.794),
 (('Vietnam', 'Viet/Kinh'), 76078375.3)]

# name and country of  a) longest river

In [54]:
riverdata = {}
for river in document.iterfind('river'):
    name = river.get('id')
    country = river.get('country')
    length = river.find('./length')
    if length is None:
        pass
    else:
        #print(float(length.text))
        riverdata[name, country]=float(length.text)
    
sorted_d = sorted(riverdata.items(), key=operator.itemgetter(1), reverse=True)
sorted_d[:1]

[(('river-Amazonas', 'CO BR PE'), 6448.0)]

# b) largest lake

In [56]:
lakedata = {}
for lake in document.iterfind('lake'):
    name = lake.get('id')
    country = lake.get('country')
    area = lake.find('./area')
    if area is None:
        pass
    else:
        #print float(area.text)
        lakedata[name, country]=float(area.text)

sorted_d = sorted(lakedata.items(), key=operator.itemgetter(1), reverse=True)
sorted_d[:1]


[(('lake-KaspischesMeer', 'R AZ KAZ IR TM'), 386400.0)]

# c) airport at highest elevation

In [61]:
airportdata = {}
for airport in document.iterfind('airport'):
    name = airport.get('iatacode')
    country = airport.get('country')
    elevation = airport.findtext('./elevation')
    if elevation is None:
        pass
    elif elevation=='':
        pass
    else:
        #print(float(elevation))
        airportdata[name, country]=float(elevation)

sorted_d = sorted(airportdata.items(), key=operator.itemgetter(1), reverse=True)
sorted_d[:1]    


[(('LPB', 'BOL'), 4063.0)]