# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [85]:
from xml.etree import ElementTree as ET
from collections import Counter

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [6]:
document = ET.parse( './data/mondial_database.xml' )

In [154]:
#top ten highest infant mortality rates by country
def top_mort(document):
    mort_rates = {}
    for element in document.getroot().iterfind('country'):
        if element.find('infant_mortality') is not None:
             mort_rates[element.find('name').text] = float(element.find('infant_mortality').text)
    #imported python Counter module for ease in finding top values
    #will use counter for rest of assignment
    print dict(Counter(mort_rates).most_common(10))

top_mort(document)


{'Afghanistan': 117.23, 'Angola': 79.99, 'Chad': 90.3, 'Western Sahara': 145.82, 'Central African Republic': 92.86, 'Somalia': 100.14, 'Guinea-Bissau': 90.92, 'Mali': 104.34, 'Niger': 86.27, 'Burkina Faso': 76.8}


In [153]:
#top ten most populous cities
def city_pop(document):
    cities = {}
    for element in document.iterfind('country'):
    #initially i stored the dict {country : {city: population}}, but I decided to just store city pop for ease of computation
        for subelement in element.getiterator('city'):
            if subelement.find('population') is not None: # and int(subelement.find('population').attrib['year']) > 2000:
                #grabbing the last population sub element and setting it for the city name in the dict
                cities[subelement.find('name').text] = int(subelement.findall('population')[-1].text)
            #discard cities without population values
            #else:
              #  countries[element.find('name').text][subelement.find('name').text] = 'hi'
    print dict(Counter(cities).most_common(10))

#could pretty print to chart wnd take care of UTF formatting issues    
city_pop(document)

{u'S\xe3o Paulo': 11152344, 'Guangzhou': 11071424, 'Istanbul': 13710512, 'Beijing': 11716620, 'Tianjin': 11090314, 'Shanghai': 22315474, 'Delhi': 11034555, 'Mumbai': 12442373, 'Shenzhen': 10358381, 'Moskva': 11979529}


In [151]:

#top ten ethnicities in the world by total population
def total_ethnic(document):
    total_ethnic = {}
    for element in document.iterfind('country'):
        population = int(element.findall('population')[-1].text)
        #get the percentages for each ethnic group in the population, 
        #get real numbers from population, add to global dict
        for sub in element.getiterator('ethnicgroup'):
            if sub.text in total_ethnic:
                total_ethnic[sub.text] += (float(sub.attrib['percentage']) * population) / 100.0
            else:
                total_ethnic[sub.text] = (float(sub.attrib['percentage']) * population) / 100.0
    print dict(Counter(total_ethnic).most_common(10))
            
total_ethnic(document)

{'Han Chinese': 1245058800.0, 'Japanese': 126534212.0, 'Mestizo': 157734354.93699998, 'Malay': 121993550.374, 'Bengali': 146776916.72, 'African': 318325120.369, 'Russian': 131856996.077, 'Dravidian': 302713744.25, 'Indo-Aryan': 871815583.44, 'European': 494872219.71959996}


In [150]:
def top_attributes(document):
    top_river = (0, '')
    top_lake = (0, '')
    top_airport = (0, '')
    for element in document.iterfind('river'):
        if element.find('length') is not None and float(element.find('length').text) >= top_river[0]:
            top_river = (float(element.find('length').text), element.find('name').text, element.attrib['country'])
    print top_river    
    for element in document.iterfind('lake'):
        if element.find('area') is not None and float(element.find('area').text) >= top_lake[0]:
            top_lake = (float(element.find('area').text), element.find('name').text, element.attrib['country'])
    print top_lake   
    for element in document.iterfind('airport'):
        #apparently some elevation tags don't have a text element
        if element.find('elevation') is not None and element.find('elevation').text and int(element.find('elevation').text) >= top_airport[0]:
            top_airport = (float(element.find('elevation').text), element.find('name').text, element.attrib['country'])
    print top_airport
    
#can pretty print output, but it is (measurement, name, countries) in the output    
top_attributes(document)

(6448.0, 'Amazonas', 'CO BR PE')
(386400.0, 'Caspian Sea', 'R AZ KAZ IR TM')
(4063.0, 'El Alto Intl', 'BOL')
