# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [28]:
# import them libraries
from xml.etree import ElementTree as ET
import pandas as pd

# parse them xml
document = ET.parse( './data/mondial_database.xml' )

In [35]:
# QUESTION 1: 10 countries with the lowest infant mortality rates

# initialize lists
cty = []
imt = []

# loop through the countries in xml, check if it has 'infant_mortality' element, 
# insert the country and value into separate lists.
for country in document.iter('country'):
    i_mortality = country.find('infant_mortality')
    if i_mortality is not None:
        cty.append(country.find('name').text)
        imt.append(float(country.find('infant_mortality').text))

# we create a dataframe from the lists and sort it from lowest
infant_mortality = pd.DataFrame(imt, index=cty, columns=['infant_mortality'])        
infant_mortality.sort('infant_mortality', ascending=True).head(10)

Unnamed: 0,infant_mortality
Monaco,1.81
Japan,2.13
Bermuda,2.48
Norway,2.48
Singapore,2.53
Sweden,2.6
Czech Republic,2.63
Hong Kong,2.73
Macao,3.13
Iceland,3.15


In [64]:
# QUESTION 2: 10 cities with the largest population

# initialize them lists again...
ct  = []
pop = []

# loop through the countries, get all the cities
for country in document.iter('country'):
    cities = country.findall('city')
    for city in cities:
        if city is not None: 
            # since there are multiple 'population' elements,
            # i decided to take only population at year 2011
            population = city.find("population[@year='2011']")
            if population is not None:
                # put them values in the list
                ct.append(city.find('name').text)
                pop.append(int(population.text))
            
# put them list in dataframe, sort it by population, and take the top 10
population_df = pd.DataFrame(pop, index=ct, columns=['population'])
population_df.sort('population', ascending=False).head(10)

Unnamed: 0,population
Beograd,1639121
Montevideo,1318755
Sofia,1270284
Yerevan,1060138
Kathmandu,1003285
Zagreb,686568
Kingston,662426
Rīga,658640
Vilnius,535631
Dublin,525383


In [65]:
# Question 3

keyword    = ['estimate', 'est.']
ethnic_col = ['name', 'ethnicPopulation', 'group']
ethnic_df  = pd.DataFrame(columns=ethnic_col)

for countries in document.iterfind('country'):
    # re-initialise year variable to compare latest year for each country
    latest_year = 0
    highest_per = 0
    
    # if country has no ethnicgroup, will skip that country then
    if countries.find('ethnicgroup') is not None:
        for perc in countries.iterfind('ethnicgroup'):
            if perc.attrib['percentage'] > highest_per:
                highest_per = perc.attrib['percentage']
                
        highest_group = countries.find("ethnicgroup[@percentage='"+highest_per+"']").text
        # print countries.find('name').text + ' ' + highest_per
        # loop through the keyword dictionary to get estimated population element
        # that has different keywords.
        for key in keyword:
            for est_pop in countries.iterfind("population[@measured='"+key+"']"):
                if est_pop is not None and est_pop.attrib['year'] > latest_year:
                    latest_year  = est_pop.attrib['year']
                    estimate_pop = est_pop.text # always update the estimated population for latest year
        
        # calculate the ethnic population by getting the latest estimate population and highest ethnic percentage
        ethnic_pop = int(int(estimate_pop) * float(highest_per) / 100)
        
        # create a new entry and append to the DataFrame
        entry = pd.DataFrame([[countries.find('name').text, int(ethnic_pop), highest_group]], columns=ethnic_col) 
        ethnic_df = ethnic_df.append(entry, ignore_index=True)

# sort the DataFrame and get the top 10
ethnic_df.sort('ethnicPopulation', ascending=False).head(10)

Unnamed: 0,name,ethnicPopulation,group
44,China,1245058800,Han Chinese
97,United States,254958101,European
81,Japan,126534212,Japanese
21,Russia,114646210,Russian
165,Nigeria,94661176,African
23,Luxembourg,90653833,Luxembourgish
22,Belgium,83326819,Fleming
72,Egypt,82830376,Eastern Hamitic
62,Vietnam,76078375,Viet/Kinh
95,British Virgin Islands,71788221,Black


In [107]:
# Question 4

r_length = 0  # river length
l_area   = 0  # lake area
a_elev   = 0  # airport elevation

# get the longest river
for rivers in document.iterfind('river'):
    # compare the longest river. make sure all rivers have length
    if rivers.find('length') is not None:
        if float(rivers.find('length').text) > float(r_length):
            r_name   = rivers.find('name').text
            r_length = rivers.find('length').text
            r_loc    = rivers.attrib['country'].split()

riverCountryList = 'Country: '
for cty in r_loc:
    thisCountry = document.find("country[@car_code='"+cty+"']")
    if thisCountry is not None:
        riverCountryList += thisCountry.find('name').text + ' | '

        
# get the largest lake            
for lakes in document.iterfind('lake'):
    # compare the largest lake using area. make sure all lakes have area
    if lakes.find('area') is not None and float(lakes.find('area').text) > float(l_area):
        l_name = lakes.find('name').text
        l_area = lakes.find('area').text
        l_loc  = lakes.attrib['country'].split()  # split multiple countries from string to list

# from the country code we collect, get the country name
lakeCountryList = 'Country: '
for cty in l_loc:
    thisCountry = document.find("country[@car_code='"+cty+"']")
    if thisCountry is not None:
        lakeCountryList += thisCountry.find('name').text + ' | '
        
        
# get the highest elevated airport
for airports in document.iterfind('airport'):
    # compare the highest elevation using elevation. make sure all airports have elevation and is a digit
    thisElev = airports.find('elevation').text
    
    if thisElev is not None and thisElev.isdigit() and float(thisElev) > float(a_elev):
        a_name = airports.find('name').text
        a_elev = airports.find('elevation').text
        a_loc  = airports.attrib['country'].split()

airCountryList = 'Country: '
for cty in a_loc:
    thisCountry = document.find("country[@car_code='"+cty+"']")
    if thisCountry is not None:
        airCountryList += thisCountry.find('name').text

print r_name + '-' + r_length + 'KM ' + riverCountryList
print l_name + '-' + l_area + 'KM ' + lakeCountryList
print a_name + '-' + a_elev + 'M ' + airCountryList


Amazonas-6448KM Country: Colombia | Brazil | Peru | 
Caspian Sea-386400KM Country: Russia | Azerbaijan | Kazakhstan | Iran | Turkmenistan | 
El Alto Intl-4063M Country: Bolivia
