# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
document = ET.parse( './data/mondial_database.xml' )

### Question 1

In [6]:
inf_mort = {}
for element in document.iterfind('country'):
    try:
         inf_mort[element.find('name').text] = element.find('infant_mortality').text
    except AttributeError:
        continue

In [7]:
import pandas as pd

inf_mort = pd.DataFrame.from_dict(inf_mort, orient='index')
inf_mort.columns = ['infant_mortality']
inf_mort['infant_mortality'] = inf_mort['infant_mortality'].astype('float')

In [8]:
inf_mort.sort_values('infant_mortality').head(10)

Unnamed: 0,infant_mortality
Monaco,1.81
Japan,2.13
Norway,2.48
Bermuda,2.48
Singapore,2.53
Sweden,2.6
Czech Republic,2.63
Hong Kong,2.73
Macao,3.13
Iceland,3.15


### Question 2

In [36]:
cities = []

for element in document.iterfind('country'):
    if element.findall('city'):
        for city in element.findall('city'):
            if city.findall('population'):
                 cities.append({'city': city.find('name').text,
                                'population': int(city.findall('population')[-1].text)})
cities = pd.DataFrame(cities)
cities.sort_values('population', ascending=False).head(10)

Unnamed: 0,city,population
165,Seoul,9708483
154,Al Qahirah,8471859
75,Bangkok,7506700
123,Hong Kong,7055071
87,Ho Chi Minh,5968384
201,Singapore,5076700
153,Al Iskandariyah,4123869
205,New Taipei,3939305
166,Busan,3403135
102,Pyongyang,3255288


### Question 3

In [67]:
data = []
for element in document.findall('country'):
    if element.findall('population'):
        country = element.find('name').text
        population = int(element.findall('population')[-1].text)
    if element.findall('ethnicgroup'):
        for x in element.findall('ethnicgroup'):
            data.append({'country': country,
                         'percentage': float(x.attrib['percentage'])/100, 
                         'ethnicgroup': x.text,
                         'population': float(population)})
ethnicpop = pd.DataFrame(data)
ethnicpop['ethnicpop'] = ethnicpop['percentage'] * ethnicpop['population']
ethnicpop.groupby('ethnicgroup').sum()['ethnicpop'].sort_values(ascending=False).head(10)

ethnicgroup
Han Chinese    1.245059e+09
Indo-Aryan     8.718156e+08
European       4.948722e+08
African        3.183251e+08
Dravidian      3.027137e+08
Mestizo        1.577344e+08
Bengali        1.467769e+08
Russian        1.318570e+08
Japanese       1.265342e+08
Malay          1.219936e+08
Name: ethnicpop, dtype: float64

### Question 4

Part A

In [89]:
rivers = []
for river in document.findall('river'):
    if river.find('length') is not None:
        rivers.append({
                'name': river.find('name').text,
                'length': float(river.find('length').text),
                'country': river.find('source').attrib['country']
            })
rivers = pd.DataFrame(rivers)
rivers.sort_values('length', ascending=False).head(1)

Unnamed: 0,country,length,name
174,PE,6448,Amazonas


Part B

In [101]:
lakes = []
for lake in document.findall('lake'):
    if lake.find('area') is not None:
        lakes.append({
                'name': lake.find('name').text,
                'area': float(lake.find('area').text),
                'country': lake.attrib['country']
            })
lakes = pd.DataFrame(lakes)
lakes.sort_values('area', ascending=False).head(1)

Unnamed: 0,area,country,name
54,386400,R AZ KAZ IR TM,Caspian Sea


Part C

In [107]:
airports = []
for airport in document.findall('airport'):
    if airport.find('elevation').text is not None:
        airports.append({
                'name': airport.find('name').text,
                'elevation': float(airport.find('elevation').text),
                'country': airport.attrib['country']
            })
airports = pd.DataFrame(airports)
airports.sort_values('elevation', ascending=False).head(1)

Unnamed: 0,country,elevation,name
80,BOL,4063,El Alto Intl
