# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
# Parse the example XML file into an ElementTree used by the cells below.
document_tree = ET.parse('./data/mondial_database_less.xml')

In [24]:
# Print the name of every <country> element directly under the root,
# one name per line.
for country in document_tree.iterfind('country'):
    country_name = country.find('name').text
    print(country_name)


Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [26]:
# Print each country followed by a comma-separated list of its cities,
# e.g. "* Albania: Tirana, Durres".
for country in document_tree.iterfind('country'):
    # Element.iter() replaces getiterator(), which is deprecated since
    # Python 2.7 and removed in 3.9.  It yields matching <city> elements
    # at any depth below the country, not only direct children.
    city_names = [city.find('name').text for city in country.iter('city')]
    # join() builds the list in one step, so no trailing ", " has to be
    # sliced off afterwards.
    print('* ' + country.find('name').text + ': ' + ', '.join(city_names))

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [25]:
# Parse the XML file that the exercise cells below query.
document = ET.parse('./data/mondial_database.xml')

In [52]:
# Number 1
# First attempt: print every country whose infant mortality is below a
# hand-picked cutoff of 3.3.
for country in document.iterfind('country'):
    mortality = country.find('infant_mortality')
    if mortality is None:
        # No figure recorded for this country; nothing to compare.
        continue
    if float(mortality.text) < 3.3:
        print(country.find('name').text + ": " + mortality.text)
# Not very useful: the cutoff had to be guessed by trial and error, and the
# result is not ordered by rate.

Czech Republic: 2.63
Norway: 2.48
Sweden: 2.6
Monaco: 1.81
Iceland: 3.15
Hong Kong: 2.73
Macao: 3.13
Japan: 2.13
Singapore: 2.53
Bermuda: 2.48


In [261]:
import pandas as pd

# Number 1, second attempt: collect (country, rate) pairs and let pandas
# rank them, so no cutoff has to be guessed.
mortality_countries = []
mortality_rates = []
# ElementTree.iter() replaces getiterator(), which is deprecated since
# Python 2.7 and removed in 3.9.
for country in document.iter('country'):
    mortality = country.find('infant_mortality')
    if mortality is not None:
        mortality_countries.append(country.find('name').text)
        mortality_rates.append(float(mortality.text))
df = pd.DataFrame({'country': mortality_countries, 'mortality': mortality_rates})
# Ascending sort puts the lowest rates first; the last expression is the
# value the notebook displays.
df.set_index('country').mortality.sort_values().head(10)
# I think this one is more useful overall

country
Monaco            1.81
Japan             2.13
Bermuda           2.48
Norway            2.48
Singapore         2.53
Sweden            2.60
Czech Republic    2.63
Hong Kong         2.73
Macao             3.13
Iceland           3.15
Name: mortality, dtype: float64

In [262]:
# Number 2
# Rank cities by population and show the ten largest.
city = []
pop = []
# ElementTree.iter() replaces getiterator(), which is deprecated since
# Python 2.7 and removed in 3.9.
for element in document.iter('city'):
    counts = element.findall('population')
    if counts:
        city.append(element.find('name').text)
        # Uses the last <population> child -- presumably the most recent
        # figure; confirm against the data file.
        pop.append(float(counts[-1].text))
df2 = pd.DataFrame({'city': city, 'population': pop})
df2.set_index('city').population.sort_values(ascending=False).head(10)

city
Shanghai     22315474
Istanbul     13710512
Mumbai       12442373
Moskva       11979529
Beijing      11716620
São Paulo    11152344
Tianjin      11090314
Guangzhou    11071424
Delhi        11034555
Shenzhen     10358381
Name: population, dtype: float64

In [302]:
# Number 3
# Estimate the head count of each ethnic group as
# (percentage / 100) * country population, summed over all countries.
ethnic_group = []
ethnic_percent = []
country = []
population = []
# A single pass keeps every ethnic group paired with its own country,
# instead of relying on two separate traversals producing lists that
# happen to line up.  ElementTree.iter() replaces the deprecated
# getiterator() (removed in Python 3.9).
for element in document.iter('country'):
    groups = element.findall('ethnicgroup')
    counts = element.findall('population')
    if not groups or not counts:
        # Without both a group breakdown and a population figure there is
        # nothing to estimate for this country.
        continue
    country_name = element.find('name').text
    # Uses the last <population> child -- presumably the most recent
    # figure; confirm against the data file.
    country_total = float(counts[-1].text)
    for group in groups:
        country.append(country_name)
        population.append(country_total)
        ethnic_group.append(group.text)
        ethnic_percent.append(float(group.attrib['percentage']) / 100)

df3 = pd.DataFrame({'country': country, 'ethnic_group': ethnic_group, 'ethnic_percent': ethnic_percent, 'total_population': population})
df3['ethnic_population'] = df3.ethnic_percent * df3.total_population
# Select the numeric column before summing so the string `country` column
# is not aggregated along with it.
df3.groupby('ethnic_group').ethnic_population.sum().sort_values(ascending=False).head(10)

ethnic_group
Han Chinese    1.245059e+09
Indo-Aryan     8.718156e+08
European       4.948722e+08
African        3.183251e+08
Dravidian      3.027137e+08
Mestizo        1.577344e+08
Bengali        1.467769e+08
Russian        1.318570e+08
Japanese       1.265342e+08
Malay          1.219936e+08
Name: ethnic_population, dtype: float64

In [434]:
# Number 4a
# Find the longest river and the names of the countries it is listed under.
river_name = []
river_length = []
country_code = []

# ElementTree.iter() replaces getiterator(), which is deprecated since
# Python 2.7 and removed in 3.9.
for element in document.iter('river'):
    length = element.find('length')
    if length is not None:
        river_name.append(element.find('name').text)
        river_length.append(float(length.text))
        country_code.append(element.attrib['country'])

df4a = pd.DataFrame({'river_name': river_name, 'river_length': river_length, 'country_code': country_code})
longest_river = df4a.sort_values('river_length', ascending=False).head(1)
print(longest_river)

# The river's `country` attribute is a space-separated list of car codes.
# Read the codes from the winning row rather than copying them by hand
# from the printed output.
river_codes = set()
for codes in longest_river.country_code:
    river_codes.update(codes.split())
# Countries are visited in document order, so names print in that order.
print(' '.join(nation.find('name').text
               for nation in document.iter('country')
               if nation.attrib.get('car_code') in river_codes))

    country_code  river_length river_name
174     CO BR PE          6448   Amazonas
Colombia Brazil Peru


In [439]:
# Number 4b
# Find the lake with the largest area and the names of the countries it is
# listed under.
lake_name = []
lake_area = []
country_code = []
# ElementTree.iter() replaces getiterator(), which is deprecated since
# Python 2.7 and removed in 3.9.
for element in document.iter('lake'):
    area = element.find('area')
    if area is not None:
        lake_name.append(element.find('name').text)
        lake_area.append(float(area.text))
        country_code.append(element.attrib['country'])
df4b = pd.DataFrame({'lake_name': lake_name, 'lake_area': lake_area, 'country_code': country_code})
largest_lake = df4b.sort_values('lake_area', ascending=False).head(1)
print(largest_lake)

# The lake's `country` attribute is a space-separated list of car codes.
# Read the codes from the winning row rather than copying them by hand
# from the printed output.
lake_codes = set()
for codes in largest_lake.country_code:
    lake_codes.update(codes.split())
# Countries are visited in document order, so names print in that order.
print(' '.join(nation.find('name').text
               for nation in document.iter('country')
               if nation.attrib.get('car_code') in lake_codes))

      country_code  lake_area    lake_name
54  R AZ KAZ IR TM     386400  Caspian Sea
Russia Iran Turkmenistan Azerbaijan Kazakhstan


In [443]:
# Number 4c
# Find the airport at the highest elevation and the name of its country.
airport_name = []
airport_elevation = []
country_code = []
# ElementTree.iter() replaces getiterator(), which is deprecated since
# Python 2.7 and removed in 3.9.
for element in document.iter('airport'):
    elevation = element.find('elevation')
    # Skip airports with no <elevation> child at all (find() returns None)
    # as well as those whose <elevation> element is empty.
    if elevation is not None and elevation.text is not None:
        airport_name.append(element.find('name').text)
        airport_elevation.append(float(elevation.text))
        country_code.append(element.attrib['country'])
df4c = pd.DataFrame({'airport_name': airport_name, 'airport_elevation': airport_elevation, 'country_code': country_code})
highest_airport = df4c.sort_values('airport_elevation', ascending=False).head(1)
print(highest_airport)

# Read the car code(s) from the winning row rather than copying them by
# hand from the printed output.  split() also covers a space-separated
# list of codes.
airport_codes = set()
for codes in highest_airport.country_code:
    airport_codes.update(codes.split())
print(' '.join(nation.find('name').text
               for nation in document.iter('country')
               if nation.attrib.get('car_code') in airport_codes))

    airport_elevation  airport_name country_code
80               4063  El Alto Intl          BOL
Bolivia
