# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [74]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [75]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [76]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [77]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':', end=''),
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])

* Albania:Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:Skopje, Kumanovo
* Serbia:Beograd, Novi Sad, Niš
* Montenegro:Podgorica
* Kosovo:Prishtine
* Andorra:Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [78]:
document = ET.parse( './data/mondial_database.xml' )

In [89]:
# print child and attributes
#for child in document.getroot():
#    print (child.tag, child.attrib)

In [80]:
import pandas as pd

In [84]:
# Create a list of country and their Infant Mortality Rate 
country_imr=[]
for country in document.getroot().findall('country'):
    name = country.find('name').text
    infant_mortality_rate = country.find('infant_mortality')
    if infant_mortality_rate is not None:
        infant_mortality_rate=infant_mortality_rate.text
    else :
        infant_mortality_rate = -1
    country_imr.append((name, (float)(infant_mortality_rate)))

## 10 countries with the lowest infant mortality rates

In [88]:
df = pd.DataFrame(country_imr, columns=['Country', 'Infant_Mortality_Rate'])
df_unknown_removed = df[df.Infant_Mortality_Rate != -1] 
df_unknown_removed.set_index('Infant_Mortality_Rate').sort().head(10)

Unnamed: 0_level_0,Country
Infant_Mortality_Rate,Unnamed: 1_level_1
1.81,Monaco
2.13,Japan
2.48,Bermuda
2.48,Norway
2.53,Singapore
2.6,Sweden
2.63,Czech Republic
2.73,Hong Kong
3.13,Macao
3.15,Iceland


In [149]:
city_population=[]
for country in document.iterfind('country'):
    for state in country.iterfind('province'):
        for city in state.iterfind('city'):
            try:
                city_population.append((city.find('name').text, float(city.find('population').text)))
            except:
                next
    for city in country.iterfind('city'):
        try:
            city_population.append((city.find('name').text, float(city.find('population').text)))
        except:
            next

## 10 cities with the largest population

In [165]:
df = pd.DataFrame(city_population, columns=['City', 'Population'])
#df.info()
df.sort_index(by='Population', ascending=False).head(10)

Unnamed: 0,City,Population
1763,Seoul,10229262
1421,Mumbai,9925891
2594,São Paulo,9412894
1629,Jakarta,8259266
1251,Shanghai,8205598
1942,Ciudad de México,8092449
443,Moskva,8010954
1725,Tokyo,7843000
1250,Beijing,7362426
1467,Delhi,7206704


In [229]:
ethnic_population={}
country_population={}
for country in document.iterfind('country'):
    try:
        country_population[country.find('name').text]= float(country.find('population').text)
    except:
        next
    for state in country.iterfind('province' or 'state'):
        try:
            country_population[country.find('name').text] += float(state.find('population').text)
        except:
            next
        for city in state.iterfind('city'):
            try:
                country_population[country.find('name').text] += float(city.find('population').text)
            except:
                next

for country in document.iterfind('country'):
    for ethnicgroup in country.iterfind('ethnicgroup'):
        try:
            if ethnicgroup.text in ethnic_population:
                ethnic_population[ethnicgroup.text] += country_population[country.find('name').text]*float(ethnicgroup.get('percentage'))/100
            else:
                ethnic_population[ethnicgroup.text] = country_population[country.find('name').text]*float(ethnicgroup.get('percentage'))/100
        except:
            next

## 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [241]:
pd.DataFrame(sorted(ethnic_population.items(), key=lambda x:x[1], reverse=True)[:10], columns=['Ethnic_Groups', 'Population'])

Unnamed: 0,Ethnic_Groups,Population
0,Han Chinese,1593119000.0
1,Indo-Aryan,777635700.0
2,European,666851200.0
3,African,289667800.0
4,Russian,270558300.0
5,Dravidian,270012400.0
6,Japanese,250637100.0
7,German,170635800.0
8,Mestizo,166613900.0
9,Javanese,141317900.0


In [243]:
rivers_list=[]
rivers_df = pd.DataFrame()
for rivers in document.iterfind('river'):
    try:
        rivers_list.append({'name':rivers.find('name').text, 'length':rivers.find('length').text, 'country':rivers.find('located').attrib['country']})
    except:
        next
rivers_list

[{'country': 'N', 'length': '604', 'name': 'Glomma'},
 {'country': 'N', 'length': '322', 'name': 'Lagen'},
 {'country': 'S', 'length': '93', 'name': 'Goetaaelv'},
 {'country': 'S', 'length': '460', 'name': 'Klaraelv'},
 {'country': 'S', 'length': '470', 'name': 'Umeaelv'},
 {'country': 'S', 'length': '520', 'name': 'Dalaelv'},
 {'country': 'S', 'length': '320', 'name': 'Vaesterdalaelv'},
 {'country': 'S', 'length': '241', 'name': 'Oesterdalaelv'},
 {'country': 'SF', 'length': '145', 'name': 'Paatsjoki'},
 {'country': 'SF', 'length': '300', 'name': 'Ounasjoki'},
 {'country': 'SF', 'length': '550', 'name': 'Kemijoki'},
 {'country': 'SF', 'length': '107', 'name': 'Oulujoki'},
 {'country': 'SF', 'length': '203', 'name': 'Kymijoki'},
 {'country': 'SF', 'length': '121', 'name': 'Kokemaeenjoki'},
 {'country': 'SF', 'length': '162', 'name': 'Vuoksi'},
 {'country': 'GB', 'length': '346', 'name': 'Thames'},
 {'country': 'NL', 'length': '925', 'name': 'Maas'},
 {'country': 'F', 'length': '1013', 