# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [2]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [8]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [13]:
# print names of all countries
for child in document_tree.getroot():
    print( child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [15]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print( '* ' + element.find('name').text + ':',)
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print( capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


In [4]:
import pandas as pd

****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [3]:
document = ET.parse( './data/mondial_database.xml' )

In [39]:
import pandas as pd
i = 0
df = pd.DataFrame(columns=['country', 'mort'])
for country in document.iterfind(path='./country'):
    for mort in country.getiterator('infant_mortality'):
        df.loc[i] = [country.find('name').text, mort.text]
        i += 1
    

df.head()
df.mort = df.mort.astype(dtype='float')

Unnamed: 0,country,mort
0,Albania,13.19
1,Greece,4.78
2,Macedonia,7.9
3,Serbia,6.16
4,Andorra,3.69


In [45]:
df.sort_values(by='mort').head(10)

Unnamed: 0,country,mort
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
72,Hong Kong,2.73
73,Macao,3.13
39,Iceland,3.15


In [65]:
# build dataframe for the population in the cities
citypop = pd.DataFrame(columns=['city', 'population'])
i = 0
for element in document.iterfind('country'):
    for subelement in element.getiterator('city'):
        city = subelement.find('name').text 
        pop = 0
        for ipop in subelement.getiterator('population'):
            vpop = int(ipop.text)
            if pop <  vpop:
                pop  = vpop
        citypop.loc[i] = [city, pop]
        i += 1
        


Unnamed: 0,city,population
0,Tirana,418495.0
1,Shkodër,77075.0
2,Durrës,113249.0
3,Vlorë,79513.0
4,Elbasan,78703.0


In [68]:
citypop.sort_values(by='population', ascending=False).head(10)

Unnamed: 0,city,population
1341,Shanghai,22315474.0
771,Istanbul,13710512.0
1582,Delhi,12877470.0
1527,Mumbai,12442373.0
479,Moskva,11979529.0
1340,Beijing,11716620.0
2810,São Paulo,11152344.0
1342,Tianjin,11090314.0
1064,Guangzhou,11071424.0
1067,Shenzhen,10358381.0


In [12]:
ethnicgroups = pd.DataFrame(columns=['country', 'ethnicity', 'population'])

i = 0

for country in document.iterfind('country'):
    yearv = 0
    popnm = 0
    for popu in country.iterfind('population'):
        yearvc = int(popu.get('year'))
        popn = int(popu.text)
        if yearv < yearvc :
            yearv = yearvc
            popnm = popn
    for etnos in country.iterfind('ethnicgroup'):
        perc = float(etnos.get('percentage'))
        etname = etnos.text
        ethnicgroups.loc[i] = [country.find('name').text, etname, perc*popnm*0.01]
        i += 1

In [22]:
val = ethnicgroups.groupby('ethnicity').sum()
val.sort_values(by='population', ascending=False).head(10)

Unnamed: 0_level_0,population
ethnicity,Unnamed: 1_level_1
Han Chinese,1245059000.0
Indo-Aryan,871815600.0
European,494872200.0
African,318325100.0
Dravidian,302713700.0
Mestizo,157734400.0
Bengali,146776900.0
Russian,131857000.0
Japanese,126534200.0
Malay,121993600.0


In [45]:
max_river = 0
country = 'none'
river_name = ''
for river in document.findall('./river'):
    if river.find('length') is None :
        continue
    river_length = float(river.find('length').text)
    if max_river < river_length:
        max_river = river_length
        country = river.get('country')
        river_name = river.find('name').text
print(country, river_name)

CO BR PE Amazonas


In [47]:
area = 0
lake_name = ''
countryn = ''
for lake in document.iterfind('./lake'):
    if lake.find('area') is None :
        continue
    lake_area = float(lake.find('area').text)
    if area < lake_area:
        area = lake_area
        lake_name = lake.find('name').text
        countryn = lake.get('country')

print(countryn, lake_name)

R AZ KAZ IR TM Caspian Sea


In [49]:
airportn = ''
elevation = 0
country = ''

for airport in document.iterfind('./airport'):
    if airport.find('elevation') is None or airport.find('elevation').text is None:
        continue
    elev = float(airport.find('elevation').text)
    if elevation < elev:
        elevation = elev
        country = airport.get('country')
        airportn = airport.find('name').text
print(country, airportn)

BOL El Alto Intl
