# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [5]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [6]:
# Parse the smaller Mondial sample file into an ElementTree; the two example
# cells that follow read their data from `document_tree`.
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [7]:
# Print the <name> of every element directly under the root, one per line.
for country_node in document_tree.getroot():
    name_node = country_node.find('name')
    print(name_node.text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [16]:
# Print each country as a "* Name:" heading followed by a comma-separated line
# listing every city found anywhere beneath that country element.
for country_node in document_tree.iterfind('country'):
    # A Python 2 style trailing comma after print(...) does not suppress the
    # newline in Python 3, so it is dropped; the heading stays on its own line.
    print('* ' + country_node.find('name').text + ':')
    # Element.iter() is the supported replacement for getiterator(), which was
    # removed in Python 3.9. It also matches cities nested inside provinces.
    city_names = [city_node.find('name').text
                  for city_node in country_node.iter('city')]
    # join() yields an empty line for a country without cities, as before.
    print(', '.join(city_names))

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [37]:
#Question1: 10 countries with the lowest infant mortality rates
import pandas as pd

# Load the full data set; `document` and `root` are reused by later cells.
document = ET.parse('./data/mondial_database.xml')
root = document.getroot()

# Parallel lists: one entry per country that reports an infant mortality rate.
country = []
mortality = []
for country_node in root.iterfind('country'):
    rate_node = country_node.find('infant_mortality')
    if rate_node is None:
        # No rate recorded for this country, so it cannot be ranked.
        continue
    mortality.append(float(rate_node.text))
    country.append(country_node.find('name').text)

df = pd.DataFrame({'Country': country, 'Infant mortality': mortality})
# Ascending sort puts the lowest rates first.
df.sort_values(['Infant mortality']).head(10)

    

Unnamed: 0,Country,Infant mortality
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
72,Hong Kong,2.73
73,Macao,3.13
39,Iceland,3.15


In [39]:
#Question2: 10 cities with the largest population

# Parallel lists: one entry per city that has at least one <population> child.
city = []
population = []
# ElementTree.iter() replaces getiterator(), which was removed in Python 3.9;
# it visits every <city> element at any depth of the document.
for city_node in document.iter('city'):
    population_nodes = city_node.findall('population')
    if not population_nodes:
        # Cities without a population figure cannot be ranked.
        continue
    city.append(city_node.find('name').text)
    # The last <population> child is used as the city's figure
    # (presumably the most recent estimate -- TODO confirm the data ordering).
    population.append(float(population_nodes[-1].text))

df1 = pd.DataFrame({'City': city, 'Population': population})
df1.sort_values(['Population'], ascending=False).head(10)


Unnamed: 0,City,Population
1251,Shanghai,22315474.0
707,Istanbul,13710512.0
1421,Mumbai,12442373.0
443,Moskva,11979529.0
1250,Beijing,11716620.0
2594,São Paulo,11152344.0
1252,Tianjin,11090314.0
974,Guangzhou,11071424.0
1467,Delhi,11034555.0
977,Shenzhen,10358381.0


In [41]:
#Question3: 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

# Parallel lists with one entry per (country, ethnic group) pair. They are
# filled in a single pass so that every share is matched with the population of
# the country it belongs to, instead of relying on two separate traversals
# happening to produce lists of the same length and order.
e_group = []
e_percent = []
country = []
population = []
# ElementTree.iter() replaces getiterator(), which was removed in Python 3.9.
for country_node in document.iter('country'):
    group_nodes = country_node.findall('ethnicgroup')
    population_nodes = country_node.findall('population')
    if not group_nodes or not population_nodes:
        # Nothing to attribute without both a share and a population figure.
        continue
    country_name = country_node.find('name').text
    # The last <population> child is used as the country's figure
    # (presumably the most recent estimate -- TODO confirm the data ordering).
    latest_population = float(population_nodes[-1].text)
    for group_node in group_nodes:
        country.append(country_name)
        population.append(latest_population)
        e_group.append(group_node.text)
        # The 'percentage' attribute is converted to a fraction.
        e_percent.append(float(group_node.attrib['percentage']) / 100)

df3 = pd.DataFrame({'Ethnic Group': e_group, 'Ethnic percent': e_percent})
df3['Ethnic population'] = df3['Ethnic percent'] * population
# Note: after the group-wise sum, 'Ethnic percent' is a sum of fractions taken
# from different countries and is not a meaningful share by itself; the ranking
# is driven by 'Ethnic population' only.
df3.groupby('Ethnic Group').sum().sort_values(['Ethnic population'], ascending=False).head(10)
    

Unnamed: 0_level_0,Ethnic percent,Ethnic population
Ethnic Group,Unnamed: 1_level_1,Unnamed: 2_level_1
Han Chinese,0.915,1245059000.0
Indo-Aryan,0.72,871815600.0
European,9.7082,494872200.0
African,18.6855,318325100.0
Dravidian,0.25,302713700.0
Mestizo,8.707,157734400.0
Bengali,0.98,146776900.0
Russian,2.241,131857000.0
Japanese,0.994,126534200.0
Malay,2.423,121993600.0


In [44]:
#Question4   name and country of a) longest river

# Parallel lists: one entry per river that has a <length> child.
river_name = []
country_cd = []
river_length = []
country = []  # not used in this cell; kept so the notebook-level name is reset as before
# ElementTree.iter() replaces getiterator(), which was removed in Python 3.9.
for river_node in document.iter('river'):
    length_node = river_node.find('length')
    if length_node is None:
        continue
    river_name.append(river_node.find('name').text)
    river_length.append(float(length_node.text))
    # The 'country' attribute holds space-separated car codes, e.g. 'CO BR PE'.
    country_cd.append(river_node.get('country', ''))
df4 = pd.DataFrame({'Length of the river':river_length, 'River':river_name, 'Country':country_cd})

longest_river = df4.sort_values(by = 'Length of the river', ascending=False).head(1)

# Translate the car codes of the top row into country names instead of
# hard-coding codes copied from an earlier run. Names are printed in document
# order of the <country> elements.
longest_river_codes = set()
if not longest_river.empty:
    longest_river_codes = set(longest_river['Country'].iloc[0].split())
for country_node in document.iter('country'):
    if country_node.get('car_code') in longest_river_codes:
        print (country_node.find('name').text)

longest_river
    


Colombia
Brazil
Peru


Unnamed: 0,Country,Length of the river,River
174,CO BR PE,6448.0,Amazonas


In [51]:
 #Question4 b) largest lake
    
# Parallel lists: one entry per lake that has an <area> child.
lake_name = []
country_cd = []
lake_area = []
country = []  # not used in this cell; kept so the notebook-level name is reset as before
# ElementTree.iter() replaces getiterator(), which was removed in Python 3.9.
for lake_node in document.iter('lake'):
    area_node = lake_node.find('area')
    if area_node is None:
        continue
    lake_name.append(lake_node.find('name').text)
    lake_area.append(float(area_node.text))
    # The 'country' attribute holds space-separated car codes, e.g. 'R AZ KAZ IR TM'.
    country_cd.append(lake_node.get('country', ''))
df5 = pd.DataFrame({'Area of the lake':lake_area, 'Lake':lake_name, 'Country':country_cd})

largest_lake = df5.sort_values(by = 'Area of the lake', ascending=False).head(1)

# Translate the car codes of the top row into country names instead of
# hard-coding codes copied from an earlier run. Names are printed in document
# order of the <country> elements.
largest_lake_codes = set()
if not largest_lake.empty:
    largest_lake_codes = set(largest_lake['Country'].iloc[0].split())
for country_node in document.iter('country'):
    if country_node.get('car_code') in largest_lake_codes:
        print (country_node.find('name').text)

largest_lake


Russia
Iran
Turkmenistan
Azerbaijan
Kazakhstan


Unnamed: 0,Area of the lake,Country,Lake
54,386400.0,R AZ KAZ IR TM,Caspian Sea


In [52]:
#Question4 c) airport at highest elevation

# Parallel lists: one entry per airport with a usable elevation value.
airport=[]
elev=[]
country_cd=[]
# ElementTree.iter() replaces getiterator(), which was removed in Python 3.9.
for airport_node in document.iter('airport'):
    elevation_node = airport_node.find('elevation')
    # Guard against both a missing <elevation> element (find() returns None, so
    # reading .text would raise AttributeError) and an empty one.
    elevation_text = '' if elevation_node is None else (elevation_node.text or '').strip()
    if not elevation_text:
        continue
    airport.append(airport_node.find('name').text)
    elev.append(float(elevation_text))
    country_cd.append(airport_node.get('country', ''))
# NOTE: the 'Aiport' column label is left as-is so the frame's columns are unchanged.
df6=pd.DataFrame({'Aiport':airport,'Elevation':elev,'Country':country_cd})

highest_airport = df6.sort_values(by = 'Elevation', ascending=False).head(1)

# Translate the car code(s) of the top row into country names instead of
# hard-coding a code copied from an earlier run.
highest_airport_codes = set()
if not highest_airport.empty:
    highest_airport_codes = set(highest_airport['Country'].iloc[0].split())
for country_node in document.iter('country'):
    if country_node.get('car_code') in highest_airport_codes:
        print (country_node.find('name').text)

highest_airport

Bolivia


Unnamed: 0,Aiport,Country,Elevation
80,El Alto Intl,BOL,4063.0
