# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [2]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
# Load the sample XML file that the two example cells below walk through.
document_tree = ET.parse('./data/mondial_database_less.xml')

In [4]:
# Print the <name> text of every direct child of the document root
# (one line per child element).
root = document_tree.getroot()
for country_node in root:
    name_node = country_node.find('name')
    print(name_node.text)



Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# Print every country followed by a comma-separated list of its cities.
for country_node in document_tree.iterfind('country'):
    print('* ' + country_node.find('name').text + ':')
    # Element.getiterator() was removed in Python 3.9; Element.iter() is the
    # supported equivalent and likewise walks all descendants, so <city>
    # elements nested deeper inside the country are included too.
    city_names = [
        city_node.find('name').text
        for city_node in country_node.iter('city')
    ]
    # join() yields an empty string for a country without cities, matching
    # what slicing the hand-built string produced before.
    print(', '.join(city_names))

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [6]:
# Parse the full Mondial database; the exercise cells below query this tree.
document = ET.parse('./data/mondial_database.xml')

In [7]:
import pandas as pd
import numpy as np

In [8]:
# 10 countries with the lowest infant mortality rates
#
# Collect one (country, rate) pair per <country> element that has a <name>.
# Countries without an <infant_mortality> child get NaN so they sort last.
mortality_records = []
for country_node in document.iterfind('country'):
    name_node = country_node.find('name')
    if name_node is None:
        continue
    rate_node = country_node.find('infant_mortality')
    rate_value = np.nan if rate_node is None else rate_node.text
    mortality_records.append((name_node.text, rate_value))

data = pd.DataFrame(mortality_records, columns=['country', 'infant_mortality_rate'])
# The rates are read as text from the XML; convert before sorting numerically.
data['infant_mortality_rate'] = data['infant_mortality_rate'].astype(float)

data.sort_values(by='infant_mortality_rate').head(10)

Unnamed: 0,country,infant_mortality_rate
38,Monaco,1.81
98,Japan,2.13
117,Bermuda,2.48
36,Norway,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


In [9]:
# 10 cities with the largest population
#
# Build one (city, population) record per <city> element found anywhere
# below a <country>. A city may carry several <population> entries; the last
# one in document order is used.
# NOTE(review): this presumes the entries are listed oldest-to-newest, so the
# last one is the latest estimate -- confirm against the data set.
city_records = []
for country_node in document.iterfind('country'):
    # Element.getiterator() was removed in Python 3.9; iter() replaces it.
    for city_node in country_node.iter('city'):
        latest_population = np.nan  # stays NaN when the city has no figure
        for population_node in city_node.iter('population'):
            latest_population = population_node.text
        city_records.append((city_node.find('name').text, latest_population))

data = pd.DataFrame(city_records, columns=['city', 'population'])
# Missing values are already NaN, so no '' -> NaN replacement pass is needed
# before the numeric conversion.
data['population'] = data['population'].astype(float)
data.sort_values(by='population', ascending=False).head(10)


Unnamed: 0,city,population
1341,Shanghai,22315474
771,Istanbul,13710512
1527,Mumbai,12442373
479,Moskva,11979529
1340,Beijing,11716620
2810,São Paulo,11152344
1342,Tianjin,11090314
1064,Guangzhou,11071424
1582,Delhi,11034555
1067,Shenzhen,10358381


In [38]:
# 10 ethnic groups with the largest overall populations
# (sum of best/latest estimates over all countries)
#
# For each country, every <ethnicgroup> percentage is applied to the
# country's last-listed <population> figure, and the resulting head counts
# are summed per group name across all countries.
# NOTE(review): "last listed" is taken to be the latest estimate -- confirm
# that the data set orders <population> entries chronologically.
ethnic_totals = {}

for country_node in document.iterfind('country'):
    population_nodes = country_node.findall('population')
    if not population_nodes:
        # Without this guard the previous country's population (or a stale
        # variable from another cell) would silently be reused.
        continue
    country_population = float(population_nodes[-1].text)

    # Element.getiterator() was removed in Python 3.9; iter() replaces it.
    for group_node in country_node.iter('ethnicgroup'):
        group_name = group_node.text
        head_count = float(group_node.attrib['percentage']) * country_population / 100
        ethnic_totals[group_name] = ethnic_totals.get(group_name, 0.0) + head_count

df = pd.Series(ethnic_totals, name='Population').to_frame()

# Show the full magnitude of the totals instead of pandas' default 6 digits.
# This changes a global display option for the rest of the session.
pd.set_option('display.precision', 15)
df.sort_values(by='Population', ascending=False).head(10)

Unnamed: 0,Population
Han Chinese,1245058800.0
Indo-Aryan,871815583.44
European,494872219.7196
African,318325120.369
Dravidian,302713744.25
Mestizo,157734354.937
Bengali,146776916.72
Russian,131856996.077
Japanese,126534212.0
Malay,121993550.374


In [42]:
# name and country of a) longest river, b) largest lake and c) airport at highest elevation

# Map each country's car_code attribute to its name so that the country codes
# stored on river/lake/airport elements can be translated for printing.
dict_country = {
    country_node.attrib['car_code']: country_node.find('name').text
    for country_node in document.iterfind('country')
}


def find_largest(tree, tag, measure, country_names):
    """Find the `tag` element whose `measure` child holds the largest value.

    Parameters
    ----------
    tree : object with an ``iterfind`` method (e.g. the parsed ElementTree)
        Searched for elements named `tag`.
    tag : str
        Element name to scan, e.g. ``'river'``.
    measure : str
        Name of the child element whose text is compared as a float,
        e.g. ``'length'``. Elements without it are treated as NaN.
    country_names : dict
        Maps the codes found in each element's ``country`` attribute to
        country names.

    Returns
    -------
    tuple or None
        ``(name, [country name, ...])`` for the top element, or ``None`` when
        the tree contains no `tag` elements at all.

    Raises
    ------
    KeyError
        If an element lacks the ``country`` attribute, or the top element
        references a code missing from `country_names`.
    """
    rows = []
    for node in tree.iterfind(tag):
        measure_node = node.find(measure)
        rows.append({
            'name': node.find('name').text,
            measure: np.nan if measure_node is None else measure_node.text,
            # split() without an argument tolerates repeated or
            # leading/trailing whitespace between the codes.
            'country': node.attrib['country'].split(),
        })
    if not rows:
        return None

    frame = pd.DataFrame(rows, columns=['name', measure, 'country'])
    frame[measure] = frame[measure].astype(float)
    # NaN values sort last, so elements without a measurement never win
    # unless every element lacks one.
    top = frame.sort_values(by=measure, ascending=False).iloc[0]
    return top['name'], [country_names[code] for code in top['country']]


# One (label, element tag, measured child) triple per question.
for label, tag, measure in (
    ('a) Longest river:', 'river', 'length'),
    ('b) Largest lake:', 'lake', 'area'),
    ('c) Highest Airport:', 'airport', 'elevation'),
):
    result = find_largest(document, tag, measure, dict_country)
    if result is None:
        print(label + ' (no ' + tag + ' elements found)')
        continue
    feature_name, feature_countries = result
    print(label + feature_name)
    # Each name is followed by ', ' (including the last one) so the text
    # matches the output already recorded for this cell.
    print('Country:' + ''.join(name + ', ' for name in feature_countries))



a) Longest river:Amazonas
Country:Colombia, Brazil, Peru, 
b) Largest lake:Caspian Sea
Country:Russia, Azerbaijan, Kazakhstan, Iran, Turkmenistan, 
c) Highest Airport:El Alto Intl
Country:Bolivia, 


80    4063
Name: elevation, dtype: float64