# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [8]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [9]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [13]:
# print names of all countries
for child in document_tree.getroot():
     print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [15]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print('* ' + element.find('name').text + ':',)
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print(capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [6]:
document = ET.parse( './data/mondial_database.xml' )

## Create Dataframe from Elements

In [64]:
import pandas as pd
df = pd.DataFrame(columns=('name','infant_mortality'))
root= document.getroot()

#for element in document_tree.iterfind('country'):
for i in range(0,len(root)):
    element=root[i]
    if(element.tag=='country'):
        #obj = root[i] #.getchildren()[i].getchildren()
        #print(element)
        #print(element.find('infant_mortality'))
        if (element.find('infant_mortality')!=None):
            row = dict(zip(['name','infant_mortality'], [element.find('name').text,  float(element.find('infant_mortality').text) ]))
            row_s = pd.Series(row)
            row_s.name = i
            df = df.append(row_s)


## Ten countries lowest infant mortality rates

In [65]:
df.sort_values(by='infant_mortality',ascending=True).head(10)

Unnamed: 0,name,infant_mortality
38,Monaco,1.81
98,Japan,2.13
117,Bermuda,2.48
36,Norway,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


## Create list of city Populations for the most recent measured date

In [92]:
cities=[];
for element in document.iterfind('country'):
    for subelement in element.getiterator('city'):
        name = subelement.find('name').text
        maxYear=0
        maxPop=0
        for elePop in subelement.getiterator('population'):
            year = int(elePop.attrib['year'])
            pop = int(elePop.text)
            if (year>maxYear): 
                maxPop=pop
                maxYear=year
        cities.append({'city':name, 'year':maxYear, 'pop': maxPop })
        
cities = sorted(cities, key=lambda k: k['pop'], reverse=True)    

## Display Top 10 cities by population

In [94]:
cities[0:10]

[{'city': 'Shanghai', 'pop': 22315474, 'year': 2010},
 {'city': 'Istanbul', 'pop': 13710512, 'year': 2012},
 {'city': 'Mumbai', 'pop': 12442373, 'year': 2011},
 {'city': 'Moskva', 'pop': 11979529, 'year': 2013},
 {'city': 'Beijing', 'pop': 11716620, 'year': 2010},
 {'city': 'São Paulo', 'pop': 11152344, 'year': 2010},
 {'city': 'Tianjin', 'pop': 11090314, 'year': 2010},
 {'city': 'Guangzhou', 'pop': 11071424, 'year': 2010},
 {'city': 'Delhi', 'pop': 11034555, 'year': 2011},
 {'city': 'Shenzhen', 'pop': 10358381, 'year': 2010}]

## Ethinc Groups

In [108]:

countryPop=[];
eg={}
for element in document.iterfind('country'):
    name = element.find('name').text
    maxYear=0
    maxPop=0
    for elePop in element.findall('population'):
        year = int(elePop.attrib['year'])
        pop = int(elePop.text)
        if (year>maxYear): 
            maxPop=pop
            maxYear=year
    
    for elePop in element.findall('ethnicgroup'):
        pct = float(elePop.attrib['percentage'])
        egName = elePop.text
        pop = pct*maxPop
        if (egName not in eg): 
            eg[egName]=0;
        #print(egName)
        eg[egName]+=pop;
    
    countryPop.append({'country':name, 'year':maxYear, 'pop': maxPop })

egList=[];
for k in eg:
    egList.append({'ethnicgroup':k, 'population':eg[k]})
    
        
egList = sorted(egList, key=lambda k: k['population'], reverse=True) 

## Top 10 Ethnic Groups by Population

In [109]:
egList[0:10]

[{'ethnicgroup': 'Han Chinese', 'population': 124505880000.0},
 {'ethnicgroup': 'Indo-Aryan', 'population': 87181558344.0},
 {'ethnicgroup': 'European', 'population': 49487221971.96},
 {'ethnicgroup': 'African', 'population': 31832512036.9},
 {'ethnicgroup': 'Dravidian', 'population': 30271374425.0},
 {'ethnicgroup': 'Mestizo', 'population': 15773435493.7},
 {'ethnicgroup': 'Bengali', 'population': 14677691672.0},
 {'ethnicgroup': 'Russian', 'population': 13185699607.699999},
 {'ethnicgroup': 'Japanese', 'population': 12653421200.0},
 {'ethnicgroup': 'Malay', 'population': 12199355037.4}]

## Create country code lookup table

In [138]:
countryLookup={}
for element in document.iterfind('country'):
    #print (element.attrib['car_code'])
    #print (element.find('name').text)
    countryLookup[element.attrib['car_code']]=element.find('name').text
#countryLookup

## Find Longest River

In [149]:
riverCode=''
lakeCode=''
airportCode=''
longestRiver=''
largestLake=''
highestAirport=''
maxLength=0
maxArea=0;
maxElevation=0
for element in document.iterfind('river'):
    
    if (element.find('length')!=None): 
        length=float(element.find('length').text)
        if (length>maxLength):
            maxLength=length
            longestRiver=element.find('name').text
            riverCode=element.attrib['country']

riverCountries =''           
for k in riverCode.split():
    riverCountries+=countryLookup[k]+', '
print ('The longest river is the ' + str(longestRiver) + ' which is ' + str(maxLength) + 'km long. It is in ' + riverCountries[:-2] +'.')
   
    
    

#r=document.find('river')
#r=r.find('length').text
#r=r.find('name').text
#r=r.find('area').text
#r=r.find('elevation').text
#r=document.find('lake')
#print(r.attrib['country'])
#for child in r:
#    print(child.tag)
#    print(child.attrib)
#    print(child.text)

    

The longest river is the Amazonas which is 6448.0km long. It is in Colombia, Brazil, Peru.


In [151]:
for element in document.iterfind('lake'):
    
    if (element.find('area')!=None): 
        area=float(element.find('area').text)
        if (area>maxArea):
            maxArea=area
            largestLake=element.find('name').text
            lakeCode=element.attrib['country']

lakeCountries =''           
for k in lakeCode.split():
    lakeCountries+=countryLookup[k]+', '
print ('The largest lake is the ' + str(largestLake) + ' which is ' + str(maxArea) + 'km^2. It is in ' + lakeCountries[:-2] +'.')
   
 

The largest lake is the Caspian Sea which is 386400.0km^2. It is in Russia, Azerbaijan, Kazakhstan, Iran, Turkmenistan.


In [160]:
for element in document.iterfind('airport'):
    
    if (element.find('elevation')!=None): 
        #print(element.find('elevation'))
        if (element.find('elevation').text!=None):
            elevation=float(element.find('elevation').text)
            if (elevation>maxElevation):
                maxElevation=elevation
                highestAirport=element.find('name').text
                airportCode=element.attrib['country']

airportCountries =''           
for k in airportCode.split():
    airportCountries+=countryLookup[k]+', '
print ('The highest airport is ' + str(highestAirport) + ' which is ' + str(maxElevation) + 'm high. It is in ' + airportCountries[:-2] +'.')
   

The highest airport is El Alto Intl which is 4063.0m high. It is in Bolivia.
