# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [2]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

SyntaxError: invalid syntax (<ipython-input-4-71a7702f86c3>, line 3)

In [None]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [None]:
document = ET.parse( './data/mondial_database.xml' )

In [36]:
import pandas as pd
countryname = ""
mortalityrate = ""
document = ET.parse( './data/mondial_database.xml' )
df = pd.DataFrame(columns=['Country_name','Infant_mortality']) #create data frame to hold country name and its mortality rate

#loop through country elements to find country name and its mortality rate
for country in document.findall( 'country' ):
    for node in country.getiterator(): 
        if node.tag == 'name': #find country name tag
            if countryname == "":
                countryname = node.text
        if node.tag == 'infant_mortality': #find mortality rate tag
            mortalityrate = float(node.text)
    df.loc[len(df)] = [countryname,mortalityrate] #add country name and mortality rate to data frame
    countryname = ""
    mortalityrate = "1000"

df['Infant_mortality']=df['Infant_mortality'].astype(float)
#sort data frame and find top 10 countries with lowest mortalitiy rates
df.sort_values(by='Infant_mortality',ascending=True).head(10)

Unnamed: 0,Country_name,Infant_mortality
38,Monaco,1.81
98,Japan,2.13
117,Bermuda,2.48
36,Norway,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


In [43]:
#10 cities with the largest population
import pandas as pd
cityname = ""
citypopulation = 0
document = ET.parse( './data/mondial_database.xml' )
df = pd.DataFrame(columns=['CityName','Population']) #create data frame to hold country name and its popuplation

#loop through country element to find city name and its population
for country in document.iterfind( 'country' ):
    for city in country.iter('city'): #find all cities within each country element
        cityname = city.find('name').text
        year = int(0)
        for node in city.iterfind('population'): #find all population elements with each city
            year = node.attrib['year'] #there are multiple population elements with different 'year' attribute
            if node.attrib['year'] >= year: #store the population number of the latest year
                citypopulation = node.text
        df.loc[len(df)] = [cityname,citypopulation] #add city name and its population to data frame
        cityname = ""
        citypopulation = 0
df['Population']=df['Population'].astype(int)
#sort data frame to find 10 cities with largest population
df.sort_values('Population', ascending=False).head(10)

Unnamed: 0,CityName,Population
1341,Shanghai,22315474
771,Istanbul,13710512
1527,Mumbai,12442373
479,Moskva,11979529
1340,Beijing,11716620
2810,São Paulo,11152344
1342,Tianjin,11090314
1064,Guangzhou,11071424
1582,Delhi,11034555
1067,Shenzhen,10358381


In [None]:
# 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [47]:
import pandas as pd
import numpy as num
countryname = ""
countrypopulation = 0
ethnicpopulation = 0
document = ET.parse( './data/mondial_database.xml' )
df = pd.DataFrame(columns=['Country','EthnicGroup','Population'])

#loop through country element to find ethnic groups and its population
for country in document.iterfind( 'country' ):
    countryname = country.find('name').text #find country name
    year = int(0)
    countrycpopulation = 0
    for node in country.iterfind('population'): #find population of the country
        year = node.attrib['year']
        if node.attrib['year'] >= year: #find population of the latest year
            countrypopulation = node.text
    ethnicname = None
    ethnicpopulation = 0
    for ethnic in country.iter('ethnicgroup'): #find all ethnic groups within the same country
        ethnicname = ethnic.text
        #compute each ethnic population: country population * ethnic group percentage
        ethnicpopulation = round(float(ethnic.attrib['percentage']) * 0.01 * int(countrypopulation))
        if ethnicname == None:
            ethnicname = countryname
            ethnicpopulation = countrypopulation
        df.loc[len(df)] = [countryname,ethnicname,ethnicpopulation] #store ethnic group population to data frame
    countryname = ""

#group ethnic group across all countries and sum them up to find top 10 ethnic groups and its total population
df.groupby('EthnicGroup').sum().sort('Population', ascending=False).head(10)



Unnamed: 0_level_0,Population
EthnicGroup,Unnamed: 1_level_1
Han Chinese,1245059000.0
Indo-Aryan,871815600.0
European,494872200.0
African,318325100.0
Dravidian,302713700.0
Mestizo,157734400.0
Bengali,146776900.0
Russian,131857000.0
Japanese,126534200.0
Malay,121993600.0


In [None]:
#name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [48]:
import pandas as pd
from xml.etree import ElementTree as ET
document = ET.parse( './data/mondial_database.xml' )

#loop through river element to river name, its country code(within attribute 'source') and its length
dfriver = pd.DataFrame(columns=['RiverName','Country','Length'])
for river in document.iter('river'):
    for riverlength in river.iterfind('length'):
        riverlength = float(riverlength.text)
#####????? why there is an error for a = river.find('length').text >>> AttributeError: 'NoneType' object has no attribute 'attrib'
    dfriver.loc[len(dfriver)] = [river.find('name').text, river.find('source').attrib['country'],riverlength]

#loop through lake element to find its name, country code and its size
dflake = pd.DataFrame(columns=['LakeName','Country','Area'])
for lake in document.iter('lake'):
    for lakearea in lake.iterfind('area'):
        lakearea = float(lakearea.text)
    dflake.loc[len(dflake)] = [lake.find('name').text, lake.attrib['country'],lakearea]

#loop through airport element to find its name, country code and elevation
dfair = pd.DataFrame(columns=['Airport','Country','Elevation'])
for airport in document.iter('airport'):
    for airporteleva in airport.iterfind('elevation'):
        airporteleva = airporteleva.text
        if airporteleva == None:
            airporteleva = 0
    dfair.loc[len(dfair)] = [airport.find('name').text, airport.attrib['country'],float(airporteleva)]

#create a data frame with country code and its corresponding country name
dfcountry = pd.DataFrame(columns=['Country','countryname'])
for country in document.iterfind('country'):
    dfcountry.loc[len(dfcountry)] = [country.attrib['car_code'],country.find('name').text]
#merge country name data drame with river, lake and airport dataframe to associate country code and country name on the same row
dfriver = pd.merge(dfriver,dfcountry,on='Country')
dflake = pd.merge(dflake,dfcountry,on='Country')
dfair = pd.merge(dfair,dfcountry,on='Country')

#sort to find the top 10 longest river, largest lake and highest airport
dfriver.sort('Length',ascending=False).head(10)
dflake.sort('Area',ascending=False).head(10)
dfair.sort('Elevation',ascending=False).head(10)



Unnamed: 0,Airport,Country,Elevation,countryname
80,El Alto Intl,BOL,4063.0,Bolivia
219,Lhasa-Gonggar,CN,4005.0,China
241,Yushu Batang,CN,3963.0,China
813,Juliaca,PE,3827.0,Peru
815,Teniente Alejandro Velasco Astete Intl,PE,3311.0,Peru
82,Juana Azurduy De Padilla,BOL,2905.0,Bolivia
334,Mariscal Sucre Intl,EC,2813.0,Ecuador
805,Coronel Fap Alfredo Mendivil Duarte,PE,2719.0,Peru
807,Mayor General FAP Armando Revoredo Iglesias Ai...,PE,2677.0,Peru
692,Licenciado Adolfo Lopez Mateos Intl,MEX,2581.0,Mexico


  if __name__ == '__main__':


TypeError: unorderable types: float() > str()