# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET
import operator
import pandas as pd
%matplotlib inline

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':',)
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
document = ET.parse( './data/mondial_database.xml' )

In [6]:
for childs in document_tree.getroot():
    print (childs.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


# 10 countries with the lowest infant mortality rates

In [7]:
infant_data = {}
for country in document.iterfind('country'):
    if country.find('infant_mortality') is None:
        pass
    else: 
        name = country.find('name').text
        im = country.find('infant_mortality').text
        infant_data[name] = float(im)
df = pd.DataFrame()
df['Country'] = infant_data.keys()
df['Rate'] = infant_data.values()
#print (sorted_dict[:10])
df = df.sort_values(['Rate'], ascending=[True])
df[:10]

Unnamed: 0,Country,Rate
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
72,Hong Kong,2.73
73,Macao,3.13
39,Iceland,3.15


# 10 cities with the largest population

In [8]:
cityname = ""
citypopulation = ""
df2 = pd.DataFrame(columns=['cityname', 'Population'])
df2['Population'] = df2['Population'].astype(float)
for country in document.iterfind('country'):
    for city in country.iter('city'): 
        cityname = city.find('name').text
        year = int(0)
        for node in city.iterfind('population'):
            year = node.attrib['year'] 
            if node.attrib['year'] >= year:
                citypopulation = int(node.text)
        df2.loc[len(df2)] = [cityname,citypopulation]
        cityname = ""

df2 = df2.sort_values(['Population'], ascending=False)
df2[:10]

Unnamed: 0,cityname,Population
1341,Shanghai,22315474.0
771,Istanbul,13710512.0
1527,Mumbai,12442373.0
479,Moskva,11979529.0
1340,Beijing,11716620.0
2810,São Paulo,11152344.0
1342,Tianjin,11090314.0
1064,Guangzhou,11071424.0
1582,Delhi,11034555.0
1067,Shenzhen,10358381.0


# 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [9]:
ethinicdata = {}
for country in document.iterfind('country'):
    if country.find('./ethnicgroup[1][@percentage]') is None:
        pass
    else:
        lastpop = int(country.find('./population[last()]').text)
        ethnic = country.find('./ethnicgroup[1]')
        ethnicname = ethnic.text
        ethicperc = float(ethnic.get('percentage'))/100
        ethinicdata[ethnicname] = lastpop * ethicperc

ethinic_df = pd.DataFrame()
ethinic_df['Ethinicity'] = ethinicdata.keys()
ethinic_df['Population'] = ethinicdata.values()
ethinic_df['Population'] = ethinic_df['Population'].astype(float)
ethinic_df = ethinic_df.sort_values(['Population'], ascending=[False])
ethinic_df[:10]


Unnamed: 0,Ethinicity,Population
39,Han Chinese,1245059000.0
48,Dravidian,302713700.0
46,Bengali,146776900.0
61,Japanese,126534200.0
18,Russian,114646200.0
59,Javanese,113456000.0
53,Viet/Kinh,76078380.0
26,Turkish,63935390.0
37,English,53592330.0
6,Mediterranean Nordic,46815920.0


# name and country of  a) longest river

In [10]:
riverdata = {}
for river in document.iterfind('river'):
    name = river.get('id')
    country = river.get('country')
    length = river.find('./length')
    if length is None:
        pass
    else:
        #print(float(length.text))
        riverdata[name, country]=float(length.text)
    
sorted_d = sorted(riverdata.items(), key=operator.itemgetter(1), reverse=True)
sorted_d[:1]

[(('river-Amazonas', 'CO BR PE'), 6448.0)]

# b) largest lake

In [11]:
lakedata = {}
for lake in document.iterfind('lake'):
    name = lake.get('id')
    country = lake.get('country')
    area = lake.find('./area')
    if area is None:
        pass
    else:
        #print float(area.text)
        lakedata[name, country]=float(area.text)

sorted_d = sorted(lakedata.items(), key=operator.itemgetter(1), reverse=True)
sorted_d[:1]


[(('lake-KaspischesMeer', 'R AZ KAZ IR TM'), 386400.0)]

# c) airport at highest elevation

In [12]:
airportdata = {}
for airport in document.iterfind('airport'):
    name = airport.get('iatacode')
    country = airport.get('country')
    elevation = airport.findtext('./elevation')
    if elevation is None:
        pass
    elif elevation=='':
        pass
    else:
        #print(float(elevation))
        airportdata[name, country]=float(elevation)

sorted_d = sorted(airportdata.items(), key=operator.itemgetter(1), reverse=True)
sorted_d[:1]    


[(('LPB', 'BOL'), 4063.0)]