# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [137]:
import numpy as np
import pandas as pd
document = ET.parse( './data/mondial_database.xml' )
#Make some empty lists to store things
country = []
inf_mort = []
city = []
city_pop = []
for element in document.iterfind('country'):                            #Iterate through the countries
    country.append(element.find('name').text)                           #Store each name
    if element.find('infant_mortality') is not None:
         inf_mort.append(element.find('infant_mortality').text)         #Store each infant mortality
    else:
        inf_mort.append(np.nan)
    for subelement in element.getiterator('city'):                      #Iterate through the cities
        city.append(subelement.find('name').text)                       #Store the city name
        if len(subelement.findall('population')):                       #Store the city population from the latest entry
            city_pop.append(subelement.findall('population')[-1].text)
        else:
            city_pop.append(np.nan)
 
        

In [96]:
#Question 1: 10 countries with lowest infant mortality rates
data = {'country': country, 'inf_mort': inf_mort}               #Make a data dict
mortality = pd.DataFrame(data)                                  #Turn the dict into a dataframe
mortality['inf_mort'] = mortality['inf_mort'].astype('float16') #Change inf_mortality from 'object' to a float
mortality.sort('inf_mort').head(10)                             #Sort by inf_mortalilty and return the top ten

Unnamed: 0,country,inf_mort
38,Monaco,1.80957
98,Japan,2.130859
117,Bermuda,2.480469
36,Norway,2.480469
106,Singapore,2.529297
37,Sweden,2.599609
10,Czech Republic,2.630859
78,Hong Kong,2.730469
79,Macao,3.130859
44,Iceland,3.150391


In [146]:
#Question 2: 10 cities with the largest population
data = {'city': city, 'city_pop': city_pop}                     #Make a data dict
pop = pd.DataFrame(data)                                        #Turn the dict into a dataframe
pop['city_pop'] = pop['city_pop'].astype('float32')             #Change city_pop from 'object' to a float
pop.sort('city_pop', ascending = False).head(10)                #Sort by city_pop in descending order and return the top ten

Unnamed: 0,city,city_pop
1341,Shanghai,22315474
771,Istanbul,13710512
1527,Mumbai,12442373
479,Moskva,11979529
1340,Beijing,11716620
2810,São Paulo,11152344
1342,Tianjin,11090314
1064,Guangzhou,11071424
1582,Delhi,11034555
1067,Shenzhen,10358381


In [179]:
#Question 3: 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
eth_dict = {}
#Populate an ethnicity dictionary with groups as names and populations as values
for element in document.iterfind('country'):               #For each country
    pop = float(element.findall('population')[-1].text)    #Get the population of the country
    for group in element.findall('ethnicgroup'):           #For each ethnic group
        percent = float(group.attrib['percentage']) / 100  #Get the percent of the poulation of that group
        name = group.text                                  #Get the name of the group
        group_pop = pop * percent                          #Get the population of the group
        if name in eth_dict:                               #Add the population to the dictionary
            eth_dict[name] = eth_dict[name] + group_pop
        else:
            eth_dict[name] = group_pop

eth_pop = pd.Series(eth_dict).order(ascending = False)    #Convert the dictionary to a series and sort it in descending order
eth_pop[0:10]                                             #Return the first ten elements

Han Chinese    1.245059e+09
Indo-Aryan     8.718156e+08
European       4.948722e+08
African        3.183251e+08
Dravidian      3.027137e+08
Mestizo        1.577344e+08
Bengali        1.467769e+08
Russian        1.318570e+08
Japanese       1.265342e+08
Malay          1.219936e+08
dtype: float64

In [181]:
#Question 4: name and country of a) longest river, b) largest lake and c) airport at highest elevation
#Make a country code dictionary with country code keys and country name values
country_code = {}

for element in document.iterfind('country'):
    country_code[element.attrib['car_code']] = element.find('name').text 

In [198]:
#a) longest river
name = []
country = []
length = []

for element in document.iterfind('river'):                   #Iterate through rivers
    name.append(element.find('name').text)                   #Store the name
    countries = element.attrib['country']                    #Get the country codes string
    country_str = ''                                         #Start a string for the country
    for word in countries.split():                           #Go through the country codes
        country_str += country_code[word] + ', '             #Add to the country string
    country.append(country_str[:-2])                         #Store the country string
    if element.find('length') is not None:                   #Store the length
        length.append(float(element.find('length').text))
    else:
        length.append(np.nan)

data = {'name': name, 'countries': country, 'length': length} #Make a data dict
rivers = pd.DataFrame(data)                                   #Convert to a dataframe
rivers.sort('length', ascending = False).head(1)              #Sort and get the longest river

Unnamed: 0,countries,length,name
174,"Colombia, Brazil, Peru",6448,Amazonas


In [200]:
#b) largest lake
name = []
country = []
area = []

for element in document.iterfind('lake'):                    #Iterate through the lakes
    name.append(element.find('name').text)                   #Store the name
    countries = element.attrib['country']                    #Get the country codes string
    country_str = ''                                         #Start a string for the country
    for word in countries.split():                           #Go through the country codes
        country_str += country_code[word] + ', '             #Add to the country string
    country.append(country_str[:-2])                         #Store the country string
    if element.find('area') is not None:                     #Store the area
        area.append(float(element.find('area').text))
    else:
        area.append(np.nan)

data = {'name': name, 'countries': country, 'area': area}     #Make a data dict
lakes = pd.DataFrame(data)                                    #Convert to a dataframe
lakes.sort('area', ascending = False).head(1)                 #Sort and get the largest lake

Unnamed: 0,area,countries,name
54,386400,"Russia, Azerbaijan, Kazakhstan, Iran, Turkmeni...",Caspian Sea


In [203]:
#c) airport at highest elevation
name = []
country = []
elevation = []

for element in document.iterfind('airport'):                 #Iterate through the airports
    name.append(element.find('name').text)                   #Store the name
    code = element.attrib['country']                         #Get the country code
    country.append(country_code[code])                       #Store the country 
    if element.find('elevation') is not None:                #Store the elevation
        if element.find('elevation').text is not None:
            elevation.append(float(element.find('elevation').text))
        else:
            elevation.append(np.nan)
    else:
        elevation.append(np.nan)

data = {'name': name, 'country': country, 'elevation': elevation} #Make a data dict
airports = pd.DataFrame(data)                                     #Convert to a dataframe
airports.sort('elevation', ascending = False).head(1)             #Sort and get the largest lake

Unnamed: 0,country,elevation,name
80,Bolivia,4063,El Alto Intl
