# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

In [2]:
import pandas as pd

****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [3]:
#parse the Mondial XML file (path is relative to the current working directory)
#into an ElementTree; the later cells query this object
document = ET.parse( './mondial_database.xml' )

In [4]:
#keep a handle on the root element of the parsed tree
#NOTE(review): the cells visible below call document.iterfind(...) directly and never read 'root'
root = document.getroot()

In [5]:
#Exercise 1: 10 countries with the lowest infant mortality rates

#gather one record per country first and build the df once at the end;
#DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every call
mortality_rows = []

#iterate elements in tree document, i.e. countries
for element in document.iterfind('country'):

    #read the country's own <infant_mortality> child exactly once;
    #a missing element or an empty one is recorded as NaN
    mortality_node = element.find('infant_mortality')
    if mortality_node is None or not mortality_node.text:
        inf_mort = float('NaN')
    else:
        inf_mort = float(mortality_node.text)

    mortality_rows.append({'Country': element.find('name').text,
                           'Infant_Mortality': inf_mort})

#explicit column list keeps the column order fixed
infant_mortality = pd.DataFrame(mortality_rows,
                                columns=['Country', 'Infant_Mortality'])

#print out 10 smallest results
print(infant_mortality.nsmallest(10,'Infant_Mortality'))

#as a bonus, print out the 10 largest results
print(infant_mortality.nlargest(10,'Infant_Mortality'))

            Country  Infant_Mortality
38           Monaco              1.81
98            Japan              2.13
36           Norway              2.48
117         Bermuda              2.48
36           Norway              2.48
117         Bermuda              2.48
106       Singapore              2.53
37           Sweden              2.60
10   Czech Republic              2.63
78        Hong Kong              2.73
79            Macao              3.13
44          Iceland              3.15
                      Country  Infant_Mortality
194            Western Sahara            145.82
54                Afghanistan            117.23
189                      Mali            104.34
226                   Somalia            100.14
213  Central African Republic             92.86
230             Guinea-Bissau             90.92
214                      Chad             90.30
192                     Niger             86.27
195                    Angola             79.99
201              Burkina F

In [6]:
#Exercise 2: 10 cities with the largest populations

# Quick check to see what population looks like for countries
# print names of all countries and all measured populations
for country in document.iterfind('country'):
    print("* " + country.find('name').text)

    #findall already returns an empty list when nothing was recorded, so no
    #pre-scan is needed (Element.getiterator was removed in Python 3.9)
    for record in country.findall('population'):
        #.get keeps the listing going when a record carries no 'year' attribute
        print(record.text," - ", record.get('year', ''))
    

* Albania
1214489  -  1950
1618829  -  1960
2138966  -  1970
2734776  -  1980
3446882  -  1990
3249136  -  1997
3304948  -  2000
3069275  -  2001
2800138  -  2011
* Greece
1096810  -  1861
1457894  -  1870
1679470  -  1879
2433806  -  1896
2631592  -  1907
5016889  -  1920
6204684  -  1928
7344860  -  1940
7632801  -  1951
8388553  -  1961
8768372  -  1971
9739589  -  1981
10217335  -  1991
10934097  -  2001
10816286  -  2011
* Macedonia
808724  -  1921
949958  -  1931
1152986  -  1948
1304514  -  1953
1406003  -  1961
1647308  -  1971
1909136  -  1981
2033964  -  1991
1935034  -  1994
2022547  -  2002
2059794  -  2011
* Serbia
6732256  -  1950
7657958  -  1960
8236185  -  1970
9057483  -  1980
9735429  -  1990
7620531  -  1991
7498001  -  2002
7120666  -  2011
* Montenegro
311341  -  1921
360044  -  1931
377189  -  1948
419873  -  1953
471894  -  1961
529604  -  1971
584310  -  1981
615035  -  1991
620145  -  2003
620029  -  2011
* Kosovo
1584440  -  1981
1956196  -  1991
1733872  -  

In [7]:
#function for extracting the latest measure of city population
#input is name of country and the subelement for city of interest
#code finds the city name, population, year of population and formats
#output is a df with country name, city name, population, population year

def pop_finder(country_name, inp_element):
    """Return a one-row DataFrame with the latest population of a city.

    Args:
        country_name: name of the country the city belongs to; copied
            unchanged into the 'Country' column.
        inp_element: the <city> element. It must have a <name> child and may
            have any number of <population> children, each optionally
            carrying a 'year' attribute.

    Returns:
        DataFrame with the columns 'Country', 'City_Name', 'City_Pop' and
        'Pop_Year'. 'City_Pop' and 'Pop_Year' are NaN when the city has no
        <population> child; 'Pop_Year' alone is NaN when the chosen record
        has no usable 'year' attribute.

    Raises:
        AttributeError: if the city has no <name> child.
        TypeError, ValueError: if the chosen record's text is not an integer.
    """
    #find the city_name
    city_name = inp_element.find('name').text

    #pick the record with the highest year instead of trusting document order;
    #undated records rank below dated ones, and ties go to the later record
    latest = None
    latest_rank = None
    for position, record in enumerate(inp_element.findall('population')):
        try:
            year = int(record.get('year'))
        except (TypeError, ValueError):
            #attribute missing (None) or not a number
            year = None
        rank = (year is not None, year if year is not None else 0, position)
        if latest_rank is None or rank > latest_rank:
            latest, latest_rank = record, rank

    if latest is None:
        #no populations were recorded for this city
        city_pop_last = float('NaN')
        city_pop_year = float('NaN')
    else:
        city_pop_last = int(latest.text)
        city_pop_year = latest_rank[1] if latest_rank[0] else float('NaN')

    #one-row frame so callers can stack the results
    return pd.DataFrame({'Country':[country_name],
                         'City_Name':[city_name],
                         'City_Pop':[city_pop_last],
                         'Pop_Year':[city_pop_year]})

In [8]:
# use the 'pop_finder' function
# find and save names of all cities and all measured populations

#cities sit either directly under a country or one level down inside a province
city_paths = ('city', './province/city')

#collect the one-row frames and join them once at the end;
#DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every call
city_frames = []

#iterate over elements in tree document, i.e. countries
for element in document.iterfind('country'):
    country_name = element.find('name').text

    #direct cities first, then province cities
    for path in city_paths:
        for city in element.iterfind(path):
            city_frames.append(pop_finder(country_name, city))

if city_frames:
    city_population = pd.concat(city_frames, ignore_index=True)
else:
    #pd.concat raises on an empty list, so fall back to an empty frame
    city_population = pd.DataFrame()

#print the 10 largest cities in terms of population
print(city_population.nlargest(10,'City_Pop'))

      City_Name    City_Pop Country  Pop_Year
1341   Shanghai  22315474.0   China    2010.0
771    Istanbul  13710512.0  Turkey    2012.0
1527     Mumbai  12442373.0   India    2011.0
479      Moskva  11979529.0  Russia    2013.0
1340    Beijing  11716620.0   China    2010.0
2810  São Paulo  11152344.0  Brazil    2010.0
1342    Tianjin  11090314.0   China    2010.0
1064  Guangzhou  11071424.0   China    2010.0
1582      Delhi  11034555.0   India    2011.0
1067   Shenzhen  10358381.0   China    2010.0


In [9]:
#Exercise 3: 10 ethnic groups with the largest overall populations 
#(sum of best/latest estimates over all countries)

#gather one record per (country, ethnic group) and build the df once at the end;
#DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every call
group_rows = []

for country in document.iterfind('country'):

    #return a list of all ethnic groups in each country
    eth_grp = country.findall('ethnicgroup')
    #return a list of all population records in each country
    pop_num = country.findall('population')

    #a head count needs both a percentage and a total; skip countries missing either
    if not eth_grp or not pop_num:
        continue

    #the last population record in document order is taken as the latest estimate
    latest_total = float(pop_num[-1].text)

    #iterate through each ethnic group
    for group in eth_grp:
        #calculate number of each group (total X %)
        eth_num = round(latest_total*float(group.attrib['percentage'])/100)
        group_rows.append({'Group': group.text, 'Number': eth_num})

eth_groups = pd.DataFrame(group_rows, columns=['Group', 'Number'])

#print the 10 largest "ethnic groups" by population
#grouping by name, sorting by sum within each group
print(eth_groups.groupby('Group').sum().nlargest(10,'Number'))

                 Number
Group                  
Han Chinese  1245058800
Indo-Aryan    871815583
European      494872221
African       318325121
Dravidian     302713744
Mestizo       157734355
Bengali       146776917
Russian       131856994
Japanese      126534212
Malay         121993550


In [61]:
#Exercise 4: Name and country of:
#a. Longest River
#b. Largest Lake
#c. Highest Airport (elevation)

In [87]:
#First find way of getting all country codes and country names
#
#function defines a country code to name translator
#uses single string of country code (multiple codes per string are ok)
#returns a single string of country names, joined by '/' when several codes are given
def country_codename(ctry_code, codename_df, sep=' '):
    """Translate one or more country codes into country names.

    Args:
        ctry_code: string holding one code, or several codes separated by
            'sep'.
        codename_df: DataFrame with a 'Code' column and a 'Name' column.
        sep: separator between the codes inside 'ctry_code'.

    Returns:
        The matching names joined by '/', in the order the codes appear.
        Empty pieces produced by doubled, leading or trailing separators are
        ignored, so a string without any code gives ''.

    Raises:
        IndexError: if a code does not occur in codename_df.
    """
    names = []

    #split returns a one-element list when 'sep' is absent, so single codes need no special case
    for code in ctry_code.split(sep):
        if not code:
            #artifact of a doubled/trailing separator, not a real code
            continue

        matches = codename_df.loc[codename_df.Code == code, 'Name']
        if matches.empty:
            #same exception type as a bare .iloc[0], but naming the offending code
            raise IndexError('unknown country code: {!r}'.format(code))

        #first match wins if a code is listed more than once
        names.append(matches.iloc[0])

    return '/'.join(names)
            

In [88]:
#function to find a designated parameter of a designated feature
#returns entire dataframe of results w country names (unless otherwise suppressed)
#assumes parameter is a subelement and is numeric (can be suppressed)

def parameter_finder(feature, parameter, source_doc, ctry_info=True, feat_numeric=True):
    """Tabulate one parameter for every feature of a given kind.

    Args:
        feature: tag of the elements to scan, e.g. 'river'.
        parameter: tag of the direct child whose text is reported, e.g.
            'length'.
        source_doc: parsed document queried with iterfind().
        ctry_info: when True, add a 'Country' column by translating each
            feature's 'country' attribute with country_codename(), using the
            name / 'car_code' pairs of the document's <country> elements.
        feat_numeric: when True, convert the parameter text to float;
            otherwise keep it as text.

    Returns:
        DataFrame with one row per feature and the columns
        '<feature> name', '<feature> <parameter>' and, if requested,
        'Country'. A missing or blank parameter becomes NaN (the string
        'NaN' when feat_numeric is False). The columns are present even when
        no feature matches.

    Raises:
        ValueError: if feat_numeric is True and the text is not a number.
        KeyError: if ctry_info is True and a feature has no 'country'
            attribute or a country has no 'car_code' attribute.
    """
    feat_name_header = feature + ' ' + 'name'
    feat_param_header = feature + ' ' + parameter
    headers = [feat_name_header, feat_param_header]

    if ctry_info:
        headers.append('Country')
        #build the code -> name table in one step instead of growing a frame row by row
        ctry_code_name_df = pd.DataFrame(
            [{'Name': country.find('name').text,
              'Code': country.attrib['car_code']}
             for country in source_doc.iterfind('country')],
            columns=['Name', 'Code'])

    #drop a repeated header (parameter == 'name') while keeping the order
    columns = list(dict.fromkeys(headers))

    rows = []
    for element in source_doc.iterfind(feature):

        #read the direct child once; scanning all descendants would pick up
        #the same tag nested deeper and glue the text together repeatedly
        param_node = element.find(parameter)
        param_text = param_node.text if param_node is not None else None
        if param_text is None or not param_text.strip():
            param_text = 'NaN'

        #honor feat_numeric regardless of ctry_info
        feat_param = float(param_text) if feat_numeric else param_text

        row = {feat_name_header: element.find('name').text,
               feat_param_header: feat_param}

        if ctry_info:
            #the 'country' attribute is only needed when names are requested
            row['Country'] = country_codename(element.attrib['country'],
                                              ctry_code_name_df)

        rows.append(row)

    return pd.DataFrame(rows, columns=columns)

In [89]:
#4a Longest River
#tabulate the length of every river, then show the single row with the greatest length
river_lengths = parameter_finder(feature='river',
                                 parameter='length',
                                 source_doc=document)
print(river_lengths.nlargest(n=1, columns='river length'))

                  Country  river length river name
174  Colombia/Brazil/Peru        6448.0   Amazonas


In [90]:
#4b Largest Lake (the data set apparently files 'Seas' under 'Lakes' as well)
#tabulate the area of every lake, then show the single row with the greatest area
lake_areas = parameter_finder(feature='lake',
                              parameter='area',
                              source_doc=document)
print(lake_areas.nlargest(n=1, columns='lake area'))

                                           Country  lake area    lake name
54  Russia/Azerbaijan/Kazakhstan/Iran/Turkmenistan   386400.0  Caspian Sea


In [91]:
#4c Highest Airport Elevation
#tabulate the elevation of every airport, then show the single row with the greatest elevation
airport_elev = parameter_finder(feature='airport',
                                parameter='elevation',
                                source_doc=document)
print(airport_elev.nlargest(n=1, columns='airport elevation'))

    Country  airport elevation  airport name
80  Bolivia             4063.0  El Alto Intl
