# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [22]:
from xml.etree import ElementTree as ET
import xml
import pandas as pd

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [9]:
# Parse the example XML file into an ElementTree; the example cells below
# read from `document_tree`.
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [10]:
# Walk the direct children of the root element and print the text of each
# child's <name> element, one per line.
root = document_tree.getroot()
for country in root:
    name_node = country.find('name')
    print(name_node.text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [11]:
# Print each country's name followed by a comma-separated list of the names
# of all <city> elements found anywhere beneath it.
for element in document_tree.iterfind('country'):
    print('* ' + element.find('name').text + ':')
    # Element.getiterator() was removed in Python 3.9; Element.iter() is the
    # supported way to walk every descendant with a given tag.
    city_names = [city.find('name').text for city in element.iter('city')]
    # join() yields '' for a country without cities, matching the old
    # "build with trailing ', ' then slice it off" behaviour.
    print(', '.join(city_names))

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [7]:
# Parse the exercise XML file into an ElementTree; the exercise cells below
# read from `document`.
document = ET.parse( './data/mondial_database.xml' )

## 1.

In [98]:
# Print the first 10 countries (in document order) that report an infant
# mortality rate. This is an unsorted preview, not the 10 lowest rates.
count = 0
for element in document.iterfind('country'):
    rate = element.find('infant_mortality')
    if rate is None:
        # Country has no <infant_mortality> child; nothing to show.
        continue
    print(element.find('name').text, rate.text)
    count += 1
    if count == 10:
        # Ten rows printed; no need to scan the rest of the document.
        break

Albania 13.19
Greece 4.78
Macedonia 7.9
Serbia 6.16
Andorra 3.69
France 3.31
Spain 3.33
Austria 4.16
Czech Republic 2.63
Germany 3.46


In [143]:
# Collect [country name, infant mortality rate] for every country that
# reports a rate, load the rows into a DataFrame, and show the 10 lowest.
infmorlist = []
for element in document.iterfind('country'):
    rate = element.find('infant_mortality')
    if rate is not None:
        # Convert to float here so the sort below is numeric, not lexical.
        infmorlist.append([element.find('name').text, float(rate.text)])

df = pd.DataFrame(infmorlist, columns=['country', 'infant_mortality'])
df = df.sort_values('infant_mortality')
df.head(10)

Unnamed: 0,country,infant_mortality
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
72,Hong Kong,2.73
73,Macao,3.13
39,Iceland,3.15


## 2. 

In [91]:
# Collect [city name, population] for every city (at any depth below a
# country) that has at least one <population> child, then show the 10 largest.
cityList = []
for element in document.iterfind('country'):
    # Element.getiterator() was removed in Python 3.9; Element.iter() is the
    # supported way to walk every descendant <city>.
    for city in element.iter('city'):
        populations = city.findall('population')
        if populations:
            # The last <population> child is taken as the figure to use.
            # NOTE(review): this assumes entries are listed oldest-to-newest;
            # confirm against the data or select by the 'year' attribute.
            cityList.append([city.find('name').text, int(populations[-1].text)])

citydf = pd.DataFrame(cityList, columns=['city', 'latestPop']).sort_values('latestPop', ascending=False)
citydf.head(10)

Unnamed: 0,city,latestPop
1251,Shanghai,22315474
707,Istanbul,13710512
1421,Mumbai,12442373
443,Moskva,11979529
1250,Beijing,11716620
2594,São Paulo,11152344
1252,Tianjin,11090314
974,Guangzhou,11071424
1467,Delhi,11034555
977,Shenzhen,10358381


## 3.

In [152]:
# Accumulate an estimated head count per ethnic group across all countries
# (group percentage x the country's last listed population), then turn the
# totals into a DataFrame and show the 10 largest groups.
ethdict = {}
for element in document.iterfind('country'):
    populations = element.findall('population')
    if not populations:
        # Without a population figure there is nothing to scale the
        # percentages by. Skipping here also avoids the IndexError the
        # unguarded `[-1]` lookup raised for groups already in the dict.
        continue
    # Loop-invariant for this country, so parse it once.
    # NOTE(review): assumes the last <population> entry is the best/latest
    # estimate; confirm against the data.
    latest_population = int(populations[-1].text)
    # Element.getiterator() was removed in Python 3.9; use Element.iter().
    for ethnicgroup in element.iter('ethnicgroup'):
        share = float(ethnicgroup.get('percentage')) / 100
        ethdict[ethnicgroup.text] = ethdict.get(ethnicgroup.text, 0) + share * latest_population


ethdf = pd.DataFrame({'ethnic group': list(ethdict.keys()), 'population': list(ethdict.values())})
# Head counts are reported as whole people; astype(int) truncates the fraction.
ethdf['population'] = ethdf['population'].astype(int)
ethdf.sort_values('population', ascending=False).head(10)

Unnamed: 0,ethnic group,population
80,Han Chinese,1245058800
106,Indo-Aryan,871815583
128,European,494872219
16,African,318325120
105,Dravidian,302713744
150,Mestizo,157734354
98,Bengali,146776916
33,Russian,131856996
139,Japanese,126534212
110,Malay,121993550


## 4.

In [183]:
# Start each result as its own empty dict; the real values are assigned
# further down in this cell.
longestRiver, largestLake, highestAirport = {}, {}, {}

def findMost(objOfInt, characteristic, tree=None):
    """Return the ``objOfInt`` element with the largest ``characteristic``.

    Parameters
    ----------
    objOfInt : str
        Tag (or path) handed to ``iterfind`` to select candidate elements,
        e.g. ``'river'``.
    characteristic : str
        Tag of the child element whose text holds the number to maximise,
        e.g. ``'length'``.
    tree : optional
        Any object exposing ``iterfind``. Defaults to the module-level
        ``document`` so existing two-argument calls keep working.

    Returns
    -------
    dict
        ``{'name': ..., 'countries': ..., characteristic: value}`` for the
        winning element, where ``'countries'`` is the element's ``country``
        attribute. ``value`` is an ``int`` when the number is whole and a
        ``float`` otherwise. An empty dict is returned when no candidate
        carries a usable value. On ties the first element encountered wins.

    Raises
    ------
    ValueError
        If a candidate's characteristic text is not a valid number.
    """
    if tree is None:
        tree = document

    best = {}
    for element in tree.iterfind(objOfInt):
        node = element.find(characteristic)
        # Skip candidates with no such child or an empty one (e.g. <length/>),
        # including when that candidate is the very first element seen.
        if node is None or node.text is None:
            continue
        # Parse as float so decimal values such as "25.5" neither crash nor
        # get truncated before being compared.
        value = float(node.text)
        if best and value <= best[characteristic]:
            continue
        best = {
            'name': element.find('name').text,
            'countries': element.get('country'),
            # Keep whole numbers as int so results print as e.g. 6448.
            characteristic: int(value) if value.is_integer() else value,
        }
    return best

# Look up the river with the greatest length, the airport with the greatest
# elevation and the lake with the greatest area, then print the three
# results in that order.
longestRiver = findMost('river', 'length')
highestAirport = findMost('airport', 'elevation')
largestLake = findMost('lake', 'area')

for record in (longestRiver, highestAirport, largestLake):
    print(record)
            


{'name': 'Amazonas', 'countries': 'CO BR PE', 'length': 6448}
{'name': 'El Alto Intl', 'countries': 'BOL', 'elevation': 4063}
{'name': 'Caspian Sea', 'countries': 'R AZ KAZ IR TM', 'area': 386400}
