# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [14]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [15]:
document_tree = ET.parse('Data_Wrangling/data_wrangling_xml/data/mondial_database_less.xml' )

In [16]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [17]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [42]:
from xml.etree import ElementTree as ET
import pandas as pd
import pprint

#convert xml structure to pandas dataframe using same method as with JSON
xml_data = open('Data_Wrangling/data_wrangling_xml/data/mondial_database.xml').read()
def xml2df(xml_data):
    root = ET.XML(xml_data) #element tree hence ET
    all_records = [] #record list to convert to df
    for i, child in enumerate(root): #loop through ET
        record = {} #place holder for record
        for subchild in child: #iterate through subchildren
            record[subchild.tag] = subchild.text #extract text to create key, value
            all_records.append(record) #append to all_records
    return pd.DataFrame(all_records) #return records

output_records = xml2df(xml_data)
# 10 countries with lowest infant mortality rates
target_infant_mortality = output_records.groupby('infant_mortality').min()
print target_infant_mortality.name.head(10) #gives lowest(min()) mortality rates ascending

# 10 cities with largest population
#got key error
#target_population =  output_records.groupby(['population year','population']).agg('max', 'max')
target_population = output_records.groupby('population').max()
print target_population.city.head(10)

#10 ethnic groups with largest overall populations(sum of best/latest estimates over all countries
output_records['ethnicpopulation'] = output_records.population * output_records['ethnicgroup percentage'] * .001

target_ethnicgroup =  output_records.groupby(['ethnicgroup'])['ethnicpopulation'].sum()
    
print target_ethnicgroup.head(10)

#different method doesn't work
#document = ET.parse('Data_Wrangling/data_wrangling_xml/data/mondial_database.xml' )
#root = document_tree.getroot()
#for max(country.infant_mortality), country.name in root.iter('country'):
#    print country.name, country.infant_mortality

infant_mortality
1.81                   Monaco
10.16                 Romania
10.2                     Fiji
10.48                  Brunei
10.5                  Grenada
10.59               Mauritius
10.7                   Panama
10.77              Seychelles
10.92    United Arab Emirates
10.93                Barbados
Name: name, dtype: object


KeyError: 'population year'

In [64]:
from bs4 import BeautifulSoup
from xml.etree import ElementTree as ET
from lxml import etree

document = ET.parse('Data_Wrangling/data_wrangling_xml/data/mondial_database.xml' )

#root = document.getroot()
soup = BeautifulSoup(open('Data_Wrangling/data_wrangling_xml/data/mondial_database.xml'), 
                          'r')
soup.find_all('name', 'infant_mortality')

#for infant_mortality, name in root.iter('country'):
 #   if infant_mortality == max(infant_mortality):
  #      print name, infant_mortality

FeatureNotFound: Couldn't find a tree builder with the features you requested: r. Do you need to install a parser library?

In [109]:
from bs4 import BeautifulSoup
from xml.etree import ElementTree as ET


output = {}

mondial_tree = ET.parse('Data_Wrangling/data_wrangling_xml/data/mondial_database.xml')
root = mondial_tree.getroot()

for country in root.findall('country'):
    name = country.find('name').text
    print name, country.find('infant_mortality')

Albania <Element 'infant_mortality' at 0x7f50bf8ae1d0>
Greece <Element 'infant_mortality' at 0x7f50c7a56790>
Macedonia <Element 'infant_mortality' at 0x7f50bdf9f290>
Serbia <Element 'infant_mortality' at 0x7f50c7d47910>
Montenegro None
Kosovo None
Andorra <Element 'infant_mortality' at 0x7f50c7a10d10>
France <Element 'infant_mortality' at 0x7f50c0976290>
Spain <Element 'infant_mortality' at 0x7f50c18ed990>
Austria <Element 'infant_mortality' at 0x7f50c9090590>
Czech Republic <Element 'infant_mortality' at 0x7f50bfeadad0>
Germany <Element 'infant_mortality' at 0x7f50c1d8c810>
Hungary <Element 'infant_mortality' at 0x7f50c19d94d0>
Italy <Element 'infant_mortality' at 0x7f50bf820ed0>
Liechtenstein <Element 'infant_mortality' at 0x7f50bd856ad0>
Slovakia <Element 'infant_mortality' at 0x7f50bd85f450>
Slovenia <Element 'infant_mortality' at 0x7f50bd87d650>
Switzerland <Element 'infant_mortality' at 0x7f50bfc50390>
Belarus <Element 'infant_mortality' at 0x7f50bd256390>
Latvia <Element 'infant