In [1]:
from xml.etree import ElementTree as ET

In [2]:
#Absolute local path to the Mondial database XML file (machine-specific; adjust when running elsewhere)
mondial_xml_path = '/Users/jessica/Documents/data_wrangling_xml/data/mondial_database.xml'
#Parse the document once; the analysis cells below all query this tree
document_tree = ET.parse(mondial_xml_path)
document_tree

<xml.etree.ElementTree.ElementTree at 0x10c39cf50>

In [3]:
#Find the 10 countries with the lowest infant mortality rates
import pandas as pd
#Maps country name -> infant mortality rate
answer1_dict = {}
#Visit every <country> element directly under the document root.
#A country that has no <infant_mortality> subelement is left out of the dictionary.
for country_el in document_tree.iterfind('country'):
    rate_el = country_el.find('infant_mortality')
    if rate_el is None:
        continue
    #Key is the text of the country's <name> subelement, value is the rate parsed as a float
    answer1_dict[country_el.find('name').text] = float(rate_el.text)
#Turn the (name, rate) pairs into a two-column dataframe
df = pd.DataFrame(answer1_dict.items(), columns=['country', 'infant_mortality_rate'])
#An ascending sort puts the lowest rates first; display the first 10 rows
df.sort_values(by='infant_mortality_rate', ascending=True).head(10)

Unnamed: 0,country,infant_mortality_rate
34,Monaco,1.81
210,Japan,2.13
71,Norway,2.48
64,Bermuda,2.48
76,Singapore,2.53
106,Sweden,2.6
55,Czech Republic,2.63
143,Hong Kong,2.73
52,Macao,3.13
189,Iceland,3.15


In [4]:
#10 cities with the largest population
#Accumulates one [id, name, population] row per city.
#Deliberately NOT named `list`: rebinding that name would hide the builtin list type
#from every cell executed afterwards in the same kernel.
city_rows = []
#'.//city' matches <city> elements at any depth of the document
for city in document_tree.findall('.//city'):
    name = city.find('name')
    if name is not None:
        name = name.text
    #TODO: only the figure tagged year="2011" is read; still need to fall back to the last year listed
    population = city.find('population[@year="2011"]')
    if population is not None:
        population = int(population.text)
    #A city without a 2011 figure is still recorded, with population left as None
    city_rows.append([city.attrib['id'], name, population])
#Convert the rows to a dataframe with column headers, then sort descending to show the 10 largest
df = pd.DataFrame(city_rows, columns=['id', 'name', 'population_2011'])
df.sort_values('population_2011', ascending=False).head(10)

Unnamed: 0,id,name,population_2011
1527,cty-India-2,Mumbai,12442373.0
1582,cty-India-New-Delhi,Delhi,11034555.0
1515,cty-India-Bangalore,Bangalore,8443675.0
1000,cty-United-Kingdom-2,London,8250205.0
1382,cty-Iran-Tehran,Tehran,8154051.0
1470,cty-BD-3,Dhaka,7423137.0
1591,cty-India-7,Hyderabad,6731790.0
1505,cty-India-8,Ahmadabad,5577940.0
3056,cty-Angola-Luanda,Luanda,5000000.0
1556,cty-India-Madras,Chennai,4646732.0


In [8]:
#numpy is not used in this cell; the import stays here because the river/lake/airport cell below relies on np
import numpy as np
#10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
#Maps ethnic group name -> estimated head count summed over all countries
ethnicgroups = dict()
#Look at each country element in turn
for country in document_tree.iterfind('country'):
    #Collect (year, population) pairs from the country's <population> elements
    populations = [(int(population.get('year')), int(population.text))
                   for population in country.findall('population')]
    #A country without any population figure cannot contribute an absolute number.
    #Skip it instead of reusing the previous country's value (or failing with a
    #NameError when it is the first country in the file).
    if not populations:
        continue
    #Tuples compare by year first, so max() picks the most recent estimate; [1] is its population.
    #Computed once per country, after all pairs have been collected.
    max_population = max(populations)[1]
    #Multiply each ethnic group's percentage by that population to get an absolute ethnic group population
    for ethnicgroup in country.findall('ethnicgroup'):
        population_ethnic_group = float(ethnicgroup.get('percentage')) * max_population / 100
        #Accumulate across countries; a group seen for the first time starts from 0
        ethnicgroups[ethnicgroup.text] = ethnicgroups.get(ethnicgroup.text, 0) + population_ethnic_group
#Convert the per-group totals into a dataframe (group names become the index, totals are column 0)
df = pd.DataFrame.from_dict(ethnicgroups, orient='index')
#Note: this changes the float display format for the whole session, not just this cell
pd.options.display.float_format = '{:20,.2f}'.format
#Sort by the ethnic group populations to get the top 10
sorted_df = df.sort_values([0],ascending=False).head(10)
print(sorted_df)

                               0
Han Chinese     1,245,058,800.00
Indo-Aryan        871,815,583.44
European          494,872,219.72
African           318,325,120.37
Dravidian         302,713,744.25
Mestizo           157,734,354.94
Bengali           146,776,916.72
Russian           131,856,996.08
Japanese          126,534,212.00
Malay             121,993,550.37


In [12]:
#Name and country of a) longest river, b) largest lake and c) airport at highest elevation
def _measured_rows(tree, feature_tag, measure_tag, countries, skip_missing=False):
    """Build [name, measure, country name] rows for one kind of geographic feature.

    tree         -- parsed document to search
    feature_tag  -- tag of the feature elements under the root, e.g. 'river'
    measure_tag  -- tag of the numeric subelement to read, e.g. 'length'
    countries    -- dict mapping a car_code to the country name
    skip_missing -- when True, a feature whose measure element is absent or empty
                    produces no rows; when False it is kept with np.nan as the measure

    Returns a list with one row per country code listed in the feature's
    'country' attribute. Raises KeyError for a country code not in `countries`.
    """
    rows = []
    for feature in tree.iterfind(feature_tag):
        #The measure is the same for every country of the feature, so parse it once per feature
        measure = feature.find(measure_tag)
        if measure is None or measure.text is None:
            if skip_missing:
                continue
            value = np.nan
        else:
            value = float(measure.text)
        feature_name = feature.find('name').text
        #The 'country' attribute holds whitespace-separated car codes
        for car_code in feature.get('country').split():
            rows.append([feature_name, value, countries[car_code]])
    return rows

#Map each country's car_code to its name
country_dict = {}
for country in document_tree.iterfind('country'):
    country_dict[country.get('car_code')] = country.find('name').text

#Rivers with their lengths; rivers without a length are kept with NaN
rivers = _measured_rows(document_tree, 'river', 'length', country_dict)
river_df = pd.DataFrame(rivers, columns=['name', 'length', 'country'])

#Lakes with their areas; lakes without an area are kept with NaN
lakes = _measured_rows(document_tree, 'lake', 'area', country_dict)
lake_df = pd.DataFrame(lakes, columns=['name', 'area', 'country'])

#Airports with their elevations; airports without an elevation are dropped
airports = _measured_rows(document_tree, 'airport', 'elevation', country_dict, skip_missing=True)
airport_df = pd.DataFrame(airports, columns=['name', 'elevation', 'country'])

#NOTE: a feature spanning several countries has one row per country, and idxmax returns the
#first row holding the maximum, so only the first listed country is reported for it.

#Take the row with the max length of the river dataframe, store that row as max_river
max_river = river_df.loc[river_df['length'].idxmax()]
print("Longest River:\n\t Name: {} Country: {} Length: {}".format(max_river['name'], max_river['country'], max_river['length']))

#Take the row with the max area of the lake dataframe
max_lake = lake_df.loc[lake_df['area'].idxmax()]
print("Largest Lake:\n\t Name: {} Country: {} Area: {}".format(max_lake['name'], max_lake['country'], max_lake['area']))

#Take the row with the max elevation of the airport dataframe
max_airport = airport_df.loc[airport_df['elevation'].idxmax()]
print("Highest airport:\n\t Name: {} Country: {} Elevation: {}".format(max_airport['name'], max_airport['country'], max_airport['elevation']))



Longest River:
	 Name: Amazonas Country: Colombia Length: 6448.0
Largest Lake:
	 Name: Caspian Sea Country: Russia Area: 386400.0
Highest airport:
	 Name: El Alto Intl Country: Bolivia Elevation: 4063.0
