# XML Mini-Project

In [1]:
# import ElementTree as ET
# import pandas as pd
# import numpy as np

from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np

In [2]:
# read in xml file using ET.parse()

document_tree = ET.parse('./data/mondial_database.xml')
root = document_tree.getroot()

In [3]:
# examine all elements in one root element of the tree

list(root[1].iter())

[<Element 'country' at 0x9773b38>,
 <Element 'name' at 0x9773ba8>,
 <Element 'localname' at 0x9773be0>,
 <Element 'population' at 0x9773c18>,
 <Element 'population' at 0x9773c50>,
 <Element 'population' at 0x9773c88>,
 <Element 'population' at 0x9773cc0>,
 <Element 'population' at 0x9773cf8>,
 <Element 'population' at 0x9773d30>,
 <Element 'population' at 0x9773d68>,
 <Element 'population' at 0x9773da0>,
 <Element 'population' at 0x9773dd8>,
 <Element 'population' at 0x9773e10>,
 <Element 'population' at 0x9773e48>,
 <Element 'population' at 0x9773e80>,
 <Element 'population' at 0x9773eb8>,
 <Element 'population' at 0x9773ef0>,
 <Element 'population' at 0x9773f28>,
 <Element 'population_growth' at 0x9773f60>,
 <Element 'infant_mortality' at 0x9773f98>,
 <Element 'gdp_total' at 0x9773fd0>,
 <Element 'gdp_agri' at 0x9926048>,
 <Element 'gdp_ind' at 0x9926080>,
 <Element 'gdp_serv' at 0x99260b8>,
 <Element 'inflation' at 0x99260f0>,
 <Element 'unemployment' at 0x9926128>,
 <Element 'indep

## 1. 10 countries with the lowest infant mortality rates

In [4]:
# create dictionary for country:infant_mortality pairs
infant_mortality = {}

# search for 'infant_mortality' tag. if found, add country 'name' and 'infant_mortality' value to dictionary
for child in document_tree.getroot():
    if child.find('infant_mortality') != None:
        infant_mortality[child.find('name').text] = float(child.find('infant_mortality').text)

# import operator and use to slice list of lowest values from infant_mortality
import operator        
lowest_im = sorted(infant_mortality.items(), key=operator.itemgetter(1), reverse=False)[:10]

# create dataframe for countries with 10 lowest infant mortalities
df_lowest_im = pd.DataFrame(lowest_im)
df_lowest_im.columns = ['country', 'infant_mortality']
df_lowest_im


Unnamed: 0,country,infant_mortality
0,Monaco,1.81
1,Japan,2.13
2,Bermuda,2.48
3,Norway,2.48
4,Singapore,2.53
5,Sweden,2.6
6,Czech Republic,2.63
7,Hong Kong,2.73
8,Macao,3.13
9,Iceland,3.15


## 2. 10 cities with the largest population

In [16]:
# create dictionary for city:population pairs
population = {}

# search for subelement 'city' under every 'country' tag. If found, add city and population from the latest year to dictionary
for element in document_tree.iterfind('country'):
    for subelement in element.getiterator('city'):
        if subelement.find("population[last()]") != None:
            population[subelement.find('name').text] = float(subelement.find("population[last()]").text)

# import Counter and use to create dictionary of 10 highest populations
from collections import Counter           
largest_pop_dict = dict(Counter(population).most_common(10))

# create dataframe for cities with largest populations
largest_pop = pd.DataFrame.from_dict(largest_pop_dict, orient='index')
pd.options.display.float_format = '{:20,.0f}'.format
largest_pop.columns = ['population']
largest_pop

Unnamed: 0,population
São Paulo,11152344
Guangzhou,11071424
Istanbul,13710512
Beijing,11716620
Tianjin,11090314
Shanghai,22315474
Delhi,11034555
Mumbai,12442373
Shenzhen,10358381
Moskva,11979529


## 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [17]:
# create dictionary called 'eth' to store ethnicity:population pairs
eth = {}

# search for 'ethnicgroup' tag under every country
for element in document_tree.iterfind('country'):
    for subelement in element.findall('ethnicgroup'):
        if element.find('ethnicgroup') != None:
            
            # if ethnic group is not in eth dictionary, add it to the dictionary with the population
            if element.find('ethnicgroup').text not in eth:
                eth[element.find('ethnicgroup').text] = float(element.find('ethnicgroup').attrib['percentage']) * .01 * int(element.find('population[last()]').text)
            
            # if ethnic group in dictionary, add population to existing population value in dictionary
            else:
                eth[element.find('ethnicgroup').text] = eth[element.find('ethnicgroup').text] + (float(element.find('ethnicgroup').attrib['percentage']) * .01 * int(element.find('population[last()]').text))

# create df_eth dataframe from eth dictionary. Format values to number with commas, 2 decimals
df_eth = pd.DataFrame.from_dict(eth, orient='index')
df_eth.columns = ['population']
pd.options.display.float_format = '{:20,.0f}'.format

# sort df_eth by population and show top 10 values
df_eth.sort_values('population', ascending=False).head(10)


Unnamed: 0,population
European,1481990318
Han Chinese,1245058800
Dravidian,908141233
Russian,573231055
Viet/Kinh,532548627
Mestizo,524233585
Javanese,453824024
English,375146287
Ukrainian,355029696
Oromo,261816665


## 4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [7]:
#create dictionary of country codes to map country into each entry
car_codes = {}

for country in root.findall('country'):
    car_codes[country.get('car_code')] = country.find('name').text

## a) Longest River

In [8]:
# create empty list for rivers. List elements will be lists consisting of [name, length, area] for every river
riverslist = []

# for every river: find rivername, find riverlength, find rivercountry using element.find().text or element.get('attribute')
# append list of [rivername, riverlength, rivercountry] to riverslist
for river in root.findall('river'):
    rivername = river.find('name').text
    if river.find('length') is not None:
        riverlength = float(river.find('length').text)
    else:
        riverarea = np.NaN
    if river.find('located') is None:
        if river.get('country') is None:
            rivercountry = np.Nan
        else:
            rivercountry = river.get('country')
    else:
        rivercountry = river.find('located').attrib['country']
    riverslist.append([rivername, riverlength, rivercountry])


In [9]:
# create dataframe from riverslist and set column names.
df_rivers = pd.DataFrame(riverslist)
df_rivers.columns = ['name', 'length', 'car_code']

# map country names using car_code for every entry.
df_rivers['country'] = df_rivers['car_code'].map(car_codes)

# sort dataframe by length and show top 10 values.
df_rivers.sort_values('length', ascending=False).head(10)

Unnamed: 0,name,length,car_code,country
174,Amazonas,6448.0,CO,Colombia
137,Jangtse,6380.0,CN,China
136,Hwangho,4845.0,CN,China
123,Lena,4400.0,R,Russia
205,Zaire,4374.0,ZRE,Zaire
138,Mekong,4350.0,CN,China
115,Irtysch,4248.0,R,Russia
186,Niger,4184.0,RMM,Mali
160,Missouri,4130.0,USA,United States
119,Jenissej,4092.0,R,Russia


## b) Largest Lake

In [10]:
# create empty list for lakes. List elements will be lists of [name, area, country].
lakeslist = []

# for every lake: find the name, area, and country. Append [name, area, country] to lakes list.
for lake in root.findall('lake'):
    lakename = lake.find('name').text
    if lake.find('area') is not None:
        lakearea = float(lake.find('area').text)
    else:
        lakearea = np.NaN
    if lake.find('located') is None:
        if lake.get('country') is None:
            lakecountry = np.Nan
        else:
            lakecountry = lake.get('country')
    else:
        lakecountry = lake.find('located').attrib['country']
    lakeslist.append([lakename, lakearea, lakecountry])


In [11]:
# create dataframe from lakeslist and set column titles.
df_lakes = pd.DataFrame(lakeslist)
df_lakes.columns = ['name', 'area', 'car_code']

# map country using car_codes.
df_lakes['country'] = df_lakes['car_code'].map(car_codes)

# sort dataframe by 'area' column and show top 10 values.
df_lakes.sort_values('area', ascending=False).head(10)



Unnamed: 0,name,area,car_code,country
54,Caspian Sea,386400.0,R,Russia
109,Lake Superior,82103.0,CDN,Canada
81,Lake Victoria,68870.0,EAT,Tanzania
106,Lake Huron,59600.0,CDN,Canada
108,Lake Michigan,57800.0,USA,United States
47,Dead Sea,41650.0,IL,Israel
83,Lake Tanganjika,32893.0,ZRE,Zaire
98,Great Bear Lake,31792.0,CDN,Canada
43,Ozero Baikal,31492.0,R,Russia
89,Lake Malawi,29600.0,MOC,Mozambique


## c) Airport at Highest Elevation

In [12]:
# create empty list for airports. List elements will be [name, elevation, country] for each airport.
airportslist = []

# for every airport: find name, elevation, and country. Append [name, elevation, country] to airportslist.
for airport in root.findall('airport'):
    airportname = airport.find('name').text
    if airport.find('elevation').text is not None:
        airportelevation = float(airport.find('elevation').text)
    else:
        airportelevation = np.NaN
    airportcountry = airport.get('country')
    airportslist.append([airportname, airportelevation, airportcountry])


In [13]:
# create dataframe from airportslist and set column names. 
df_airports = pd.DataFrame(airportslist)
df_airports.columns = ['name', 'elevation', 'car_code']

# map country names using car_codes dictionary.
df_airports['country'] = df_airports['car_code'].map(car_codes)

# sort dataframe by 'elevation' column and show top 10 values.
df_airports.sort_values('elevation', ascending=False).head(10)

Unnamed: 0,name,elevation,car_code,country
80,El Alto Intl,4063.0,BOL,Bolivia
219,Lhasa-Gonggar,4005.0,CN,China
241,Yushu Batang,3963.0,CN,China
813,Juliaca,3827.0,PE,Peru
815,Teniente Alejandro Velasco Astete Intl,3311.0,PE,Peru
82,Juana Azurduy De Padilla,2905.0,BOL,Bolivia
334,Mariscal Sucre Intl,2813.0,EC,Ecuador
805,Coronel Fap Alfredo Mendivil Duarte,2719.0,PE,Peru
807,Mayor General FAP Armando Revoredo Iglesias Ai...,2677.0,PE,Peru
692,Licenciado Adolfo Lopez Mateos Intl,2581.0,MEX,Mexico
