# XML Mini-Project

In [1]:
# import ElementTree as ET
# import pandas as pd
# import numpy as np

from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np

In [2]:
# read in xml file using ET.parse()

document_tree = ET.parse('./data/mondial_database.xml')
root = document_tree.getroot()

In [3]:
# examine all elements in one root element of the tree

list(root[1].iter())

[<Element 'country' at 0x0000000009707098>,
 <Element 'name' at 0x00000000097070E8>,
 <Element 'localname' at 0x0000000009707138>,
 <Element 'population' at 0x0000000009707188>,
 <Element 'population' at 0x00000000097071D8>,
 <Element 'population' at 0x0000000009707228>,
 <Element 'population' at 0x0000000009707278>,
 <Element 'population' at 0x00000000097072C8>,
 <Element 'population' at 0x0000000009707318>,
 <Element 'population' at 0x0000000009707368>,
 <Element 'population' at 0x00000000097073B8>,
 <Element 'population' at 0x0000000009707408>,
 <Element 'population' at 0x0000000009707458>,
 <Element 'population' at 0x00000000097074A8>,
 <Element 'population' at 0x00000000097074F8>,
 <Element 'population' at 0x0000000009707548>,
 <Element 'population' at 0x0000000009707598>,
 <Element 'population' at 0x00000000097075E8>,
 <Element 'population_growth' at 0x0000000009707638>,
 <Element 'infant_mortality' at 0x0000000009707688>,
 <Element 'gdp_total' at 0x00000000097076D8>,
 <Element '

## 1. 10 countries with the lowest infant mortality rates

In [4]:
# create dictionary for country:infant_mortality pairs
infant_mortality = {}

# search for 'infant_mortality' tag. if found, add country 'name' and 'infant_mortality' value to dictionary
for child in document_tree.getroot():
    if child.find('infant_mortality') != None:
        infant_mortality[child.find('name').text] = float(child.find('infant_mortality').text)

# import operator and use to slice list of lowest values from infant_mortality
import operator        
lowest_im = sorted(infant_mortality.items(), key=operator.itemgetter(1), reverse=False)[:10]

# create dataframe for countries with 10 lowest infant mortalities
df_lowest_im = pd.DataFrame(lowest_im)
df_lowest_im.columns = ['country', 'infant_mortality']
df_lowest_im


Unnamed: 0,country,infant_mortality
0,Monaco,1.81
1,Japan,2.13
2,Norway,2.48
3,Bermuda,2.48
4,Singapore,2.53
5,Sweden,2.6
6,Czech Republic,2.63
7,Hong Kong,2.73
8,Macao,3.13
9,Iceland,3.15


## 2. 10 cities with the largest population

In [5]:
# create dictionary for city:population pairs
population = {}

# search for subelement 'city' under every 'country' tag. If found, add city and population from the latest year to dictionary
for element in document_tree.iterfind('country'):
    for subelement in element.getiterator('city'):
        if subelement.find("population[last()]") != None:
            population[subelement.find('name').text] = float(subelement.find("population[last()]").text)

# import Counter and use to create dictionary of 10 highest populations
from collections import Counter           
largest_pop_dict = dict(Counter(population).most_common(10))

# create dataframe for cities with largest populations
largest_pop = pd.DataFrame.from_dict(largest_pop_dict, orient='index')
pd.options.display.float_format = '{:20,.0f}'.format
largest_pop.columns = ['population']
largest_pop

Unnamed: 0,population
Shanghai,22315474
Istanbul,13710512
Mumbai,12442373
Moskva,11979529
Beijing,11716620
São Paulo,11152344
Tianjin,11090314
Guangzhou,11071424
Delhi,11034555
Shenzhen,10358381


## 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [6]:
# create dictionary called 'eth' to store ethnicity:population pairs
eth = {}

# search for 'ethnicgroup' tag under every country
for element in document_tree.iterfind('country'):
    for subelement in element.findall('ethnicgroup'):
        if element.find('ethnicgroup') != None:
            
            # if ethnic group is not in eth dictionary, add it to the dictionary with the population
            if element.find('ethnicgroup').text not in eth:
                #eth[element.find('ethnicgroup').text] = float(element.find('ethnicgroup').attrib['percentage']) * .01 * int(element.find('population[last()]').text)
                eth[element.find('ethnicgroup').text] = float(element.find('ethnicgroup').attrib['percentage']) * .01 * int(element.find('population[last()]').text)
            # if ethnic group in dictionary, add population to existing population value in dictionary
            else:
                eth[element.find('ethnicgroup').text] = eth[element.find('ethnicgroup').text] + (float(element.find('ethnicgroup').attrib['percentage']) * .01 * int(element.find('population[last()]').text))

# create df_eth dataframe from eth dictionary. Format values to number with commas, 2 decimals
df_eth = pd.DataFrame.from_dict(eth, orient='index')
df_eth.columns = ['population']
#pd.options.display.float_format = '{:20,.0f}'.format
df_eth

# sort df_eth by population and show top 10 values
#df_eth.sort_values('population', ascending=False).head(10)


Unnamed: 0,population
Albanian,8510587
Greek,11353373
Macedonian,6611939
Serb,35418193
Montenegrin,1066450
Spanish,167947
Mediterranean Nordic,46815916
Austrian,46459683
Czech,66837690
German,169968839


## 4. name of a) longest river, b) largest lake and c) airport at highest elevation

In [7]:
#create dictionary of country codes to map country into each entry
car_codes = {}

for country in root.findall('country'):
    car_codes[country.get('car_code')] = country.find('name').text

## a) Longest River

In [8]:
# create empty list for rivers. List elements will be lists consisting of [name, length, area] for every river
riverslist = []

# for every river: find rivername, find riverlength, find rivercountry using element.find().text or element.get('attribute')
# append list of [rivername, riverlength, rivercountry] to riverslist
for river in root.findall('river'):
    rivername = river.find('name').text
    if river.find('length') is not None:
        riverlength = float(river.find('length').text)
    else:
        riverarea = np.NaN
    if river.get('country') is not None:
        rivercountry = river.get('country')
    else:
        rivercountry = np.Nan
    riverslist.append([rivername, riverlength, rivercountry])

In [9]:
# create dataframe from riverslist and set column names.
df_rivers = pd.DataFrame(riverslist)
df_rivers.columns = ['name', 'length', 'car_code']

# sort dataframe by length and show top 10 values.
df_rivers.sort_values('length', ascending=False).head(10)

Unnamed: 0,name,length,car_code
174,Amazonas,6448,CO BR PE
137,Jangtse,6380,CN
136,Hwangho,4845,CN
123,Lena,4400,R
205,Zaire,4374,RCB ZRE
138,Mekong,4350,CN LAO THA K VN
115,Irtysch,4248,R KAZ CN
186,Niger,4184,RMM RN WAN RG
160,Missouri,4130,USA
119,Jenissej,4092,R


### b) Largest Lake

In [10]:
# create empty list for lakes. List elements will be lists of [name, area, country].
lakeslist = []

# for every lake: find the name, area, and country. Append [name, area, country] to lakes list.
for lake in root.findall('lake'):
    lakename = lake.find('name').text
    if lake.find('area') is not None:
        lakearea = float(lake.find('area').text)
    else:
        lakearea = np.NaN
    if lake.find('located') is None:
        if lake.get('country') is None:
            lakecountry = np.Nan
        else:
            lakecountry = lake.get('country')
    else:
        lakecountry = lake.find('located').attrib['country']
    lakeslist.append([lakename, lakearea, lakecountry])


In [11]:
# create dataframe from lakeslist and set column titles.
df_lakes = pd.DataFrame(lakeslist)
df_lakes.columns = ['name', 'area', 'car_code']

# map country using car_codes.
df_lakes['country'] = df_lakes['car_code'].map(car_codes)

# sort dataframe by 'area' column and show top 10 values.
df_lakes.sort_values('area', ascending=False).head(10)



Unnamed: 0,name,area,car_code,country
54,Caspian Sea,386400,R,Russia
109,Lake Superior,82103,CDN,Canada
81,Lake Victoria,68870,EAT,Tanzania
106,Lake Huron,59600,CDN,Canada
108,Lake Michigan,57800,USA,United States
47,Dead Sea,41650,IL,Israel
83,Lake Tanganjika,32893,ZRE,Zaire
98,Great Bear Lake,31792,CDN,Canada
43,Ozero Baikal,31492,R,Russia
89,Lake Malawi,29600,MOC,Mozambique


## c) Airport at Highest Elevation

In [12]:
# create empty list for airports. List elements will be [name, elevation, country] for each airport.
airportslist = []

# for every airport: find name, elevation, and country. Append [name, elevation, country] to airportslist.
for airport in root.findall('airport'):
    airportname = airport.find('name').text
    if airport.find('elevation').text is not None:
        airportelevation = float(airport.find('elevation').text)
    else:
        airportelevation = np.NaN
    airportcountry = airport.get('country')
    airportslist.append([airportname, airportelevation, airportcountry])


In [13]:
# create dataframe from airportslist and set column names. 
df_airports = pd.DataFrame(airportslist)
df_airports.columns = ['name', 'elevation', 'car_code']

# map country names using car_codes dictionary.
df_airports['country'] = df_airports['car_code'].map(car_codes)

# sort dataframe by 'elevation' column and show top 10 values.
df_airports.sort_values('elevation', ascending=False).head(10)

Unnamed: 0,name,elevation,car_code,country
80,El Alto Intl,4063,BOL,Bolivia
219,Lhasa-Gonggar,4005,CN,China
241,Yushu Batang,3963,CN,China
813,Juliaca,3827,PE,Peru
815,Teniente Alejandro Velasco Astete Intl,3311,PE,Peru
82,Juana Azurduy De Padilla,2905,BOL,Bolivia
334,Mariscal Sucre Intl,2813,EC,Ecuador
805,Coronel Fap Alfredo Mendivil Duarte,2719,PE,Peru
807,Mayor General FAP Armando Revoredo Iglesias Ai...,2677,PE,Peru
692,Licenciado Adolfo Lopez Mateos Intl,2581,MEX,Mexico
