# Springboard XML Practice
Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find
+ 10 countries with the lowest infant mortality rates
+ 10 cities with the largest population
+ 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries) 
+ name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [1]:
import os

cwd = os.getcwd()  # Get the current working directory (cwd)
files = os.listdir(cwd)  # Get all the files in that directory
print("Files in '%s': %s" % (cwd, files))

Files in 'C:\Users\osutr_000\Documents\data_wrangling_xml': ['.DS_Store', '.ipynb_checkpoints', 'data', 'Data Wrangling - XML.ipynb', 'sliderule_dsi_xml_exercise.ipynb']


In [2]:
import xml.etree.ElementTree as ET
import pandas as pd

In [3]:
# Read XML document

In [4]:
tree = ET.parse('./data/mondial_database.xml')
root = tree.getroot()

## Question 1: Find the 10 countries with lowest infant mortality rates

In [16]:
#Create pandas dataframe to hold values
country_name = []
infant_mortality = []
df = pd.DataFrame(columns = ['country_name','infant_mortality'])
df['infant_mortality'] = df['infant_mortality'].astype(float)


In [6]:
for country in tree.findall('country'):
    for node in country.getiterator(): 
        if node.tag == 'name': #find country name tag
            if country_name == "":
                country_name = (node.text)
        if node.tag == 'infant_mortality': #find mortality rate tag
            infant_mortality = float(node.text)
    df.loc[len(df)] = [country_name,infant_mortality] #add country name and mortality rate to data frame
    country_name = ""

In [7]:
#sort data frame and find top 10 countries with lowest mortalitiy rates
df.sort_values(by = 'infant_mortality').head(10)

Unnamed: 0,country_name,infant_mortality
38,Monaco,1.81
98,Japan,2.13
36,Norway,2.48
117,Bermuda,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


# Question 2: Find the 10 cities with the largest population

In [17]:
#create data frame to hold values
city_name = ""
city_population = ""
tree = ET.parse( './data/mondial_database.xml' )
df = pd.DataFrame(columns=['City_Name','City_Population']) 
df['City_Population'] = df['City_Population'].astype(float)

In [18]:
#loop through country element to find city name and its population
for country in tree.iterfind( 'country' ):
    for city in country.iter('city'): #find all cities within each country element
        city_name = city.find('name').text
        year = int(0)
        for node in city.iterfind('population'): #find all population elements with each city
            year = node.attrib['year']
            if node.attrib['year'] >= year: #store the population number of the latest year
                city_population = int(node.text)
        df.loc[len(df)] = [city_name,city_population] #add city name and its population to data frame
        city_name = ""

In [19]:
#sort data frame
df.sort_values(by = 'City_Population', ascending=False).head(10)

Unnamed: 0,City_Name,City_Population
1341,Shanghai,22315474.0
771,Istanbul,13710512.0
1527,Mumbai,12442373.0
479,Moskva,11979529.0
1340,Beijing,11716620.0
2810,São Paulo,11152344.0
1342,Tianjin,11090314.0
1064,Guangzhou,11071424.0
1582,Delhi,11034555.0
1067,Shenzhen,10358381.0


# Question 3:Find the 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [26]:
#create data frame to hold values
countryname = []
countrypopulation = 0
ethnicpopulation = 0
tree = ET.parse( './data/mondial_database.xml' )
df = pd.DataFrame(columns=['Country','Ethnic Group','Population'])
df['Population'] = df['Population'].astype(float)

In [29]:
#loop through country element to find ethnic groups and its population
for country in tree.iterfind( 'country' ):
    countryname = country.find('name').text #find country name
    year = int(0)
    countrycpopulation = int(0)
    for node in country.iterfind('population'): #find population of the country
        year = node.attrib['year']
        if node.attrib['year'] >= year: #find population of the latest year
            countrypopulation = (node.text) 
    ethnicname = None
    ethnicpopulation = 0
    for ethnic in country.iter('ethnicgroup'): #find all ethnic groups within the same country
        ethnicname = ethnic.text
        #compute each ethnic population: country population * ethnic group percentage
        ethnicpopulation = round(float(ethnic.attrib['percentage']) * 0.01 * int(countrypopulation))
        if ethnicname == None:
            ethnicname = countryname
            ethnicpopulation = countrypopulation
        df.loc[len(df)] = [countryname,ethnicname,ethnicpopulation] #store ethnic group population to data frame
    countryname = ""

df.head()

Unnamed: 0,Country,Ethnic Group,Population
0,Albania,Albanian,2660131.0
1,Albania,Greek,84004.0
2,Greece,Greek,10059146.0
3,Macedonia,Macedonian,1322388.0
4,Macedonia,Albanian,519068.0


In [23]:
#group and sort ethnic groups
df.groupby('Ethnic Group').sum().sort_values(by = 'Population', ascending=False).head(10)

Unnamed: 0_level_0,Population
Ethnic Group,Unnamed: 1_level_1
Han Chinese,1245059000.0
Indo-Aryan,871815600.0
European,494872200.0
African,318325100.0
Dravidian,302713700.0
Mestizo,157734400.0
Bengali,146776900.0
Russian,131857000.0
Japanese,126534200.0
Malay,121993600.0


# Question 4: Find the name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [48]:
# Establish Variables
country = []
rivername = []
riverlength = 0


#Setup pandas data frame to insert values
tree = ET.parse( './data/mondial_database.xml' )
df = pd.DataFrame(columns=['Country','River Name','River Length'])
df['River Length'] = df['River Length'].astype(float)

Unnamed: 0,Country,River Name,River Length


In [54]:
for river in tree.iterfind('river'):
    country = river.get('country')
    rivername = river.get('id')
    length = river.find('./length')
    if length == None:
        pass
    else:
        riverlength = float(length.text)
    df.loc[len(df)] = [country,rivername,riverlength]

In [55]:
df.sort_values(by='River Length', ascending=False).head(1)

Unnamed: 0,Country,River Name,River Length
412,CO BR PE,river-Amazonas,6448.0


In [62]:
#Establish Variables
country = []
lakename = []
lakesize = 0


#Setup pandas data frame to insert values
tree = ET.parse( './data/mondial_database.xml' )
df = pd.DataFrame(columns=['Country', 'Lake Name', 'Lake Size'])
df['Lake Size'] = df['Lake Size'].astype(float)

Unnamed: 0,Country,Lake Name,Lake Size


In [65]:
for lake in tree.iterfind('lake'):
    country = lake.get('country')
    lakename = lake.get('id')
    size = lake.find('./depth')
    if size == None:
        pass
    else:
        lakesize = float(size.text)
    df.loc[len(df)] = [country,lakename,lakesize]

In [67]:
df.sort_values(by = 'Lake Size', ascending = False).head(1)

Unnamed: 0,Country,Lake Name,Lake Size
184,R,lake-Baikalsee,1637.0


In [107]:
#Establish Variables
country = []
airportname = []
airportelev = 0

#Setup pandas data frame to insert values
tree = ET.parse( './data/mondial_database.xml' )
df = pd.DataFrame(columns=['Country','Airport Name','Airport Elevation'])
df['Airport Elevation'] = df['Airport Elevation'].astype(float)

In [108]:
for airport in tree.iterfind('airport'):
    country = airport.get('country')
    airportname = airport.find('name').text
    elev = airport.find('elevation').text
    if elev == None:
        pass
    else:
        airportelevation = float(elev)
    df.loc[len(df)] = [country,airportname,airportelevation]

In [109]:
    
df.sort_values(by = 'Airport Elevation', ascending = False).head(1)

Unnamed: 0,Country,Airport Name,Airport Elevation
80,BOL,El Alto Intl,4063.0
