# XML exercise

Using data from [**mondial database**](https://drive.google.com/file/d/14lFT4nWHgwN36ij4XZh6OUuup-K9qLgR/view?usp=sharing) find the answers to following questions:

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

In [2]:
tree = ET.parse("mondial.xml")

In [3]:
root = tree.getroot()
root

<Element 'mondial' at 0x00000208CFEE7E50>

In [4]:
print(root.tag)
print(root.attrib)
print(len(root))

mondial
{}
3403


In [5]:
# get some info

# First child of the root
country1 = root[1]
# tag of child
print(country1.tag)
# First child of the child
rank = country1[0]
# print(country1[0].text)
# What is the tag of the grandchild
print(rank.tag)
# What is the text inside this grandchild
print(rank.text)
# What are the attributes of 11th element?
print(country1.find('infant_mortality').text)
# print(country1[11].text)

country
name
Greece
4.78


In [15]:
# get 10 countries with the lowest infant mortality rates

mondial_dict = {'name': [],
            'infant_mortality': []}

for country in root:
    name_value = country.find('name').text
    mondial_dict['name'].append(name_value)

    infant_mortality = country.find('infant_mortality')
    if infant_mortality is None:
        mondial_dict['infant_mortality'].append(None)
    else:
        mondial_dict['infant_mortality'].append(float(infant_mortality.text))

df = pd.DataFrame(mondial_dict)
df.nsmallest(10, 'infant_mortality', keep='all')

Unnamed: 0,name,infant_mortality
38,Monaco,1.81
98,Japan,2.13
36,Norway,2.48
117,Bermuda,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
8,Spain,2.7
78,Hong Kong,2.73
79,Macao,3.13


In [29]:
# 10 cities with the largest population

mondial_dict = {'city': [],
                'year': [],
                'measured': [],
                'population': []}

for city in root.findall("./country/city"):
    cityname = city.find('name').text
    
    for population in city.findall("./population"):
        population_value = int(population.text)
        year = int(population.attrib['year'])
        measured = None
        try:
            measured = population.attrib['measured']
        except:
            pass
        mondial_dict['measured'].append(measured)
        mondial_dict['city'].append(cityname)
        mondial_dict['year'].append(year)
        mondial_dict['population'].append(population_value)

df = pd.DataFrame(mondial_dict)
df.sort_values(by=['city', 'year'], ascending=[True, True])
df.groupby(by=['city']).last().nlargest(10,'population')



Unnamed: 0_level_0,year,measured,population
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Seoul,2015,census,9805506
Al Qahirah,2006,census,8471859
Bangkok,2010,census,8305218
Hong Kong,2009,,7055071
Singapore,2010,census,5076700
Al Iskandariyah,2006,census,4123869
New Taipei,2012,estimate,3939305
Busan,2015,census,3440484
Pyongyang,2008,census,3255288
Nairobi,2009,census,3133518


In [43]:
# name and country of a) longest river, b) largest lake and 
# c) airport at highest elevation

# a) longest river

mondial_dict = {'name': [],
                'country': [],
                'length': []}

for river in root.findall("./river"):
    name_value = river.find('name').text
    country_value = river.attrib['country']
    length_value = river.find('length')
    
    if length_value is not None:
        mondial_dict['length'].append(float(length_value.text))
    else:
        mondial_dict['length'].append(None)

    mondial_dict['name'].append(name_value)
    mondial_dict['country'].append(country_value)

df = pd.DataFrame(mondial_dict)
df
df.nlargest(1,'length')


Unnamed: 0,name,country,length
214,Yangtze,CN,6380.0


In [44]:
# b) largest lake

mondial_dict = {'name': [],
                'country': [],
                'area': []}

for lake in root.findall("./lake"):
    name = lake.find('name').text
    country = lake.attrib['country']
    area = lake.find('area')

    if area is not None:
        mondial_dict['area'].append(float(area.text))
    else:
        mondial_dict['area'].append(None)

    mondial_dict['name'].append(name)
    mondial_dict['country'].append(country)

df = pd.DataFrame(mondial_dict)
df.nlargest(1,'area')

Unnamed: 0,name,country,area
59,Caspian Sea,R AZ KAZ IR TM,386400.0


In [48]:
# c) airport at highest elevation

mondial_dict = {'name': [],
                'country': [],
                'elevation': []}

for airport in root.findall("./airport"):
    name = airport.find('name').text
    country = airport.attrib['country']
    elevation = airport.find('elevation')

    if elevation is not None:
        mondial_dict['elevation'].append(float(elevation.text))
    else:
        mondial_dict['elevation'].append(None)

    mondial_dict['name'].append(name)
    mondial_dict['country'].append(country)

df = pd.DataFrame(mondial_dict)
df.nlargest(1,'elevation')

Unnamed: 0,name,country,elevation
81,El Alto Intl,BOL,4063.0
