In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [2]:
def get_value(cell):
    
    """ 
        Extract text from the element, and convert them into numbers when possible. 
        Take a cell tag.
        Return the field value.
    """
    
    text = ''.join(cell.find_all(text=True, recursive=False)).strip().replace(u'\xa0', u'')
    
    if text.replace(" ", "").isnumeric():
        return int(text.replace(" ", ""))
    
    elif text.replace(" ", "").replace(".", "").isnumeric():
        return float(text.replace(" ", ""))
    
    elif text.replace(" ", "").replace(".", "").replace("/", "").isnumeric():
        return text.replace(" ", "")
    
    else:
        return text 

In [3]:
def general_info(soup):
    
    """ 
        Extract General Information from the UNdata database.
        Take a bs4.BeautifulSoup object generated from querying the country's profile page.
        Return a dataframe containing the country's General Information
    """
    
    tables = soup.find_all("table")
    rows = tables[1].find_all("tr")
    
    index = [[get_value(r.find_all("td")[0]) for r in rows]]
    value = [[get_value(r.find_all("td")[-1]) for r in rows]]
    
    data = dict(zip(['Index', 'last'], index+value))
    
    df = pd.DataFrame(data)
    
    return df

In [4]:
def indicator(soup, indicator):
    
    """ 
        Extract Indicators information from the UNdata database.
        Take a bs4.BeautifulSoup object generated from querying the country's profile page
           & the name of the indicator.
        Return a dataframe containing the country's indicators data (3 years). 
    """
    
    tables = soup.find_all("table")
    
    if indicator == 'Economic indicators':
        table = tables[2]
    elif indicator == 'Social indicators':
        table = tables[3]
    elif indicator == 'Environment and infrastructure indicators':
        table = tables[4]
    
    rows = table.find_all("tr")
    col_num = len(rows[0].find_all("td"))
        
    year = [get_value(rows[0].find_all("td")[i]) for i in range(col_num)][1:] 

    col_names = ['Index','Unit']+year
    
    index = [[get_value(r.find_all("td")[0]) for r in rows[1:]]] 

    unit = [[get_value(r.find_all("td")[0].small) for r in rows[1:]]]
    
    value = [[(get_value(r.find_all("td")[i].small)) for r in rows[1:]] for i in range(-col_num+1,0)]
    
        
    data = dict(zip(col_names, index+unit+value))
    
    df = pd.DataFrame(data)
    
    return df

In [5]:
def get_profile(url):
    
    """ 
        Take a url that is a country's profile page.
        Return a list of two dataframes, 
               which contains the country's General Information and 
               Indicators data (3 years) respectively. 
    """
    response = requests.get(url)
    
    soup = BeautifulSoup(response.content,'lxml')

    section = [i.get_text() for i in soup.find_all("summary")]

    GI_df = general_info(soup)

    Indicators_df = pd.concat([indicator(soup, i) for i in section[1:]], keys=section[1:])
    
    Indicators_df['last'] = Indicators_df[[2010, 2015, 2020]].apply(lambda x: x[2020] if (x[2020]!='...' and x[2020]!='... / ...') else x[2015] if (x[2015]!='...' and x[2015]!='... / ...') else x[2010], axis=1)

    return [GI_df, Indicators_df]

In [6]:
def removeAccents(word):
    repl = {'à': 'a', 'á': 'a', 'â': 'a', 'ã': 'a',
            'é': 'e', 'ê': 'e',
            'í': 'i',
            'ó': 'o', 'ô': 'o', 'õ': 'o',
            'ú': 'u', 'ü': 'u'}

    new_word = ''.join([repl[c] if c in repl else c for c in word])
    return new_word

In [7]:
def hack_recon_relation(text):
    if text == "Oceania":
        return "Australia and New Zealand"
    else:
        return text

In [8]:
index_url = "http://data.un.org/en/index.html"
response = requests.get(index_url)
soup = BeautifulSoup(response.content,'lxml')

In [9]:
urls = ["http://data.un.org/en/"+u['href'] for u in soup.section.find_all('a', href=True)]
countries_list = [c.previousSibling for c in soup.section.find_all("br")]
countries_list = [removeAccents(str(i).split(' (')[0]) for i in countries_list]
print(len(urls) == len(countries_list))

True


In [10]:
countries_list

['Afghanistan',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bonaire, St. Eustatius & Saba',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'British Virgin Islands',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Channel Islands',
 'Chile',
 'China',
 'China, Hong Kong SAR',
 'China, Macao SAR',
 'Colombia',
 'Comoros',
 'Congo',
 'Cook Islands',
 'Costa Rica',
 'Cote d’Ivoire',
 'Croatia',
 'Cuba',
 'Curaçao',
 'Cyprus',
 'Czechia',
 "Dem. People's Rep. Korea",
 'Dem. Rep. of the Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equ

In [None]:
total = dict()
for i in range(len(urls)):
    profile = get_profile(urls[i])
    total[countries_list[i]] = profile
    print("Finished:" + countries_list[i] + "(" + str(i+1) + "/"+ str(len(urls))+")")

Finished:Afghanistan(1/232)
Finished:Albania(2/232)
Finished:Algeria(3/232)
Finished:American Samoa(4/232)
Finished:Andorra(5/232)
Finished:Angola(6/232)
Finished:Anguilla(7/232)
Finished:Antigua and Barbuda(8/232)
Finished:Argentina(9/232)
Finished:Armenia(10/232)
Finished:Aruba(11/232)
Finished:Australia(12/232)
Finished:Austria(13/232)
Finished:Azerbaijan(14/232)
Finished:Bahamas(15/232)
Finished:Bahrain(16/232)
Finished:Bangladesh(17/232)
Finished:Barbados(18/232)
Finished:Belarus(19/232)
Finished:Belgium(20/232)
Finished:Belize(21/232)
Finished:Benin(22/232)
Finished:Bermuda(23/232)
Finished:Bhutan(24/232)
Finished:Bolivia(25/232)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Finished:Bonaire, St. Eustatius & Saba(26/232)
Finished:Bosnia and Herzegovina(27/232)
Finished:Botswana(28/232)
Finished:Brazil(29/232)
Finished:British Virgin Islands(30/232)
Finished:Brunei Darussalam(31/232)
Finished:Bulgaria(32/232)
Finished:Burkina Faso(33/232)
Finished:Burundi(34/232)
Finished:Cabo Verde(35/232)
Finished:Cambodia(36/232)
Finished:Cameroon(37/232)
Finished:Canada(38/232)
Finished:Cayman Islands(39/232)
Finished:Central African Republic(40/232)
Finished:Chad(41/232)
Finished:Channel Islands(42/232)
Finished:Chile(43/232)
Finished:China(44/232)
Finished:China, Hong Kong SAR(45/232)
Finished:China, Macao SAR(46/232)
Finished:Colombia(47/232)
Finished:Comoros(48/232)
Finished:Congo(49/232)
Finished:Cook Islands(50/232)
Finished:Costa Rica(51/232)
Finished:Cote d’Ivoire(52/232)
Finished:Croatia(53/232)
Finished:Cuba(54/232)
Finished:Curaçao(55/232)
Finished:Cyprus(56/232)
Finished:Czechia(57/232)
Finished:Dem. People's Rep. Korea(58/232)
Finished:Dem. Rep. of the Cong

In [None]:
for c in countries_list:
    total[c][1] = total[c][1][['Index', 'Unit', 'last']] # get 2020 indicators
    total[c][0].columns = ['Index', c]
    total[c][1].columns = ['Index', 'Unit', c]

In [None]:
gi = [total[c][0] for c in countries_list]
GI = gi[0]
for i in range(len(gi))[1:]:
    GI = pd.merge(GI, gi[i], how="outer", on="Index")

In [None]:
GIt = GI.transpose()[1:]
GIt.columns = GI.Index

In [None]:
ind = [total[c][1] for c in countries_list]
Ind = ind[0]
for i in range(len(ind))[1:]:
    Ind = pd.merge(Ind, ind[i], how="outer", on= ["Index", "Unit"])

In [None]:
Indt = Ind.transpose()[2:]
Indt.columns = Ind.Index + Ind.Unit
countries_df = pd.concat([GIt, Indt], axis=1)
countries_df

In [None]:
countries_df['Currency+ISO-4217'] = countries_df['National currency'].apply(lambda x: str(x)[-4:-1]) 

In [None]:
countries_df.to_csv('countries_df.csv')

In [None]:
countries_df['Index'] = countries_df.index 
temp = countries_df [['Index', 'Region']]
recon_relation = temp.groupby('Region')['Index'].apply(list).reset_index()
recon_relation.columns = ['Index', 'Countries']
recon_relation.index = recon_relation ['Index']
recon_relation
recon_relation.index = recon_relation.index.map(hack_recon_relation)
recon_relation = recon_relation.drop(columns=['Index'])
recon_relation

# Regions

In [None]:
index_url = "http://data.un.org/en/regions.html"
response = requests.get(index_url)
soup = BeautifulSoup(response.content,'lxml')

In [None]:
urls = ["http://data.un.org/en/"+u['href'] for u in soup.section.find_all('a', href=True)]
regions_list = [str(i).replace('<td>','').replace('</td>','') for i in [soup.find_all("td")[i*5] for i in range(1,31)] ]
print(len(urls) == len(regions_list))

In [None]:
total = dict()
for i in range(len(urls)):
    profile = get_profile(urls[i])
    total[regions_list[i]] = profile
    print("Finished:" + regions_list[i] + "(" + str(i+1) + "/"+ str(len(urls))+")")

In [None]:
for c in regions_list:
    total[c][1] = total[c][1][['Index', 'Unit', 'last']] # get 2020 indicators
    total[c][0].columns = ['Index', c]
    total[c][1].columns = ['Index', 'Unit', c]

In [None]:
gi = [total[c][0] for c in regions_list]
GI = gi[0]
for i in range(len(gi))[1:]:
    GI = pd.merge(GI, gi[i], how="outer", on="Index")

In [None]:
GIt = GI.transpose()[1:]
GIt.columns = GI.Index

In [None]:
ind = [total[c][1] for c in regions_list]
Ind = ind[0]
for i in range(len(ind))[1:]:
    Ind = pd.merge(Ind, ind[i], how="outer", on= ["Index", "Unit"])

In [None]:
Indt = Ind.transpose()[2:]
Indt.columns = Ind.Index + Ind.Unit

In [None]:
dfm = pd.concat([GIt, Indt], axis=1)

In [None]:
temp=dfm[ dfm['Region'] != 'World']
regions_df =temp[temp.index!='World']
regions_df['Index'] = regions_df.index
regions_df

In [None]:
regions_df_final=regions_df.merge(recon_relation, how='outer', on='Index')
regions_df_final.index = regions_df_final['Index']
regions_df_final

In [None]:
regions_df_final.to_csv('regions_df_final.csv')

In [None]:
regions_df['Index'] = regions_df.index 
temp = regions_df [['Index', 'Region']]
recontin_realtion = temp.groupby('Region')['Index'].apply(list).reset_index()
recontin_realtion.columns = ['Index', 'Regions']
# recontin_realtion.index = recontin_realtion ['Index']
# recon_relation.drop(columns=['Index'], inplace=True)
recontin_realtion

# continents

In [None]:
continents_df =dfm[ dfm['Region'] == 'World']
continents_df['Index']= continents_df.index
continents_df

In [None]:
continents_df_final=continents_df.merge(recontin_realtion, how='inner', on='Index')
continents_df_final.index = continents_df_final['Index']
continents_df_final

In [None]:
continents_df_final.to_csv('continents_df_final.csv')