In [385]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re
import json 
from typing import List, Dict
import os

In [386]:
def get_value(cell):
    
    """ 
        Extract text from the element, and convert them into numbers when possible. 
        Take a cell tag.
        Return the field value.
    """
    
    text = ''.join(cell.find_all(text=True, recursive=False)).strip().replace(u'\xa0', u'')
    
    if text.replace(" ", "").isnumeric():
        return int(text.replace(" ", ""))
    
    elif text.replace(" ", "").replace(".", "").isnumeric():
        return float(text.replace(" ", ""))
    
    elif text.replace(" ", "").replace(".", "").replace("/", "").isnumeric():
        return text.replace(" ", "")
    
    else:
        return text 

In [387]:
def general_info(soup):
    
    """ 
        Extract General Information from the UNdata database.
        Take a bs4.BeautifulSoup object generated from querying the country's profile page.
        Return a dataframe containing the country's General Information
    """
    
    tables = soup.find_all("table")
    rows = tables[1].find_all("tr")
    
    index = [[get_value(r.find_all("td")[0]) for r in rows]]
    value = [[get_value(r.find_all("td")[-1]) for r in rows]]
    
    data = dict(zip(['Index', 'last'], index+value))
    
    df = pd.DataFrame(data)
    
    return df

In [388]:
def indicator(soup, indicator):
    
    """ 
        Extract Indicators information from the UNdata database.
        Take a bs4.BeautifulSoup object generated from querying the country's profile page
           & the name of the indicator.
        Return a dataframe containing the country's indicators data (3 years). 
    """
    
    tables = soup.find_all("table")
    
    if indicator == 'Economic indicators':
        table = tables[2]
    elif indicator == 'Social indicators':
        table = tables[3]
    elif indicator == 'Environment and infrastructure indicators':
        table = tables[4]
    
    rows = table.find_all("tr")
    col_num = len(rows[0].find_all("td"))
        
    year = [get_value(rows[0].find_all("td")[i]) for i in range(col_num)][1:] 

    col_names = ['Index','Unit']+year
    
    index = [[get_value(r.find_all("td")[0]) for r in rows[1:]]] 

    unit = [[get_value(r.find_all("td")[0].small) for r in rows[1:]]]
    
    value = [[(get_value(r.find_all("td")[i].small)) for r in rows[1:]] for i in range(-col_num+1,0)]
    
        
    data = dict(zip(col_names, index+unit+value))
    
    df = pd.DataFrame(data)
    
    return df

In [389]:
def download_file(url):
    local_filename = url.split('/')[-1]
    response = requests.get(url)
    with open('data/'+local_filename, 'wb') as f:
            f.write(response.content)

def read_file(url):
    local_filename = url.split('/')[-1]
    with open('data/'+local_filename, 'rb') as reader:
        dos_content = reader.read()
        
    return dos_content

In [390]:
def get_profile(url):
    
    """ 
        Take a url that is a country's profile page.
        Return a list of two dataframes, 
               which contains the country's General Information and 
               Indicators data (3 years) respectively. 
    """
    
    try:
        local_read_content = read_file(url)
    except:
        download_file(url)
        local_read_content = read_file(url)
    
#     response = requests.get(url)
    
    soup = BeautifulSoup(local_read_content,'lxml')

    section = [i.get_text() for i in soup.find_all("summary")]

    GI_df = general_info(soup)

    Indicators_df = pd.concat([indicator(soup, i) for i in section[1:]], keys=section[1:])
    
    Indicators_df['last'] = Indicators_df[[2010, 2015, 2020]].apply(lambda x: x[2020] if (x[2020]!='...' and x[2020]!='... / ...') else x[2015] if (x[2015]!='...' and x[2015]!='... / ...') else x[2010], axis=1)

    return [GI_df, Indicators_df]

In [391]:
def removeAccents(word):
    repl = {'à': 'a', 'á': 'a', 'â': 'a', 'ã': 'a',
            'é': 'e', 'ê': 'e',
            'í': 'i',
            'ç': 'c',
            'ó': 'o', 'ô': 'o', 'õ': 'o',
            'ú': 'u', 'ü': 'u'}

    new_word = ''.join([repl[c] if c in repl else c for c in word])
    return new_word

In [392]:
def hack_recon_relation(text):
    if text == "Oceania":
        return "Australia and New Zealand"
    else:
        return text

In [396]:
def hack_countries(a):
    replace_dict = {
        'Wallis and Futuna Islands': 'Wallis and Futuna',
        'China, Macao SAR' : 'Macau',
        "Cabo Verde": 'Cape Vede',
        "Dem. People's Rep. Korea": 'North Korea',
        'Republic of Korea': 'South Korea',
        'Viet Nam': 'Vietnam',
        'Russian Federation': 'Russia',
        'United Rep. of Tanzania': 'Tanzania',
        'Republic of Moldova': 'Moldova',
        'Timor-Leste': 'Timorleste',
        'Syrian Arab Republic': 'Syria',
        'Saint Vincent & Grenadines': 'Saint Vincent and The Grenadines',
        'State of Palestine': 'Palestine',
        "Lao People's Dem. Rep.": 'Laos',
        'Guinea-Bissau': 'Guineabissau',
        'Cote d’Ivoire': "Cote D'Ivoire",
        'Brunei Darussalam': 'Brunei',
        'Holy See': 'Vatican City'
    }
    b = []
    for country in a:
        if country in replace_dict.keys():
            b.append(replace_dict[country])
        else:
            b.append(country)
    return b

In [394]:
def hack_countries_map_wiki(country_name):
    replace_dict = {
        'Republic of the Congo': 'Congo',
        'Democratic Republic of the Congo': 'Dem. Rep. of the Congo',
        'Czech Republic': 'Czechia',
        'East Timor': 'Timorleste',
        'Guinea-Bissau': 'Guineabissau',
        'Ivory Coast': "Cote D'Ivoire",
        'Sovereign Military Order of Malta': 'Malta',
        'Federated States of Micronesia': 'Micronesia',
        'Sahrawi Arab Democratic Republic':'Western Sahara',
        'Saint Vincent and the Grenadines': 'Saint Vincent and The Grenadines',
        'São Tomé and Príncipe': 'Sao Tome and Principe',
        'United Kingdom and Crown dependencies etc.':  'United Kingdom',
        'United States': 'United States of America',
        'The Gambia': 'Gambia',
        "Côte d'Ivoire": "Cote D'Ivoire",
        "People's Republic of China": 'China'
    }
    res = replace_dict.get(country_name,country_name)
    return res

In [395]:
lang_url = "https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory"
response = requests.get(lang_url)
soup = BeautifulSoup(response.text, 'lxml')

lang_table = soup.find('table',{'class':'wikitable sortable'})


### Determine if there are bullets within the table cell and pull languages for each ###
### If not, simply add the language to the list, ignoring "names" shorter than 3 characters as they are likely erroneous ###
def lists_of_langs(td,col,typ,langs):
    temp = []
    if str(td).find("<ul>") > 0:
        lis = td.findAll('li')
        for li in lis:
            lang = li.get_text()
            lang = clean_name(lang)
            lang = clean_name(lang)
            temp.append(lang)
        langs.append([country,temp,typ])
    else:
        lang = td.get_text()
        lang = clean_name(lang)
        lang = clean_name(lang)
        if len(lang)>3:
            langs.append([country,[lang],typ])
    
### Remove footnotes, parentheses and trailing spaces from text ###
def clean_name(x):
    if x.endswith("]"):
        x = re.sub(r'\[.*\]', '', x)
    start = [m.start() for m in re.finditer("\(", x)]
    if len(start) > 0:
        x = x[:start[0]]
    if x.endswith("%"):
        x = re.sub(r'\d+\%', '', x)
    return x.strip()

In [360]:
langs = []
trs = lang_table.findAll('tr')
counter = 1
for tr in trs:
    tds = tr.findAll('td')
    col = 0
    for td in tds:
        if col == 0:
            country = td.get_text()
            country = clean_name(country)
            country = clean_name(country)

        elif col == 1:
            typ = "Official"
            lists_of_langs(td,col,typ,langs)
        col += 1
    counter += 1

col_names = ['Index','Language','Type']
langs_df = pd.DataFrame(langs, columns = col_names)
langs_df.index = langs_df['Index']
langs_df.drop(columns=['Index','Type'], inplace=True)
langs_df.index = langs_df.index.map(hack_countries_map_wiki)
langs_df

Unnamed: 0_level_0,Language
Index,Unnamed: 1_level_1
Abkhazia,"[Abkhaz, Russian]"
Afghanistan,"[Pashto, Dari]"
Albania,[Albanian]
Algeria,"[Arabic, Tamazight]"
Andorra,[Catalan]
...,...
Venezuela,"[Spanish, Venezuelan Sign Language]"
Vietnam,[Vietnamese]
Yemen,[Arabic]
Zambia,[English]


In [361]:
neigh_url = "https://en.wikipedia.org/wiki/List_of_countries_and_territories_by_land_borders"
response = requests.get(neigh_url)
soup = BeautifulSoup(response.text, 'lxml')

neigh_table = soup.find('table',{'class':'wikitable sortable'})

neighbor_data = neigh_table.find_next('tbody', recursive=False).findChildren('tr', recursive=False)[2:]

def contains_sovereign_country(row) -> bool:
    country_cell = row.find_next('td')
    indicators_of_partial_sovereignity = country_cell.find_all('i')
    return len(indicators_of_partial_sovereignity) == 0


def _remove_text_in_parentheses(country_name):
    return re.sub(r'\(.*?\)', '', country_name).strip()


def _hack_country_name(country_name):
    """
    For purposes of this list, Aruba, Curaçao, Sint Maarten and the Netherlands
    are considered constituent parts of one sovereign state.
    """
    if country_name == 'Netherlands, Kingdom of the':
        return 'Netherlands'
    else:
        return hack_countries_map_wiki(country_name)
    

def get_country_name(row) -> str:
    country_cell = row.find_next('td')
    first_link = country_cell.find_next('a')
    country_name = first_link.text

    country_name = _remove_text_in_parentheses(country_name)
    country_name = _hack_country_name(country_name)
    return country_name


def get_country_names(rows_of_countries):
    return set([
        get_country_name(row) for row in rows_of_countries
        if contains_sovereign_country(row)
    ])


def _is_sovereign_neighbor(neighbor) -> bool:
    return neighbor in countries


def _get_neighbors(neighbor_container) -> List[str]:
    neighbor_links = neighbor_container.find_all('a')
    neighbors = [neighbor_link.text for neighbor_link in neighbor_links
                 if _is_sovereign_neighbor(neighbor_link.text)]
    return neighbors


def get_neighbors_with_border_length(row):
    neighbor_containers = row.find_all(
        'div', {'class': 'mw-collapsible-content'})
    if neighbor_containers:
        neighbor_container = neighbor_containers[-1]
        neighbor_border_pairs = _get_neighbors(neighbor_container)
        return neighbor_border_pairs
    return []


def get_neighbors_of_countries(rows_of_countries, countries):
    neighbors_of_countries = {}

    for row in rows_of_countries:
        country = get_country_name(row)
        if country in countries:
            neighbors = get_neighbors_with_border_length(row)
            neighbors_of_countries[country] = neighbors

    return neighbors_of_countries


# CLEANUP NEIGHBOR DATA

def consolidate(neighbors_of_countries):
    """
    Fix the the imperfections in the data:
    1. country is not in the neighbors of its neighbor
    """
    for country, neighbors in neighbors_of_countries.items():
        for neighbor in neighbors:

            if country not in neighbors_of_countries[neighbor]:
                neighbors_of_countries[neighbor].append(country)

def save_countries_and_neighbors(neighbors_of_countries):
    neighs = []
    for country in countries:       
        neighs.append([country, neighbors_of_countries[country]])
        
    return neighs


countries = get_country_names(neighbor_data)
neighbors_of_countries = get_neighbors_of_countries(neighbor_data, countries)
consolidate(neighbors_of_countries)
neighs=save_countries_and_neighbors(neighbors_of_countries)

In [362]:
col_names = ['Index','Neighbors']
neighs_df = pd.DataFrame(neighs, columns = col_names)
neighs_df.index = neighs_df['Index']
neighs_df.drop(columns=['Index'], inplace=True)
neighs_df.index = neighs_df.index.map(hack_countries_map_wiki)
neighs_df

Unnamed: 0_level_0,Neighbors
Index,Unnamed: 1_level_1
Chile,"[Argentina, Bolivia, Peru]"
Myanmar,"[Bangladesh, India, Laos, Thailand, China]"
Liberia,"[Guinea, Sierra Leone, Cote D'Ivoire]"
Germany,"[Austria, Belgium, Denmark, France, Luxembourg..."
Bangladesh,"[India, Myanmar]"
...,...
Nauru,[]
Ukraine,"[Belarus, Hungary, Moldova, Poland, Romania, R..."
Tunisia,"[Algeria, Libya]"
Senegal,"[Guinea, Mali, Mauritania, Gambia, Guineabissau]"


In [363]:
lan_nei_countries=neighs_df.merge(langs_df, how='outer', on='Index')
lan_nei_countries

Unnamed: 0_level_0,Neighbors,Language
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
Chile,"[Argentina, Bolivia, Peru]",[Spanish]
Myanmar,"[Bangladesh, India, Laos, Thailand, China]",[Burmese]
Liberia,"[Guinea, Sierra Leone, Cote D'Ivoire]",[English]
Germany,"[Austria, Belgium, Denmark, France, Luxembourg...",[German]
Bangladesh,"[India, Myanmar]",[Bengali]
...,...,...
Western Sahara,,"[Tamazight, Arabic, Spanish]"
Somaliland,,"[Arabic, English, Somali]"
South Ossetia,,"[Ossetian, Russian]"
Taiwan,,[Mandarin]


Where are your data on Taiwan?
← Country Classification
Taiwan, China, is not listed as a separate country for World Development Indicators. For most indicators, Taiwan, China, data is not added to the data for China, but Taiwan, China, is added to the world aggregate and the high-income countries aggregate. There are some exceptions: For some agricultural data series received from the Food and Agriculture Organization, Taiwan, China, figures are part of the numbers cited for China (as are Macao, China, and Hong Kong, China). For data on Taiwan, please refer to the statistics published in the following official publications:



In [364]:
def normalize_names(country_name):
    res = country_name
    res = res.lower()
    res = res.replace(',', '')
    res = res.replace('-', '')
#     res = res.replace('republic of the', '')
#     res = res.replace('democratic', '')
    res = res.replace('federated states of', '')
    res = res.replace('cabo ', 'cape ')
    res = res.strip()
    res = res.title()
    res = res.replace(' And ', ' and ')
    res = res.replace(' Of ', ' of ')
    res = res.split(' (')[0]
    return res

def get_continent_pop_prospect():
    url = 'https://www.countries-ofthe-world.com/continents-of-the-world.html'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                             ' Chrome/87.0.4280.88 Safari/537.36'}

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'lxml')

    pop_prospect = {}

    for row in soup.find_all('table')[1].tbody.find_all('tr'):
        tds = row.find_all('td')
        pop_prospect[tds[1].text] = tds[-1].text
        
        
    pop_prospect ["Americas"]  = pop_prospect['North America'] + "(N)/" + pop_prospect['South America'] + "(S)"
    pop_prospect ["Oceania"] = pop_prospect["Australia"]
    return pop_prospect


def get_country_domains():
    url = 'https://www.countries-ofthe-world.com/TLD-list.html'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                             ' Chrome/87.0.4280.88 Safari/537.36'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'lxml')
    domains = {}
    for table in soup.find_all('table'):
        for row in table.tbody.find_all('tr'):
            tds = row.find_all('td')
#             print(tds)
#             if len(tds) > 1 and not tds[1].has_attr('class'):
            if len(tds) > 1:
                domains[normalize_names(tds[1].text)] = tds[0].text

    return domains

def get_country_timezones():
    url = 'https://www.countries-ofthe-world.com/world-time-zones.html'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/87.0.4280.88 Safari/537.36'}

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'lxml')

    timezones = {}
    for row in soup.tbody.find_all('tr'):
        tds = row.find_all('td')

        if not tds[0].has_attr('class'):
            timezones[normalize_names(tds[0].text)] = tds[1].text
        elif tds[0]['class'][0] == 'mlt':
            url2 = 'https://www.countries-ofthe-world.com/'+tds[0].a['href']
            response2 = requests.get(url2, headers=headers)
            soup2 = BeautifulSoup(response2.content, 'lxml')
            nested_timezones = set()
            for i in soup2.table.tbody.find_all('tr'):
                if len(i.find_all('td')) >= 2:
                    nested_timezones.add(i.find_all('td')[-2].text)
            timezones[tds[0].span.text] = list(nested_timezones)
        elif row.has_attr('class') and row['class'][0] in ['green', 'yellow]':
            if tds[0].text[:-3].split(',')[0] in timezones:
                if type(timezones[tds[0].text[:-3].split(',')[0]]) == str:
                    first_item = timezones[tds[0].text[:-3].split(',')[0]]
                    timezones[tds[0].text[:-3].split(',')[0]] = {first_item}
                timezones[tds[0].text[:-3].split(',')[0]].add(tds[1].text)
            else:
                timezones[tds[0].text[:-3].split(',')[0]] = {tds[1].text}
                
        elif tds[0].has_attr('class') and tds[0]['class'][0] == 'd_state':
            timezones[normalize_names(tds[0].text)] = tds[1].text
            
    for item in timezones.keys():
        if type(timezones[item]) == set:
            timezones[item] = list(timezones[item])
    return timezones

def get_country_codes():
    url = 'https://www.countries-ofthe-world.com/list-of-country-calling-codes.html'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/87.0.4280.88 Safari/537.36'}

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'lxml')

    calling_codes = {}
    for row in soup.tbody.find_all('tr'):
#         if not row.find_all('td')[1].has_attr('class'):
        if len(row.find_all('td')[1].text)>1:
            country_name = normalize_names(row.find_all('td')[1].text)
            code = row.find_all('td')[0].text
            calling_codes[country_name] = code
    del calling_codes['Australian External Territories']
    return calling_codes

pop_prospect_data = get_continent_pop_prospect()
code_data = get_country_codes()
timezones_data = get_country_timezones()
domains_data = get_country_domains()

In [365]:
timezones_data['Antigua and Barbuda']

'−04:00'

In [366]:
timezones_data

{'Afghanistan': '+04:30',
 'Aland Islands': '+02:00',
 'Albania': '+01:00',
 'Algeria': '+01:00',
 'American Samoa': '−11:00',
 'Andorra': '+01:00',
 'Angola': '+01:00',
 'Anguilla': '−04:00',
 'Antigua and Barbuda': '−04:00',
 'Argentina': '−03:00',
 'Armenia': '+04:00',
 'Aruba': '−04:00',
 'Ascension Island': '±00:00',
 'Australia': ['+10:30', '+09:30', '+08:00', '+10:00', '+11:00'],
 'Austria': '+01:00',
 'Azerbaijan': '+04:00',
 'Bahamas': '−05:00',
 'Bahrain': '+03:00',
 'Bangladesh': '+06:00',
 'Barbados': '−04:00',
 'Belarus': '+03:00',
 'Belgium': '+01:00',
 'Belize': '−06:00',
 'Benin': '+01:00',
 'Bermuda': '−04:00',
 'Bhutan': '+06:00',
 'Bolivia': '−04:00',
 'Bonaire': '−04:00',
 'Bosnia and Herzegovina': '+01:00',
 'Botswana': '+02:00',
 'Brazil': ['−05:00', '−02:00', '−04:00', '−03:00'],
 'British Indian Ocean Territory': '+06:00',
 'British Virgin Islands': '−04:00',
 'Brunei': '+08:00',
 'Bulgaria': '+02:00',
 'Burkina Faso': '±00:00',
 'Burundi': '+02:00',
 'Cape Verd

In [367]:
domains_data

{'Ascension Island': '.ac',
 'Andorra': '.ad',
 'United Arab Emirates': '.ae',
 'Afghanistan': '.af',
 'Antigua and Barbuda': '.ag',
 'Anguilla': '.ai',
 'Albania': '.al',
 'Armenia': '.am',
 'Angola': '.ao',
 'Argentina': '.ar',
 'American Samoa': '.as',
 'Austria': '.at',
 'Australia': '.au',
 'Aruba': '.aw',
 'Aland Islands': '.ax',
 'Azerbaijan': '.az',
 'Bosnia and Herzegovina': '.ba',
 'Barbados': '.bb',
 'Bangladesh': '.bd',
 'Belgium': '.be',
 'Burkina Faso': '.bf',
 'Bulgaria': '.bg',
 'Bahrain': '.bh',
 'Burundi': '.bi',
 'Benin': '.bj',
 'Bermuda': '.bm',
 'Brunei': '.bn',
 'Bolivia': '.bo',
 'Brazil': '.br',
 'Bahamas': '.bs',
 'Bhutan': '.bt',
 'Bouvet Island': '.bv',
 'Botswana': '.bw',
 'Belarus': '.by',
 'Belize': '.bz',
 'Canada': '.ca',
 'Cocos': '.cc',
 'Congo': '.cg',
 'Central African Republic': '.cf',
 'Switzerland': '.ch',
 "Cote D'Ivoire": '.ci',
 'Cook Islands': '.ck',
 'Chile': '.cl',
 'Cameroon': '.cm',
 'China': '.cn',
 'Colombia': '.co',
 'Costa Rica': '.cr

In [368]:
if not os.path.exists('data'):
    os.makedirs('data')

In [369]:
index_url = "http://data.un.org/en/index.html"
response = requests.get(index_url)
soup = BeautifulSoup(response.content,'lxml')

In [370]:
urls = ["http://data.un.org/en/"+u['href'] for u in soup.section.find_all('a', href=True)]
countries_list = [c.previousSibling for c in soup.section.find_all("br")]
countries_list = [removeAccents(str(i).split(' (')[0]) for i in countries_list]
countries_list = hack_countries(countries_list)
print(len(urls) == len(countries_list))

True


In [371]:
total = dict()
for i in range(len(urls)):
    profile = get_profile(urls[i])
    total[countries_list[i]] = profile
    print("Finished:" + countries_list[i] + "(" + str(i+1) + "/"+ str(len(urls))+")")

Finished:Afghanistan(1/232)
Finished:Albania(2/232)
Finished:Algeria(3/232)
Finished:American Samoa(4/232)
Finished:Andorra(5/232)
Finished:Angola(6/232)
Finished:Anguilla(7/232)
Finished:Antigua and Barbuda(8/232)
Finished:Argentina(9/232)
Finished:Armenia(10/232)
Finished:Aruba(11/232)
Finished:Australia(12/232)
Finished:Austria(13/232)
Finished:Azerbaijan(14/232)
Finished:Bahamas(15/232)
Finished:Bahrain(16/232)
Finished:Bangladesh(17/232)
Finished:Barbados(18/232)
Finished:Belarus(19/232)
Finished:Belgium(20/232)
Finished:Belize(21/232)
Finished:Benin(22/232)
Finished:Bermuda(23/232)
Finished:Bhutan(24/232)
Finished:Bolivia(25/232)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Finished:Bonaire, St. Eustatius & Saba(26/232)
Finished:Bosnia and Herzegovina(27/232)
Finished:Botswana(28/232)
Finished:Brazil(29/232)
Finished:British Virgin Islands(30/232)
Finished:Brunei(31/232)
Finished:Bulgaria(32/232)
Finished:Burkina Faso(33/232)
Finished:Burundi(34/232)
Finished:Cabo Verde(35/232)
Finished:Cambodia(36/232)
Finished:Cameroon(37/232)
Finished:Canada(38/232)
Finished:Cayman Islands(39/232)
Finished:Central African Republic(40/232)
Finished:Chad(41/232)
Finished:Channel Islands(42/232)
Finished:Chile(43/232)
Finished:China(44/232)
Finished:China, Hong Kong SAR(45/232)
Finished:China, Macao SAR(46/232)
Finished:Colombia(47/232)
Finished:Comoros(48/232)
Finished:Congo(49/232)
Finished:Cook Islands(50/232)
Finished:Costa Rica(51/232)
Finished:Cote D'Ivoire(52/232)
Finished:Croatia(53/232)
Finished:Cuba(54/232)
Finished:Curacao(55/232)
Finished:Cyprus(56/232)
Finished:Czechia(57/232)
Finished:North Korea(58/232)
Finished:Dem. Rep. of the Congo(59/232)
Finished:Denma

In [372]:
for c in countries_list:
    total[c][1] = total[c][1][['Index', 'Unit', 'last']] # get 2020 indicators
    total[c][0].columns = ['Index', c]
    total[c][1].columns = ['Index', 'Unit', c]

In [373]:
gi = [total[c][0] for c in countries_list]
GI = gi[0]
for i in range(len(gi))[1:]:
    GI = pd.merge(GI, gi[i], how="outer", on="Index")

In [374]:
GIt = GI.transpose()[1:]
GIt.columns = GI.Index

In [375]:
ind = [total[c][1] for c in countries_list]
Ind = ind[0]
for i in range(len(ind))[1:]:
    Ind = pd.merge(Ind, ind[i], how="outer", on= ["Index", "Unit"])

In [376]:
Indt = Ind.transpose()[2:]
Indt.columns = Ind.Index + Ind.Unit
countries_df = pd.concat([GIt, Indt], axis=1)
countries_df['Index'] = countries_df.index 
countries_df

Unnamed: 0,Region,"Population(000, 2020)","Pop. density(per km2, 2020)",Capital city,"Capital city pop.(000, 2020)",UN membership date,Surface area(km2),Sex ratio(m per 100 f),National currency,Exchange rate(per US$),...,Energy supply per capita(Gigajoules),Important sites for terrestrial biodiversity protected(%),Net Official Development Assist. received(% of GNI),Research & Development expenditure(% of GDP),CO2 emission estimates(million tons/tons per capita),Tourist/visitor arrivals at national borders(000),Pop. using safely managed sanitation(urban/rural %),"Pop. using safely managed drinking water(urban/rural, %)",Net Official Development Assist. disbursed(% of GNI),Index
Afghanistan,Southern Asia,38928,59.6,Kabul,4114,19-Nov-46,652864,105.4,Afghani (AFN),78.4,...,3,5.7,19.46,,,,,,,Afghanistan
Albania,Southern Europe,2878,105,Tirana,484.6,14-Dec-55,28748,103.7,Lek (ALL),108.6,...,34,57.2,2.29,0.2,4.3/1.5,5340,40.2/39.4,,,Albania
Algeria,Northern Africa,43851,18.4,Algiers,2729.3,08-Oct-62,2381741,102.1,Algerian Dinar (DZD),119.2,...,55,16.6,0.08,0.5,130.5/3.2,2657,16.5/20.8,,,Algeria
American Samoa,Polynesia,56,279,Pago Pago,48.5,,199,103.6,US Dollar (USD),,...,,71.1,,0.4,,20,,,,American Samoa
Andorra,Southern Europe,77,164.2,Andorra la Vella,22.6,28-Jul-93,468,102.3,Euro (EUR),0.9,...,117,26.1,,,,3042,100.0/100.0,,,Andorra
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wallis and Futuna Islands,Polynesia,12,82.5,Matu-Utu,1,,142,93.4,CFP Franc (XPF),106.2,...,30,,,,,,,,,Wallis and Futuna Islands
Western Sahara,Northern Africa,597,2.2,El Aai?n,232.4,,266000,109.5,Moroccan Dirham (MAD),9.6,...,,,,,,,,,,Western Sahara
Yemen,Western Asia,29826,56.5,Sana'a,2874.4,30-Sep-47,527968,101.5,Yemeni Rial (YER),,...,5,19.4,29.67,,8.9/0.3,367,67.0/...,,,Yemen
Zambia,Eastern Africa,18384,24.7,Lusaka,2646.6,01-Dec-64,752612,98.1,Zambian Kwacha (ZMW),14.1,...,29,45.5,3.84,0.3,6.0/0.4,1072,,46.2/...,,Zambia


In [377]:
countries_df_final=countries_df.merge(lan_nei_countries, how='outer', on='Index')
countries_df_final.index = countries_df_final['Index']
countries_df_final

Unnamed: 0_level_0,Region,"Population(000, 2020)","Pop. density(per km2, 2020)",Capital city,"Capital city pop.(000, 2020)",UN membership date,Surface area(km2),Sex ratio(m per 100 f),National currency,Exchange rate(per US$),...,Net Official Development Assist. received(% of GNI),Research & Development expenditure(% of GDP),CO2 emission estimates(million tons/tons per capita),Tourist/visitor arrivals at national borders(000),Pop. using safely managed sanitation(urban/rural %),"Pop. using safely managed drinking water(urban/rural, %)",Net Official Development Assist. disbursed(% of GNI),Index,Neighbors,Language
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,Southern Asia,38928,59.6,Kabul,4114,19-Nov-46,652864,105.4,Afghani (AFN),78.4,...,19.46,,,,,,,Afghanistan,"[Iran, Pakistan, Tajikistan, Turkmenistan, Uzb...","[Pashto, Dari]"
Albania,Southern Europe,2878,105,Tirana,484.6,14-Dec-55,28748,103.7,Lek (ALL),108.6,...,2.29,0.2,4.3/1.5,5340,40.2/39.4,,,Albania,"[Greece, North Macedonia, Montenegro]",[Albanian]
Algeria,Northern Africa,43851,18.4,Algiers,2729.3,08-Oct-62,2381741,102.1,Algerian Dinar (DZD),119.2,...,0.08,0.5,130.5/3.2,2657,16.5/20.8,,,Algeria,"[Libya, Mali, Mauritania, Morocco, Niger, Tuni...","[Arabic, Tamazight]"
American Samoa,Polynesia,56,279,Pago Pago,48.5,,199,103.6,US Dollar (USD),,...,,0.4,,20,,,,American Samoa,,
Andorra,Southern Europe,77,164.2,Andorra la Vella,22.6,28-Jul-93,468,102.3,Euro (EUR),0.9,...,,,,3042,100.0/100.0,,,Andorra,"[France, Spain]",[Catalan]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Northern Cyprus,,,,,,,,,,,...,,,,,,,,Northern Cyprus,,[Turkish]
Somaliland,,,,,,,,,,,...,,,,,,,,Somaliland,,"[Arabic, English, Somali]"
South Ossetia,,,,,,,,,,,...,,,,,,,,South Ossetia,,"[Ossetian, Russian]"
Taiwan,,,,,,,,,,,...,,,,,,,,Taiwan,,[Mandarin]


In [378]:
countries_df_final['Currency+ISO-4217'] = countries_df_final['National currency'].apply(lambda x: str(x)[-4:-1] if str(x)!=str(np.nan) else np.nan) 

In [379]:
countries_df_final['Internet Domain'] = countries_df_final['Index'].apply(lambda x: domains_data.get(str(x), np.nan))

In [380]:
countries_df_final['UTC time offset'] = countries_df_final['Index'].apply(lambda x: timezones_data.get(str(x), np.nan))

In [381]:
countries_df_final['Code'] = countries_df_final['Index'].apply(lambda x: code_data.get(str(x), np.nan))

In [382]:
# countries_df_final.to_csv('countries_df_final.csv', encoding="utf-16")
countries_df_final.to_csv('countries_df_final.csv')

In [281]:
temp = countries_df [['Index', 'Region']]
recon_relation = temp.groupby('Region')['Index'].apply(list).reset_index()
recon_relation.columns = ['Index', 'Countries']
recon_relation.index = recon_relation ['Index']
recon_relation
recon_relation.index = recon_relation.index.map(hack_recon_relation)
recon_relation = recon_relation.drop(columns=['Index'])
recon_relation

Unnamed: 0_level_0,Countries
Index,Unnamed: 1_level_1
Caribbean,"[Anguilla, Antigua and Barbuda, Aruba, Bahamas..."
Central America,"[Belize, Costa Rica, El Salvador, Guatemala, H..."
Central Asia,"[Kazakhstan, Kyrgyzstan, Tajikistan, Turkmenis..."
Eastern Africa,"[Burundi, Comoros, Djibouti, Eritrea, Ethiopia..."
Eastern Asia,"[China, China, Hong Kong SAR, China, Macao SAR..."
Eastern Europe,"[Belarus, Bulgaria, Czechia, Hungary, Poland, ..."
Melanesia,"[Fiji, New Caledonia, Papua New Guinea, Solomo..."
Micronesia,"[Guam, Kiribati, Marshall Islands, Micronesia,..."
Middle Africa,"[Angola, Cameroon, Central African Republic, C..."
Northern Africa,"[Algeria, Egypt, Libya, Morocco, Sudan, Tunisi..."


# Regions

In [143]:
index_url = "http://data.un.org/en/regions.html"
response = requests.get(index_url)
soup = BeautifulSoup(response.content,'lxml')

In [144]:
urls = ["http://data.un.org/en/"+u['href'] for u in soup.section.find_all('a', href=True)]
regions_list = [str(i).replace('<td>','').replace('</td>','') for i in [soup.find_all("td")[i*5] for i in range(1,31)] ]
print(len(urls) == len(regions_list))

True


In [145]:
total = dict()
for i in range(len(urls)):
    profile = get_profile(urls[i])
    total[regions_list[i]] = profile
    print("Finished:" + regions_list[i] + "(" + str(i+1) + "/"+ str(len(urls))+")")

Finished:World(1/30)
Finished:Africa(2/30)
Finished:Northern Africa(3/30)
Finished:Sub-Saharan Africa(4/30)
Finished:Eastern Africa(5/30)
Finished:Middle Africa(6/30)
Finished:Southern Africa(7/30)
Finished:Western Africa(8/30)
Finished:Americas(9/30)
Finished:Northern America(10/30)
Finished:Latin America and the Caribbean(11/30)
Finished:Caribbean(12/30)
Finished:Central America(13/30)
Finished:South America(14/30)
Finished:Asia(15/30)
Finished:Central Asia(16/30)
Finished:Eastern Asia(17/30)
Finished:South-eastern Asia(18/30)
Finished:Southern Asia(19/30)
Finished:Western Asia(20/30)
Finished:Europe(21/30)
Finished:Eastern Europe(22/30)
Finished:Northern Europe(23/30)
Finished:Southern Europe(24/30)
Finished:Western Europe(25/30)
Finished:Oceania(26/30)
Finished:Australia and New Zealand(27/30)
Finished:Melanesia(28/30)
Finished:Micronesia(29/30)
Finished:Polynesia(30/30)


In [146]:
for c in regions_list:
    total[c][1] = total[c][1][['Index', 'Unit', 'last']] # get 2020 indicators
    total[c][0].columns = ['Index', c]
    total[c][1].columns = ['Index', 'Unit', c]

In [147]:
gi = [total[c][0] for c in regions_list]
GI = gi[0]
for i in range(len(gi))[1:]:
    GI = pd.merge(GI, gi[i], how="outer", on="Index")

In [148]:
GIt = GI.transpose()[1:]
GIt.columns = GI.Index

In [149]:
ind = [total[c][1] for c in regions_list]
Ind = ind[0]
for i in range(len(ind))[1:]:
    Ind = pd.merge(Ind, ind[i], how="outer", on= ["Index", "Unit"])

In [150]:
Indt = Ind.transpose()[2:]
Indt.columns = Ind.Index + Ind.Unit

In [151]:
dfm = pd.concat([GIt, Indt], axis=1)

In [152]:
temp=dfm[ dfm['Region'] != 'World']
regions_df =temp[temp.index!='World']
regions_df['Index'] = regions_df.index
regions_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,"Population(000, 2020)","Pop. density(per km2, 2020)",Surface area(km2),Sex ratio(m per 100 f),Region,GDP: Gross domestic product(million current US$),"GDP growth rate(annual %, const. 2015 prices)",GDP per capita(current US$),Employment in agriculture(% of employed),Employment in industry(% of employed),...,Individuals using the Internet(per 100 inhabitants),Research & Development expenditure(% of GDP),Forested area(% of land area),"Energy production, primary(Petajoules)",Energy supply per capita(Gigajoules),Important sites for terrestrial biodiversity protected(%),"Pop. using safely managed drinking water(urban/rural, %)",Pop. using safely managed sanitation(urban/rural %),Net Official Development Assist. received(% of GNI),Index
Northern Africa,246232,31.7,7880000,101.0,Africa,666216.0,3.6,2814.3,24.8,25.6,...,44.1,0.6,3.4,,,34.9,,39.3/...,0.85,Northern Africa
Sub-Saharan Africa,1094366,50.0,22431000,99.6,Africa,1699027.0,3.0,1637.6,52.6,11.2,...,26.0,0.4,28.0,,,41.0,50.0/11.6,19.8/17.5,2.9,Sub-Saharan Africa
Eastern Africa,445406,66.8,7005000,98.5,Sub-Saharan Africa,393669.0,6.0,934.1,63.7,8.7,...,17.6,,32.6,,,,53.2/8.9,,,Eastern Africa
Middle Africa,179595,27.6,6613000,99.6,Sub-Saharan Africa,250276.0,1.0,1479.9,,,...,13.9,,46.7,,,,,,,Middle Africa
Southern Africa,67504,25.5,2675000,96.9,Sub-Saharan Africa,408569.0,0.9,6215.0,6.6,23.0,...,54.9,,10.4,,,,81.9/...,,,Southern Africa
Western Africa,401861,66.3,6138000,101.4,Sub-Saharan Africa,646513.0,3.2,1696.0,39.9,13.9,...,35.8,,11.1,,,,33.6/15.4,21.9/17.0,,Western Africa
Northern America,368870,19.8,21776000,98.0,Americas,22302188.0,2.8,61220.9,1.3,19.7,...,89.2,2.7,35.5,,,41.4,99.6/...,90.2/...,,Northern America
Latin America and the Caribbean,653962,32.5,20546000,96.8,Americas,5565516.0,0.5,8682.9,13.7,20.3,...,66.3,0.7,46.5,,,38.0,82.3/41.7,37.0/...,,Latin America and the Caribbean
Caribbean,43532,192.6,234000,97.5,Latin America & Caribbean,371468.0,1.0,8800.3,16.0,15.3,...,58.7,,32.2,,,,,,,Caribbean
Central America,179670,73.3,2480000,96.1,Latin America & Caribbean,1492059.0,2.1,8503.1,16.0,24.2,...,,,35.2,,,,.../41.6,44.0/...,,Central America


In [153]:
regions_df_final=regions_df.merge(recon_relation, how='outer', on='Index')
regions_df_final.index = regions_df_final['Index']
regions_df_final

Unnamed: 0_level_0,"Population(000, 2020)","Pop. density(per km2, 2020)",Surface area(km2),Sex ratio(m per 100 f),Region,GDP: Gross domestic product(million current US$),"GDP growth rate(annual %, const. 2015 prices)",GDP per capita(current US$),Employment in agriculture(% of employed),Employment in industry(% of employed),...,Research & Development expenditure(% of GDP),Forested area(% of land area),"Energy production, primary(Petajoules)",Energy supply per capita(Gigajoules),Important sites for terrestrial biodiversity protected(%),"Pop. using safely managed drinking water(urban/rural, %)",Pop. using safely managed sanitation(urban/rural %),Net Official Development Assist. received(% of GNI),Index,Countries
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Northern Africa,246232,31.7,7880000,101.0,Africa,666216.0,3.6,2814.3,24.8,25.6,...,0.6,3.4,,,34.9,,39.3/...,0.85,Northern Africa,"[Algeria, Egypt, Libya, Morocco, Sudan, Tunisi..."
Sub-Saharan Africa,1094366,50.0,22431000,99.6,Africa,1699027.0,3.0,1637.6,52.6,11.2,...,0.4,28.0,,,41.0,50.0/11.6,19.8/17.5,2.9,Sub-Saharan Africa,
Eastern Africa,445406,66.8,7005000,98.5,Sub-Saharan Africa,393669.0,6.0,934.1,63.7,8.7,...,,32.6,,,,53.2/8.9,,,Eastern Africa,"[Burundi, Comoros, Djibouti, Eritrea, Ethiopia..."
Middle Africa,179595,27.6,6613000,99.6,Sub-Saharan Africa,250276.0,1.0,1479.9,,,...,,46.7,,,,,,,Middle Africa,"[Angola, Cameroon, Central African Republic, C..."
Southern Africa,67504,25.5,2675000,96.9,Sub-Saharan Africa,408569.0,0.9,6215.0,6.6,23.0,...,,10.4,,,,81.9/...,,,Southern Africa,"[Botswana, Eswatini, Lesotho, Namibia, South A..."
Western Africa,401861,66.3,6138000,101.4,Sub-Saharan Africa,646513.0,3.2,1696.0,39.9,13.9,...,,11.1,,,,33.6/15.4,21.9/17.0,,Western Africa,"[Benin, Burkina Faso, Cabo Verde, Cote D'Ivoir..."
Northern America,368870,19.8,21776000,98.0,Americas,22302188.0,2.8,61220.9,1.3,19.7,...,2.7,35.5,,,41.4,99.6/...,90.2/...,,Northern America,"[Bermuda, Canada, Greenland, Saint Pierre and ..."
Latin America and the Caribbean,653962,32.5,20546000,96.8,Americas,5565516.0,0.5,8682.9,13.7,20.3,...,0.7,46.5,,,38.0,82.3/41.7,37.0/...,,Latin America and the Caribbean,
Caribbean,43532,192.6,234000,97.5,Latin America & Caribbean,371468.0,1.0,8800.3,16.0,15.3,...,,32.2,,,,,,,Caribbean,"[Anguilla, Antigua and Barbuda, Aruba, Bahamas..."
Central America,179670,73.3,2480000,96.1,Latin America & Caribbean,1492059.0,2.1,8503.1,16.0,24.2,...,,35.2,,,,.../41.6,44.0/...,,Central America,"[Belize, Costa Rica, El Salvador, Guatemala, H..."


In [154]:
regions_df_final.to_csv('regions_df_final.csv')

In [155]:
regions_df['Index'] = regions_df.index 
temp = regions_df [['Index', 'Region']]
recontin_realtion = temp.groupby('Region')['Index'].apply(list).reset_index()
recontin_realtion.columns = ['Index', 'Regions']
# recontin_realtion.index = recontin_realtion ['Index']
# recon_relation.drop(columns=['Index'], inplace=True)
recontin_realtion

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Index,Regions
0,Africa,"[Northern Africa, Sub-Saharan Africa]"
1,Americas,"[Northern America, Latin America and the Carib..."
2,Asia,"[Central Asia, Eastern Asia, South-eastern Asi..."
3,Europe,"[Eastern Europe, Northern Europe, Southern Eur..."
4,Latin America & Caribbean,"[Caribbean, Central America, South America]"
5,Oceania,"[Australia and New Zealand, Melanesia, Microne..."
6,Sub-Saharan Africa,"[Eastern Africa, Middle Africa, Southern Afric..."


# continents

In [156]:
continents_df =dfm[ dfm['Region'] == 'World']
continents_df['Index']= continents_df.index
continents_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,"Population(000, 2020)","Pop. density(per km2, 2020)",Surface area(km2),Sex ratio(m per 100 f),Region,GDP: Gross domestic product(million current US$),"GDP growth rate(annual %, const. 2015 prices)",GDP per capita(current US$),Employment in agriculture(% of employed),Employment in industry(% of employed),...,Individuals using the Internet(per 100 inhabitants),Research & Development expenditure(% of GDP),Forested area(% of land area),"Energy production, primary(Petajoules)",Energy supply per capita(Gigajoules),Important sites for terrestrial biodiversity protected(%),"Pop. using safely managed drinking water(urban/rural, %)",Pop. using safely managed sanitation(urban/rural %),Net Official Development Assist. received(% of GNI),Index
Africa,1340598,45.2,30311000,99.9,World,2365243.0,3.2,1856.2,48.7,13.3,...,,,21.0,45629.0,26.0,,,,2.44,Africa
Americas,1022832,24.2,42322000,97.2,World,27867704.0,2.3,27721.7,9.0,20.0,...,,,41.2,,,,,,0.23,Americas
Asia,4641055,149.6,31915000,104.7,World,31839900.0,4.5,6982.1,,,...,47.3,,19.1,268302.0,62.0,,85.8/59.8,42.7/46.0,0.24,Asia
Europe,747636,33.8,23049000,93.4,World,21908700.0,2.0,29278.7,,,...,82.0,1.9,45.9,104101.0,144.0,65.6,,81.1/53.3,0.61,Europe
Oceania,42678,5.0,8564000,100.2,World,1711770.0,2.0,41469.2,,,...,68.2,,20.4,17969.0,163.0,33.7,96.1/...,66.7/23.5,6.98,Oceania


In [157]:
continents_df_final=continents_df.merge(recontin_realtion, how='inner', on='Index')
continents_df_final.index = continents_df_final['Index']
continents_df_final

Unnamed: 0_level_0,"Population(000, 2020)","Pop. density(per km2, 2020)",Surface area(km2),Sex ratio(m per 100 f),Region,GDP: Gross domestic product(million current US$),"GDP growth rate(annual %, const. 2015 prices)",GDP per capita(current US$),Employment in agriculture(% of employed),Employment in industry(% of employed),...,Research & Development expenditure(% of GDP),Forested area(% of land area),"Energy production, primary(Petajoules)",Energy supply per capita(Gigajoules),Important sites for terrestrial biodiversity protected(%),"Pop. using safely managed drinking water(urban/rural, %)",Pop. using safely managed sanitation(urban/rural %),Net Official Development Assist. received(% of GNI),Index,Regions
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Africa,1340598,45.2,30311000,99.9,World,2365243.0,3.2,1856.2,48.7,13.3,...,,21.0,45629.0,26.0,,,,2.44,Africa,"[Northern Africa, Sub-Saharan Africa]"
Americas,1022832,24.2,42322000,97.2,World,27867704.0,2.3,27721.7,9.0,20.0,...,,41.2,,,,,,0.23,Americas,"[Northern America, Latin America and the Carib..."
Asia,4641055,149.6,31915000,104.7,World,31839900.0,4.5,6982.1,,,...,,19.1,268302.0,62.0,,85.8/59.8,42.7/46.0,0.24,Asia,"[Central Asia, Eastern Asia, South-eastern Asi..."
Europe,747636,33.8,23049000,93.4,World,21908700.0,2.0,29278.7,,,...,1.9,45.9,104101.0,144.0,65.6,,81.1/53.3,0.61,Europe,"[Eastern Europe, Northern Europe, Southern Eur..."
Oceania,42678,5.0,8564000,100.2,World,1711770.0,2.0,41469.2,,,...,,20.4,17969.0,163.0,33.7,96.1/...,66.7/23.5,6.98,Oceania,"[Australia and New Zealand, Melanesia, Microne..."


In [158]:
continents_df_final['Population prospect by 2050'] = continents_df_final['Index'].apply(lambda x: pop_prospect_data.get(str(x), 'Not defined'))

In [159]:
continents_df_final

Unnamed: 0_level_0,"Population(000, 2020)","Pop. density(per km2, 2020)",Surface area(km2),Sex ratio(m per 100 f),Region,GDP: Gross domestic product(million current US$),"GDP growth rate(annual %, const. 2015 prices)",GDP per capita(current US$),Employment in agriculture(% of employed),Employment in industry(% of employed),...,Forested area(% of land area),"Energy production, primary(Petajoules)",Energy supply per capita(Gigajoules),Important sites for terrestrial biodiversity protected(%),"Pop. using safely managed drinking water(urban/rural, %)",Pop. using safely managed sanitation(urban/rural %),Net Official Development Assist. received(% of GNI),Index,Regions,Population prospect by 2050
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Africa,1340598,45.2,30311000,99.9,World,2365243.0,3.2,1856.2,48.7,13.3,...,21.0,45629.0,26.0,,,,2.44,Africa,"[Northern Africa, Sub-Saharan Africa]",1850475700
Americas,1022832,24.2,42322000,97.2,World,27867704.0,2.3,27721.7,9.0,20.0,...,41.2,,,,,,0.23,Americas,"[Northern America, Latin America and the Carib...","662,334,900(N)/509,254,300(S)"
Asia,4641055,149.6,31915000,104.7,World,31839900.0,4.5,6982.1,,,...,19.1,268302.0,62.0,,85.8/59.8,42.7/46.0,0.24,Asia,"[Central Asia, Eastern Asia, South-eastern Asi...",5351916300
Europe,747636,33.8,23049000,93.4,World,21908700.0,2.0,29278.7,,,...,45.9,104101.0,144.0,65.6,,81.1/53.3,0.61,Europe,"[Eastern Europe, Northern Europe, Southern Eur...",700676200
Oceania,42678,5.0,8564000,100.2,World,1711770.0,2.0,41469.2,,,...,20.4,17969.0,163.0,33.7,96.1/...,66.7/23.5,6.98,Oceania,"[Australia and New Zealand, Melanesia, Microne...",49423700


In [160]:
continents_df_final.to_csv('continents_df_final.csv')