In this notebook I scrape a list of countries, together with the continent each one belongs to, from [worldatlas.com](https://www.worldatlas.com/cntycont.htm).

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests, json, re

In [2]:
# Page that lists the countries grouped under one heading per continent.
url = 'https://www.worldatlas.com/cntycont.htm'

# Bound the wait and fail loudly on an HTTP error status, so that an error page
# is never silently parsed as if it were the country list.
response = requests.get(url, timeout=30)
response.raise_for_status()
page = response.text

soup = BeautifulSoup(page, 'html.parser')

In [3]:
# Header text looks like "AFRICA (54)" or "N. AMERICA (23)", sometimes wrapped
# in newlines. The pattern keeps only the upper-case name, including a
# one-letter prefix such as "N. " when present.
pattern = r'[A-Z]\. [A-Z]+|[A-Z]+'


def clean(header):
    """Return the continent name found in a header tag's text.

    Takes the first match of ``pattern`` in ``header.text``; raises
    ``IndexError`` if the text contains no match.
    """
    matches = re.findall(pattern, header.text)
    return matches[0]


# Continents are enclosed in <h2> tags
headers = soup.find_all('h2')
# Only the <h2> tags at positions 1 through 6 are the six continent headings.
continent_headers = headers[1:7]
continent_headers

[<h2>AFRICA (54)</h2>,
 <h2>ASIA (44)
 </h2>,
 <h2>
 EUROPE (47)
 </h2>,
 <h2>
 N. AMERICA (23)
 </h2>,
 <h2>
 OCEANIA (14)</h2>,
 <h2>
 S. AMERICA (12)</h2>]

In [4]:
# Spot check: the element right after the first continent header (Africa)
# holds its countries as <li> items; peek at the second one.
africa_list = headers[1].find_next_sibling()
africa_list.find_all('li')[1].text

'Angola'

In [5]:
continent_countries = {}

# countries are in <li> tags within a <ul> just after the header with the continent name.
for header in continent_headers:
    continent = clean(header)
    country_tags = header.find_next_sibling().find_all('li')
    # Strip surrounding whitespace while extracting: some items carry a trailing
    # newline (e.g. 'Zimbabwe\n'), which would otherwise break exact-name
    # comparisons against other datasets.
    countries = [tag.text.strip() for tag in country_tags]
    continent_countries[continent] = countries

In [12]:
# Flatten {continent: [countries]} into two aligned lists, one entry per country.
countries = [
    country
    for country_list in continent_countries.values()
    for country in country_list
]
continents = [
    continent
    for continent, country_list in continent_countries.items()
    for _ in country_list
]

df = pd.DataFrame({'country': countries, 'continent': continents})

# rename the continents.
continent_rename = {'AFRICA': 'Africa', 'ASIA': 'Asia', 'EUROPE': 'Europe', 'N. AMERICA': 'North America',
                   'OCEANIA': 'Oceania', 'S. AMERICA': 'South America'}
# Apply the mapping to the continent column only, so a country name can never
# be rewritten by it, and assign the result instead of mutating in place.
df['continent'] = df['continent'].replace(continent_rename)

df.head()

Unnamed: 0,country,continent
0,Algeria,Africa
1,Angola,Africa
2,Benin,Africa
3,Botswana,Africa
4,Burkina,Africa


In [18]:
# Some scraped names carry stray whitespace (e.g. 'Zimbabwe\n'). Strip every
# country name instead of patching one known value, and touch only the
# country column.
df['country'] = df['country'].str.strip()

In [22]:
# Load the health-spending dataset and collect the distinct country names from
# it and from the scraped table, so the two sets of names can be compared.
full_df = pd.read_csv('health_spending_dataset.csv')

atlas_countries = set(df['country'])
bank_countries = set(full_df['country'])

In [24]:
# How many country names appear in the health-spending dataset but not in the scraped list.
len(bank_countries.difference(atlas_countries))

50

In [25]:
# How many country names appear in the scraped list but not in the health-spending dataset.
len(atlas_countries.difference(bank_countries))

27