In [24]:
import os
import pickle
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

## Set up Soup

In [25]:
# Path to the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to override it on another machine; the default is unchanged.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver

In [26]:
base_url = 'https://ourworldindata.org/countries'

# Load the country index in Chrome and parse the page source as the browser
# reports it after loading.
driver = webdriver.Chrome(chromedriver)
try:
    driver.get(base_url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Close the browser even if loading or parsing fails, so no Chrome
    # process is left running.
    driver.quit()

In [27]:
# Each country is one <li> inside the page's <main> element.
main = soup.find('main')
rows = list(main.find_all('li'))

# The second descendant tag of each <li> carries the country's relative link.
country_paths = []
for item in rows:
    descendants = item.findChildren()
    country_paths.append(descendants[1]['href'])

## Get all country links

In [28]:
# Drop the '/countries' suffix once to get the site root, then prefix every
# relative country path with it.
base2 = base_url.replace('/countries', '')
all_country_links = [base2 + path for path in country_paths]

## Helper: read a single indicator value from a page

In [30]:
def get_indicators(soup, field_name):
    '''
    Look up the value shown for one indicator label.

    Finds the text node equal to `field_name` in `soup` and returns the
    `.text` of the element that follows it. Returns None when the label is
    not on the page or nothing follows it.
    '''
    label_node = soup.find(text=field_name)
    if not label_node:
        return None

    value_node = label_node.findNext()
    return value_node.text if value_node else None

## Collect all indicators for one country

In [31]:
# Indicator labels to scrape from a country page, in output-column order.
# They are matched verbatim against the page text and used as dictionary keys,
# so the spelling (including 'terrerstrial') must stay exactly as written.
INDICATOR_LABELS = (
    'Consumption-based CO2 emissions',
    'Annual greenhouse gas emissions',
    'Population density',
    'Population (Gapminder & UN (1800 to 2019))',
    'GDP per capita',
    'Life expectancy',
    'Female employment-to-population ratio',
    'Unemployment rate',
    'Daily supply of calories',
    'Share of the population living in urban areas',
    'Share of the population using the Internet',
    'Share of migrants among the population',
    'International tourism: Number of arrivals',
    'Beef and buffalo meat consumption per person',
    'Beef production',
    'Share of land area used for agriculture',
    'Forest area (% of land area)',
    'Share of important terrerstrial biodiversity sites that are protected',
    'Fossil-fuel subsidies per capita',
    'Death rates from air pollution',
    'Direct disaster economic loss',
    'Wine consumption per person',
)


def get_indicators_dict(link):
    '''
    Scrape one country page and return its indicators as a dictionary.

    Parameters
    ----------
    link : str
        URL of the country page. It is opened in a new Chrome session that
        is closed again before the function returns.

    Returns
    -------
    dict
        'Country' (text of the element following the flag image) first,
        then one entry per label in INDICATOR_LABELS. Each indicator value
        is the raw string returned by get_indicators, or None when the
        label is not found on the page.

    Raises
    ------
    AttributeError
        If the page has no <img class="flag"> element, or nothing follows it.
    '''
    # Request the html through the browser, then parse it.
    driver2 = webdriver.Chrome(chromedriver)
    try:
        driver2.get(link)
        soup = BeautifulSoup(driver2.page_source, 'html.parser')
    finally:
        # One browser is launched per country; quit it every time so a full
        # run does not leave a Chrome process open for each page.
        driver2.quit()

    # The country name is the text of the element right after the flag image.
    country_name = soup.find('img', class_="flag").findNext().text

    # Build the record from the single label list so that keys and looked-up
    # labels cannot drift out of alignment.
    world_dict = {'Country': country_name}
    for label in INDICATOR_LABELS:
        world_dict[label] = get_indicators(soup, label)
    return world_dict

## Create list of all indicators

In [32]:
# Scrape every country page in order, producing one dictionary per country.
country_info_list = []
country_info_list.extend(map(get_indicators_dict, all_country_links))

In [33]:
# Preview only the first few records rather than dumping every country.
country_info_list[:3]

[{'Country': 'Aruba',
  'Consumption-based CO2 emissions': None,
  'Annual greenhouse gas emissions': None,
  'Population density': '619.64 (2015)',
  'Population (Gapminder & UN (1800 to 2019))': '106,000 (2019)',
  'GDP per capita': None,
  'Life expectancy': '76.3 years (2019)',
  'Female employment-to-population ratio': '53.4 (2011)',
  'Unemployment rate': None,
  'Daily supply of calories': None,
  'Share of the population living in urban areas': '43.55% (2019)',
  'Share of the population using the Internet': '93.54% (2016)',
  'Share of migrants among the population': '34.76% (2015)',
  'International tourism: Number of arrivals': '1.1 million (2016)',
  'Beef and buffalo meat consumption per person': None,
  'Beef production': None,
  'Share of land area used for agriculture': '11.11% (2015)',
  'Forest area (% of land area)': '2.33% (2015)',
  'Share of important terrerstrial biodiversity sites that are protected': '31.9% (2018)',
  'Fossil-fuel subsidies per capita': None,
 

## Set up DataFrame

In [35]:
# Convert the list of per-country dicts to a DataFrame indexed by country name.
g_countries = pd.DataFrame(country_info_list).set_index('Country')

g_countries

Unnamed: 0_level_0,Consumption-based CO2 emissions,Annual greenhouse gas emissions,Population density,Population (Gapminder & UN (1800 to 2019)),GDP per capita,Life expectancy,Female employment-to-population ratio,Unemployment rate,Daily supply of calories,Share of the population living in urban areas,...,International tourism: Number of arrivals,Beef and buffalo meat consumption per person,Beef production,Share of land area used for agriculture,Forest area (% of land area),Share of important terrerstrial biodiversity sites that are protected,Fossil-fuel subsidies per capita,Death rates from air pollution,Direct disaster economic loss,Wine consumption per person
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aruba,,,619.64 (2015),"106,000 (2019)",,76.3 years (2019),53.4 (2011),,,43.55% (2019),...,1.1 million (2016),,,11.11% (2015),2.33% (2015),31.9% (2018),,,,
Afghanistan,,"18,168.86 t (2012)",53.69 (2015),38.04 million (2019),"$1,929 (2016)",64.8 years (2019),15.48 (2012),8.84% (2017),"2,000 kcal (2017)",25.75% (2019),...,,4.59 (2013),"129,691 t (2018)",58.07% (2015),2.07% (2015),6.14% (2018),$0.83 (2015),183.9 (2017),$52.18 million (2017),0 litres (2013)
Angola,,"41,657.16 t (2012)",17.33 (2015),31.82 million (2019),"$8,397 (2016)",61.1 years (2019),69.13 (2011),8.17% (2017),"2,270 kcal (2017)",66.18% (2019),...,"397,000 (2016)",8.42 (2013),"104,761 t (2018)",47.48% (2015),46.41% (2015),28.37% (2018),$38.37 (2015),95.2 (2017),$69.24 million (2018),1.32 litres (2014)
Anguilla,,,135.32 (2015),"15,000 (2019)",,81.9 years (2019),,,,100% (2019),...,,,,,,0.15% (2018),,,,
Albania,5.65 million t (2017),"8,898.64 t (2012)",114.91 (2015),2.88 million (2019),"$11,285 (2016)",78.6 years (2019),39.15 (2015),13.87% (2017),"3,400 kcal (2017)",61.23% (2019),...,4.07 million (2016),22.5 (2013),"38,544 t (2018)",42.86% (2015),28.16% (2015),76.11% (2018),$21.74 (2015),40.5 (2017),$5.33 million (2018),1.14 litres (2014)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Samoa,,356.09 t (2012),62.6 (2015),"197,000 (2019)",,73.3 years (2019),21.89 (2014),8.21% (2017),"2,974 kcal (2017)",18.06% (2019),...,"134,000 (2016)",10.38 (2013),"1,933 t (2018)",12.37% (2015),60.42% (2015),37.08% (2018),,76.8 (2017),,0.09 litres (2011)
Yemen,,"40,924.63 t (2012)",66.38 (2015),29.16 million (2019),"$2,199 (2016)",66.1 years (2019),4.46 (2014),14.04% (2017),"2,063 kcal (2017)",37.27% (2019),...,"366,700 (2015)",4.75 (2013),"117,617 t (2018)",44.6% (2015),1.04% (2015),31.08% (2018),$3.87 (2015),96.5 (2017),,0 litres (2013)
South Africa,340.87 million t (2017),,42.38 (2015),58.56 million (2019),"$11,949 (2016)",64.1 years (2019),34.42 (2017),27.33% (2017),"2,994 kcal (2017)",66.86% (2019),...,10.04 million (2016),18.03 (2013),1 million t (2018),79.83% (2015),7.62% (2015),30.7% (2018),$45.81 (2015),57.9 (2017),$33.29 million (2016),1.35 litres (2014)
Zambia,9.66 million t (2017),"320,254.22 t (2012)",20.15 (2015),17.86 million (2019),"$3,538 (2016)",63.9 years (2019),64.28% (2017),7.79% (2017),"2,013 kcal (2017)",44.07% (2019),...,"956,000 (2016)",4.76 (2013),"187,197 t (2018)",32.06% (2015),65.42% (2015),48.36% (2018),$132.74 (2015),98.9 (2017),$7.84 million (2016),0.07 litres (2013)


## Save the DataFrame to a pickle file

In [36]:
filename = 'country_pic'

# Open, write and close in a single cell: the context manager closes the file
# even if pickling fails, and re-running the cell cannot leave a stale handle.
with open(filename, 'wb') as outfile:
    pickle.dump(g_countries, outfile)