# Scrape Wikipedia for GDP per capita PPP

In [117]:
import requests
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import pandas as pd

In [4]:
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(PPP)_per_capita'

In [15]:
tables = pd.read_html(url, header=0)
print(len(tables))

12


In [17]:
cia_gdp_ppp_capita = tables[4]
cia_gdp_ppp_capita.head()

Unnamed: 0,Rank,Country/Territory,Int$,Year
0,1,Liechtenstein,139100.0,2009 est.
1,2,Qatar,124900.0,2017 est.
2,3,Monaco,115700.0,2015 est.
3,—,Macau,114400.0,2017 est.
4,4,Luxembourg,109100.0,2017 est.


In [18]:
url_country_codes = 'https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes'

In [19]:
tables_country_codes = pd.read_html(url_country_codes, header=0)
print(len(tables_country_codes))

6


In [26]:
tables_country_codes[0].head(10)

Unnamed: 0,country_name,official_state_name,sovereignty,alpha-2-code,alpha-3-code,numeric_code,subdivision_code_links,internet_tld
0,Country name[9],Alpha-2 code[9],Alpha-3 code[9],Numeric code[9],Subdivision code links[3],,,
1,Afghanistan,The Islamic Republic of Afghanistan,UN member state,.mw-parser-output .monospaced{font-family:mono...,AFG,4.0,ISO 3166-2:AF,.af
2,Åland Islands,Åland,Finland,AX,ALA,248.0,ISO 3166-2:AX,.ax
3,Albania,The Republic of Albania,UN member state,AL,ALB,8.0,ISO 3166-2:AL,.al
4,Algeria,The People's Democratic Republic of Algeria,UN member state,DZ,DZA,12.0,ISO 3166-2:DZ,.dz
5,American Samoa,The Territory of American Samoa,United States,AS,ASM,16.0,ISO 3166-2:AS,.as
6,Andorra,The Principality of Andorra,UN member state,AD,AND,20.0,ISO 3166-2:AD,.ad
7,Angola,The Republic of Angola,UN member state,AO,AGO,24.0,ISO 3166-2:AO,.ao
8,Anguilla,Anguilla,United Kingdom,AI,AIA,660.0,ISO 3166-2:AI,.ai
9,Antarctica [a],All land and ice shelves south of the 60th par...,Antarctic Treaty,AQ,ATA,10.0,ISO 3166-2:AQ,.aq


In [24]:
tables_country_codes[0].columns = ['country_name', 'official_state_name', 'sovereignty', 'alpha-2-code',
                                  'alpha-3-code', 'numeric_code', 'subdivision_code_links', 'internet_tld']

In [49]:
cc_table = tables_country_codes[0].dropna()

In [50]:
cc_table.shape

(248, 8)

In [51]:
tables_country_codes[0].shape

(280, 8)

In [52]:
cc_table.head()

Unnamed: 0,country_name,official_state_name,sovereignty,alpha-2-code,alpha-3-code,numeric_code,subdivision_code_links,internet_tld
1,Afghanistan,The Islamic Republic of Afghanistan,UN member state,.mw-parser-output .monospaced{font-family:mono...,AFG,4.0,ISO 3166-2:AF,.af
2,Åland Islands,Åland,Finland,AX,ALA,248.0,ISO 3166-2:AX,.ax
3,Albania,The Republic of Albania,UN member state,AL,ALB,8.0,ISO 3166-2:AL,.al
4,Algeria,The People's Democratic Republic of Algeria,UN member state,DZ,DZA,12.0,ISO 3166-2:DZ,.dz
5,American Samoa,The Territory of American Samoa,United States,AS,ASM,16.0,ISO 3166-2:AS,.as


In [57]:
cc_table.at[1, 'alpha-2-code'] = 'AG'
cc_table.loc[cc_table['country_name']=='Afghanistan']

Unnamed: 0,country_name,official_state_name,sovereignty,alpha-2-code,alpha-3-code,numeric_code,subdivision_code_links,internet_tld
1,Afghanistan,The Islamic Republic of Afghanistan,UN member state,AG,AFG,4.0,ISO 3166-2:AF,.af


In [58]:
cia_gdp_ppp_capita.shape

(229, 4)

In [None]:
cia_gdp_ppp_capita.columns = ['rank', 'country_name', 'intl_dollars', 'year']

In [65]:
name_code_gdp = cia_gdp_ppp_capita.merge(cc_table, 
                         how='left', 
                         left_on='country_name', 
                         right_on='country_name')[['country_name', 'alpha-2-code', 'intl_dollars']]

In [72]:
name_code_gdp['alpha-2-code'].isnull().values.sum()

55

Since there are 55 of these observations that don't have any country code associated with it, the computer must follow the links to each of the wikipedia websites to get the link associated with the country. On each link, there is the country code associated with it. 

In [116]:
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, "lxml")

In [101]:
cia_table = soup.find(string="Liechtenstein").find_parent('table')

In [158]:
cia_table_list = []
for row in cia_table.find_all('tr'):
    row_list = []
    for item in row.find_all('td'):
        row_list.append(item.text.strip().replace(u'\xa0', u' '))
    cia_table_list.append(row_list)
#remove the first row because it was 'th' instead of 'tr'
cia_table_list = cia_table_list[1:]

In [124]:
cia_table_links = [each['href'] for each in cia_table.find_all('a')]
cia_table_links

['/wiki/Liechtenstein',
 '/wiki/Qatar',
 '/wiki/Monaco',
 '/wiki/Macau',
 '/wiki/Luxembourg',
 '/wiki/Falkland_Islands',
 '/wiki/Singapore',
 '/wiki/Bermuda',
 '/wiki/Isle_of_Man',
 '/wiki/Brunei',
 '/wiki/Republic_of_Ireland',
 '/wiki/Norway',
 '/wiki/Kuwait',
 '/wiki/United_Arab_Emirates',
 '/wiki/Sint_Maarten',
 '/wiki/Gibraltar',
 '/wiki/Switzerland',
 '/wiki/Hong_Kong',
 '/wiki/San_Marino',
 '/wiki/United_States',
 '/wiki/Saudi_Arabia',
 '/wiki/Netherlands',
 '/wiki/Guernsey',
 '/wiki/Iceland',
 '/wiki/Bahrain',
 '/wiki/Sweden',
 '/wiki/Germany',
 '/wiki/Australia',
 '/wiki/Andorra',
 '/wiki/Taiwan',
 '/wiki/Denmark',
 '/wiki/Jersey',
 '/wiki/Austria',
 '/wiki/Canada',
 '/wiki/Belgium',
 '/wiki/Oman',
 '/wiki/Finland',
 '/wiki/Cayman_Islands',
 '/wiki/France',
 '/wiki/United_Kingdom',
 '/wiki/Japan',
 '/wiki/Malta',
 '/wiki/British_Virgin_Islands',
 '/wiki/Faroe_Islands',
 '/wiki/South_Korea',
 '/wiki/New_Zealand',
 '/wiki/Spain',
 '/wiki/Italy',
 '/wiki/Puerto_Rico',
 '/wiki/Gree

In [159]:
for index, row in enumerate(cia_table_list):
    row.append(cia_table_links[index])
print(cia_table_list)

[['1', 'Liechtenstein', '139,100', '2009 est.', '/wiki/Liechtenstein'], ['2', 'Qatar', '124,900', '2017 est.', '/wiki/Qatar'], ['3', 'Monaco', '115,700', '2015 est.', '/wiki/Monaco'], ['—', 'Macau', '114,400', '2017 est.', '/wiki/Macau'], ['4', 'Luxembourg', '109,100', '2017 est.', '/wiki/Luxembourg'], ['—', 'Falkland Islands', '96,200', '2012 est.', '/wiki/Falkland_Islands'], ['5', 'Singapore', '90,500', '2017 est.', '/wiki/Singapore'], ['—', 'Bermuda', '85,700', '2013 est.', '/wiki/Bermuda'], ['—', 'Isle of Man', '84,600', '2014 est.', '/wiki/Isle_of_Man'], ['6', 'Brunei', '76,700', '2017 est.', '/wiki/Brunei'], ['7', 'Ireland', '72,600', '2017 est.', '/wiki/Republic_of_Ireland'], ['8', 'Norway', '70,600', '2017 est.', '/wiki/Norway'], ['9', 'Kuwait', '69,700', '2017 est.', '/wiki/Kuwait'], ['10', 'United Arab Emirates', '68,200', '2017 est.', '/wiki/United_Arab_Emirates'], ['—', 'Sint Maarten', '66,800', '2014 est.', '/wiki/Sint_Maarten'], ['—', 'Gibraltar', '61,700', '2014 est.', '

The next two cells test out the scraping of each country web page:

In [127]:
endings = '/wiki/British_Virgin_Islands'
beginning = 'https://en.wikipedia.org'
response_country = requests.get(beginning + endings)
page_country = response_country.text
soup_country = BeautifulSoup(page_country, "lxml")

In [142]:
print(soup_country.find(string='ISO 3166 code').next_element.text.strip())

VG


In [148]:
import time
def get_country_code(link):
    time.sleep(2)
    response = requests.get(beginning + link)
    page = response.text
    soup = BeautifulSoup(page, 'lxml')
    find = soup.find(string='ISO 3166 code')
    if find:
        return find.next_element.text.strip()
    else:
        return None
        print(link)

In [149]:
two_letter_country_codes = [get_country_code(link) for link in cia_table_links]
two_letter_country_codes

['LI',
 'QA',
 'MC',
 'MO',
 'LU',
 'FK',
 'SG',
 'BM',
 'IM',
 'BN',
 'IE',
 'NO',
 'KW',
 'AE',
 'SX',
 'GI',
 'CH',
 'HK',
 'SM',
 'US',
 'SA',
 'NL',
 'GG',
 'IS',
 'BH',
 'SE',
 'DE',
 'AU',
 'AD',
 'TW',
 'DK',
 'JE',
 'AT',
 None,
 'BE',
 'OM',
 'FI',
 'KY',
 'FR',
 'GB',
 'JP',
 'MT',
 'VG',
 'FO',
 'KR',
 'NZ',
 'ES',
 'IT',
 'PR',
 'GL',
 'CY',
 'IL',
 'VI',
 'CZ',
 'GQ',
 'PM',
 'SI',
 'SK',
 'LT',
 'EE',
 'TT',
 'NC',
 'GU',
 'PT',
 'PL',
 'TC',
 'SC',
 'HU',
 'MY',
 'RU',
 'GR',
 'LV',
 'KN',
 'TR',
 'AG',
 'KZ',
 'AW',
 'BS',
 'CL',
 'PA',
 'HR',
 'RO',
 'UY',
 'BG',
 'MU',
 'AR',
 'IR',
 'MX',
 'LB',
 'GA',
 'MF',
 'MV',
 'TM',
 'BY',
 'BW',
 'TH',
 'BB',
 'AZ',
 'ME',
 'CR',
 'IQ',
 'PF',
 'DO',
 'PW',
 'CN',
 'BR',
 'RS',
 'MK',
 'DZ',
 'CW',
 'GD',
 'CO',
 'SR',
 'LC',
 'ZA',
 'MP',
 'PE',
 'EG',
 'AS',
 'LK',
 'MN',
 'JO',
 'AL',
 'ID',
 'VE',
 'CK',
 'NR',
 'AI',
 'DM',
 'TN',
 'CU',
 'VC',
 'NA',
 'BA',
 'EC',
 'GE',
 'XK',
 'FJ',
 'SZ',
 'PY',
 'LY',
 'JM',
 'AM',

In [160]:
for i, code in enumerate(two_letter_country_codes):
    if not code:
        print(cia_table_links[i])
        if "Canada" in cia_table_links[i]:
            two_letter_country_codes[i] = 'CA'
        elif 'Guinea-Bissau' in cia_table_links[i]:
            two_letter_country_codes[i] = 'GW'

In [161]:
None in two_letter_country_codes

False

In [162]:
for index, row in enumerate(cia_table_list):
    row.append(two_letter_country_codes[index])
print(cia_table_list)

[['1', 'Liechtenstein', '139,100', '2009 est.', '/wiki/Liechtenstein', 'LI'], ['2', 'Qatar', '124,900', '2017 est.', '/wiki/Qatar', 'QA'], ['3', 'Monaco', '115,700', '2015 est.', '/wiki/Monaco', 'MC'], ['—', 'Macau', '114,400', '2017 est.', '/wiki/Macau', 'MO'], ['4', 'Luxembourg', '109,100', '2017 est.', '/wiki/Luxembourg', 'LU'], ['—', 'Falkland Islands', '96,200', '2012 est.', '/wiki/Falkland_Islands', 'FK'], ['5', 'Singapore', '90,500', '2017 est.', '/wiki/Singapore', 'SG'], ['—', 'Bermuda', '85,700', '2013 est.', '/wiki/Bermuda', 'BM'], ['—', 'Isle of Man', '84,600', '2014 est.', '/wiki/Isle_of_Man', 'IM'], ['6', 'Brunei', '76,700', '2017 est.', '/wiki/Brunei', 'BN'], ['7', 'Ireland', '72,600', '2017 est.', '/wiki/Republic_of_Ireland', 'IE'], ['8', 'Norway', '70,600', '2017 est.', '/wiki/Norway', 'NO'], ['9', 'Kuwait', '69,700', '2017 est.', '/wiki/Kuwait', 'KW'], ['10', 'United Arab Emirates', '68,200', '2017 est.', '/wiki/United_Arab_Emirates', 'AE'], ['—', 'Sint Maarten', '66,8

In [163]:
for row in cia_table_list:
    if row[2] == 'N/A':
        row[2] = 0
    else:
        row[2] = int(row[2].replace(',', ''))

In [164]:
cia_table_list[-1]

['198', 'Somalia', 0, '2017 est.', '/wiki/Somalia', 'SO']

In [165]:
cia_table_df = pd.DataFrame(cia_table_list, columns=['rank', 'country_name', 'gdp_ppp_pc', 
                                                     'est_date', 'url_link_wiki', 'country_code'])

In [171]:
cia_table_df.head()

Unnamed: 0,rank,country_name,gdp_ppp_pc,est_date,url_link_wiki,country_code
0,1,Liechtenstein,139100,2009 est.,/wiki/Liechtenstein,LI
1,2,Qatar,124900,2017 est.,/wiki/Qatar,QA
2,3,Monaco,115700,2015 est.,/wiki/Monaco,MC
3,—,Macau,114400,2017 est.,/wiki/Macau,MO
4,4,Luxembourg,109100,2017 est.,/wiki/Luxembourg,LU


In [174]:
cia_table_df.to_pickle('cia_table_df.pkl')