In [1]:
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one; this is why cells below show several outputs (e.g. two .head() tables).
InteractiveShell.ast_node_interactivity = "all"

## Match country names from DOL data (Sweat & Toil) to World Bank data (indicators)
The DOL countries table is used as the master list.

In [2]:
# World Bank indicator export: the column labels are not on the first row of
# the file, so tell pandas to take them from row index 2.
net_migration = pd.read_csv(
    'data/API_SM.POP.NETM_DS2_en_csv_v2.csv',
    header=2,
)

# DOL countries table; its first column is used as the row index.
countries = pd.read_csv(
    'data/Countries.csv',
    index_col=0,
)

The rows named `C` and `S` appear to be typos in the DOL data and are removed.

In [3]:
# Drop the stray rows whose name is just 'C' or 'S'.
# .copy() makes the filtered result an independent DataFrame, so adding a
# column to it later does not raise SettingWithCopyWarning.
countries = countries[(countries.name != 'C') & (countries.name != 'S')].copy()

In [4]:
# Preview the first five rows of each table (both are displayed).
net_migration.head(n=5)
countries.head(n=5)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,Unnamed: 61
0,Aruba,ABW,Net migration,SM.POP.NETM,,,-4323.0,,,,...,,,,,1253.0,,,,,
1,Afghanistan,AFG,Net migration,SM.POP.NETM,,,-20000.0,,,,...,,,,,448007.0,,,,,
2,Angola,AGO,Net migration,SM.POP.NETM,,,-135000.0,,,,...,,,,,87322.0,,,,,
3,Albania,ALB,Net migration,SM.POP.NETM,,,-99.0,,,,...,,,,,-93425.0,,,,,
4,Andorra,AND,Net migration,SM.POP.NETM,,,,,,,...,,,,,,,,,,


Unnamed: 0,id,name,num_territories,region,region_id
0,1,Afghanistan,0,Asia & the Pacific,1
1,10,Bangladesh,0,Asia & the Pacific,1
2,13,Bhutan,0,Asia & the Pacific,1
3,21,Burma,0,Asia & the Pacific,1
4,24,Cambodia,0,Asia & the Pacific,1


Join the datasets to see which countries in the DOL data do not match the World Bank data.

In [5]:
# Keep only the World Bank columns needed to check for a name match.
wb_2012 = net_migration[['Country Name', 'Indicator Name', '2012']]

# Left join keeps every DOL country; rows without a World Bank match get NaN
# in the World Bank columns.
joined = pd.merge(
    countries,
    wb_2012,
    how='left',
    left_on='name',
    right_on='Country Name',
)
joined.shape

(146, 8)

In [6]:
# Rows with no 2012 value: either the name did not match any World Bank
# country, or it matched but the indicator is missing for that year.
no_2012_value = joined['2012'].isnull()
mismatch = joined[no_2012_value]
mismatch.shape
mismatch

(34, 8)

Unnamed: 0,id,name,num_territories,region,region_id,Country Name,Indicator Name,2012
3,21,Burma,0,Asia & the Pacific,1,,,
6,30,Christmas Island,0,Asia & the Pacific,1,,,
7,31,Cocos (Keeling) Islands,0,Asia & the Pacific,1,,,
8,36,Cook Islands,0,Asia & the Pacific,1,,,
10,59,Heard and McDonald Islands,0,Asia & the Pacific,1,,,
20,95,Niue,0,Asia & the Pacific,1,,,
21,96,Norfolk Island,0,Asia & the Pacific,1,,,
22,97,North Korea,0,Asia & the Pacific,1,,,
32,130,Tokelau,0,Asia & the Pacific,1,,,
35,135,Tuvalu,0,Asia & the Pacific,1,Tuvalu,Net migration,


Create a `name_normalized` column in the countries table, containing names that match the World Bank data.

Names are matched manually where a match exists. Some countries are not represented in the World Bank data or have missing values in this dataset.

In [7]:
countries.shape

# Start from the DOL name; only the names listed below need a different
# spelling to match the World Bank 'Country Name' column.
countries['name_normalized'] = countries['name']

# DOL name -> World Bank name
wb_name_overrides = {
    'Burma': 'Myanmar',
    'North Korea': "Korea, Dem. People’s Rep.",
    'Macedonia': 'Macedonia, FYR',
    'Moldova, Republic of': 'Moldova',
    'Russia': 'Russian Federation',
    'Egypt': 'Egypt, Arab Rep.',
    'Iran': 'Iran, Islamic Rep.',
    'West Bank and the Gaza Strip': 'West Bank and Gaza',
    'Yemen': 'Yemen, Rep.',
    'Congo, Democratic Republic of the': 'Congo, Dem. Rep.',
    'Congo, Republic of the': 'Congo, Rep.',
    'Gambia': 'Gambia, The',
    'Saint Lucia': 'St. Lucia',
    'Saint Vincent and the Grenadines': 'St. Vincent and the Grenadines',
    'Venezuela': 'Venezuela, RB',
}
for dol_name, wb_name in wb_name_overrides.items():
    countries.loc[countries.name == dol_name, 'name_normalized'] = wb_name

countries.shape

(146, 5)

(146, 6)

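Join again to see if we get the expected results. This is an inner join, so DOL countries that still have no World Bank match are dropped (131 of 146 rows remain); the rows listed below matched by name but have no 2012 value.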

In [8]:
# Same World Bank columns as before, now matched on the normalized name.
wb_columns = net_migration[['Country Name', 'Indicator Name', '2012']]

# Inner join: only DOL countries whose normalized name exists in the World
# Bank table are kept.
joined2 = pd.merge(
    countries,
    wb_columns,
    how='inner',
    left_on='name_normalized',
    right_on='Country Name',
)
joined2.shape

# Matched countries that still have no 2012 value.
missing_2012 = joined2['2012'].isnull()
mismatch2 = joined2[missing_2012]
mismatch2.shape
mismatch2

(131, 9)

(4, 9)

Unnamed: 0,id,name,num_territories,region,region_id,name_normalized,Country Name,Indicator Name,2012
28,135,Tuvalu,0,Asia & the Pacific,1,Tuvalu,Tuvalu,Net migration,
36,19,British Virgin Islands,0,Europe & Eurasia,2,British Virgin Islands,British Virgin Islands,Net migration,
38,70,Kosovo,0,Europe & Eurasia,2,Kosovo,Kosovo,Net migration,
111,40,Dominica,0,Latin America & the Caribbean,5,Dominica,Dominica,Net migration,


In [9]:
# Save the countries table, including the new name_normalized column; the row
# index is written as the first CSV column.
countries.to_csv(
    'data/Countries_normalized.csv',
    index=True,
)