In [1]:
import os
import re
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [2]:
data1 = pd.read_csv('../data/literacy_rate_public.csv')
print(f"Shape of the data is: {data1.shape}")
data1.head()

Shape of the data is: (140, 3)


Unnamed: 0,country,literacyRate,population
0,Afghanistan,43.0,38928346
1,Albania,98.1,2877797
2,Algeria,81.4,43851044
3,Angola,71.1,32866272
4,Australia,99.0,97929


In [3]:
data1.set_index('country')
data1.head()

Unnamed: 0,country,literacyRate,population
0,Afghanistan,43.0,38928346
1,Albania,98.1,2877797
2,Algeria,81.4,43851044
3,Angola,71.1,32866272
4,Australia,99.0,97929


In [4]:
data2 = pd.read_csv('../data/skill_migration_public.csv')
print(f"Shape of the data is: {data2.shape}")
data2.head()

Shape of the data is: (88085, 9)


Unnamed: 0,country_code,country,income_group,world_region,skill_group_id,skill_group,skill,Delta,Year
0,af,Afghanistan,Low income,South Asia,2549,Tech Skills,Information Management,-792,2015
1,af,Afghanistan,Low income,South Asia,2608,Business Skills,Operational Efficiency,-1610,2015
2,af,Afghanistan,Low income,South Asia,3806,Specialized Industry Skills,National Security,-1731,2015
3,af,Afghanistan,Low income,South Asia,50321,Tech Skills,Software Testing,-958,2015
4,af,Afghanistan,Low income,South Asia,1606,Specialized Industry Skills,Navy,-1511,2015


In [5]:
data2.set_index('country')

Unnamed: 0_level_0,country_code,income_group,world_region,skill_group_id,skill_group,skill,Delta,Year
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Afghanistan,af,Low income,South Asia,2549,Tech Skills,Information Management,-792,2015
Afghanistan,af,Low income,South Asia,2608,Business Skills,Operational Efficiency,-1610,2015
Afghanistan,af,Low income,South Asia,3806,Specialized Industry Skills,National Security,-1731,2015
Afghanistan,af,Low income,South Asia,50321,Tech Skills,Software Testing,-958,2015
Afghanistan,af,Low income,South Asia,1606,Specialized Industry Skills,Navy,-1511,2015
...,...,...,...,...,...,...,...,...
Zimbabwe,zw,Low income,Sub-Saharan Africa,12666,Specialized Industry Skills,Teaching,-94,2019
Zimbabwe,zw,Low income,Sub-Saharan Africa,1235,Specialized Industry Skills,Mining,-93,2019
Zimbabwe,zw,Low income,Sub-Saharan Africa,43756,Specialized Industry Skills,Personal Coaching,-82,2019
Zimbabwe,zw,Low income,Sub-Saharan Africa,1724,Specialized Industry Skills,Public Health,-32,2019


In [6]:
data3 = pd.read_csv('../data/gdp_public.csv')
print(f"Shape of the data is: {data2.shape}")
data3.head()

Shape of the data is: (88085, 9)


Unnamed: 0,rank,country,gdpPerCapita
0,1,United States,67063.2695
1,2,China,10746.7828
2,3,Japan,43450.1405
3,4,Germany,49617.145
4,5,India,2360.6592


In [7]:
combined_data1 = pd.merge(data2, data1, on='country', how='inner')
combined_data1

Unnamed: 0,country_code,country,income_group,world_region,skill_group_id,skill_group,skill,Delta,Year,literacyRate,population
0,af,Afghanistan,Low income,South Asia,2549,Tech Skills,Information Management,-792,2015,43.0,38928346
1,af,Afghanistan,Low income,South Asia,2608,Business Skills,Operational Efficiency,-1610,2015,43.0,38928346
2,af,Afghanistan,Low income,South Asia,3806,Specialized Industry Skills,National Security,-1731,2015,43.0,38928346
3,af,Afghanistan,Low income,South Asia,50321,Tech Skills,Software Testing,-958,2015,43.0,38928346
4,af,Afghanistan,Low income,South Asia,1606,Specialized Industry Skills,Navy,-1511,2015,43.0,38928346
...,...,...,...,...,...,...,...,...,...,...,...
86560,zw,Zimbabwe,Low income,Sub-Saharan Africa,12666,Specialized Industry Skills,Teaching,-94,2019,86.5,14862924
86561,zw,Zimbabwe,Low income,Sub-Saharan Africa,1235,Specialized Industry Skills,Mining,-93,2019,86.5,14862924
86562,zw,Zimbabwe,Low income,Sub-Saharan Africa,43756,Specialized Industry Skills,Personal Coaching,-82,2019,86.5,14862924
86563,zw,Zimbabwe,Low income,Sub-Saharan Africa,1724,Specialized Industry Skills,Public Health,-32,2019,86.5,14862924


In [8]:
combined_data = pd.merge(combined_data1, data3, on='country', how='inner')
combined_data

Unnamed: 0,country_code,country,income_group,world_region,skill_group_id,skill_group,skill,Delta,Year,literacyRate,population,rank,gdpPerCapita
0,af,Afghanistan,Low income,South Asia,2549,Tech Skills,Information Management,-792,2015,43.0,38928346,116,531.2838
1,af,Afghanistan,Low income,South Asia,2608,Business Skills,Operational Efficiency,-1610,2015,43.0,38928346,116,531.2838
2,af,Afghanistan,Low income,South Asia,3806,Specialized Industry Skills,National Security,-1731,2015,43.0,38928346,116,531.2838
3,af,Afghanistan,Low income,South Asia,50321,Tech Skills,Software Testing,-958,2015,43.0,38928346,116,531.2838
4,af,Afghanistan,Low income,South Asia,1606,Specialized Industry Skills,Navy,-1511,2015,43.0,38928346,116,531.2838
...,...,...,...,...,...,...,...,...,...,...,...,...,...
84195,zw,Zimbabwe,Low income,Sub-Saharan Africa,12666,Specialized Industry Skills,Teaching,-94,2019,86.5,14862924,108,1736.6704
84196,zw,Zimbabwe,Low income,Sub-Saharan Africa,1235,Specialized Industry Skills,Mining,-93,2019,86.5,14862924,108,1736.6704
84197,zw,Zimbabwe,Low income,Sub-Saharan Africa,43756,Specialized Industry Skills,Personal Coaching,-82,2019,86.5,14862924,108,1736.6704
84198,zw,Zimbabwe,Low income,Sub-Saharan Africa,1724,Specialized Industry Skills,Public Health,-32,2019,86.5,14862924,108,1736.6704


In [9]:
combined_data.dtypes

country_code       object
country            object
income_group       object
world_region       object
skill_group_id      int64
skill_group        object
skill              object
Delta               int64
Year                int64
literacyRate      float64
population          int64
rank                int64
gdpPerCapita      float64
dtype: object

In [10]:
combined_data['drainOrGain'] = combined_data['Delta']* combined_data['population']/10000

In [11]:
combined_data['drainOrGain'] = combined_data['drainOrGain'].astype(int) 

In [12]:
combined_data.head()

Unnamed: 0,country_code,country,income_group,world_region,skill_group_id,skill_group,skill,Delta,Year,literacyRate,population,rank,gdpPerCapita,drainOrGain
0,af,Afghanistan,Low income,South Asia,2549,Tech Skills,Information Management,-792,2015,43.0,38928346,116,531.2838,-3083125
1,af,Afghanistan,Low income,South Asia,2608,Business Skills,Operational Efficiency,-1610,2015,43.0,38928346,116,531.2838,-6267463
2,af,Afghanistan,Low income,South Asia,3806,Specialized Industry Skills,National Security,-1731,2015,43.0,38928346,116,531.2838,-6738496
3,af,Afghanistan,Low income,South Asia,50321,Tech Skills,Software Testing,-958,2015,43.0,38928346,116,531.2838,-3729335
4,af,Afghanistan,Low income,South Asia,1606,Specialized Industry Skills,Navy,-1511,2015,43.0,38928346,116,531.2838,-5882073


In [13]:
combined_data.dtypes

country_code       object
country            object
income_group       object
world_region       object
skill_group_id      int64
skill_group        object
skill              object
Delta               int64
Year                int64
literacyRate      float64
population          int64
rank                int64
gdpPerCapita      float64
drainOrGain         int32
dtype: object

In [14]:
combined_data.to_csv('../output/final-data.csv')