In [66]:
#load libraries
import pandas as pd
import os as os
import numpy as np
import re
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

## Crawled database from https://www.henleypassportindex.com

In [2]:
df_visa_free = pd.read_csv('henley_visa_free_11_01_2019.csv', encoding = "ISO-8859-1")

In [3]:
df_visa_free.rename(columns={'passport': 'passportfrom_alpha_2code'}, inplace=True)
df_visa_free.rename(columns={'to': 'countryto_alpha_2code'}, inplace=True)

In [4]:
df_visa_free.shape

(45173, 3)

In [5]:
df_visa_free['passportfrom_alpha_2code'].nunique()

198

In [6]:
df_visa_free['countryto_alpha_2code'].nunique()

226

In [7]:
df_visa_free.head(5)

Unnamed: 0,passportfrom_alpha_2code,countryto_alpha_2code,visafree
0,AM,AF,0
1,AM,AL,1
2,AM,DZ,0
3,AM,AS,0
4,AM,AD,0


## List of world countries with codes
### Source Wikipedia https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes
### Scrapped by Juanu https://www.kaggle.com/juanumusic/countries-iso-codes

In [8]:
df_iso = pd.read_csv('wikipedia-iso-country-codes.csv', encoding = "ISO-8859-1")

In [9]:
df_iso.rename(columns={'Alpha-2 code': 'country_alpha_2code'}, inplace=True)
df_iso.rename(columns={'Alpha-3 code': 'country_alpha_3code'}, inplace=True)
df_iso.rename(columns={'Numeric code': 'country_numeric_code'}, inplace=True)

In [10]:
df_iso.shape

(246, 4)

In [11]:
df_iso.sample(5)

Unnamed: 0,Country name,country_alpha_2code,country_alpha_3code,country_numeric_code
42,Chad,TD,TCD,148
64,El Salvador,SV,SLV,222
170,Papua New Guinea,PG,PNG,598
19,Barbados,BB,BRB,52
85,Grenada,GD,GRD,308


## GDP per countries in USD, 2016, from the United Nations
### Source: https://datacatalog.worldbank.org/dataset/gdp-ranking

In [12]:
df_GDP_UN = pd.read_csv('UN_GDP_2016_millions.csv', encoding = "ISO-8859-1")

In [13]:
df_GDP_UN.rename(columns={'countryCode': 'country_numeric_code'}, inplace=True)
df_GDP_UN.rename(columns={'countryName': 'UN_GDP_countryName'}, inplace=True)
df_GDP_UN.rename(columns={'passport_numeric_code': 'countryto_passport_numeric_code'}, inplace=True)

In [14]:
df_GDP_UN = df_GDP_UN.drop(['UN_GDP_countryName'], axis=1)

In [15]:
df_GDP_UN['UN_GDP_currentPrices2016'] = df_GDP_UN['UN_GDP_currentPrices2016'].str.replace(',', '')
df_GDP_UN['UN_GDP_currentPrices2016'] = df_GDP_UN['UN_GDP_currentPrices2016'].astype(int)

In [16]:
df_GDP_UN.shape

(212, 2)

In [17]:
df_GDP_UN.sample(5)

Unnamed: 0,country_numeric_code,UN_GDP_currentPrices2016
182,752,514476
95,388,14057
13,44,11262
186,764,407026
59,214,71584


## Population per countries in millions, 2015, from the United Nations
### Source: http://data.un.org/_Docs/SYB/CSV/SYB61_T02_Population,%20Surface%20Area%20and%20Density.csv

In [18]:
df_population_UN = pd.read_csv('UN_population_2015_millions.csv', encoding = "ISO-8859-1")

In [19]:
df_population_UN.rename(columns={'countryCode': 'country_numeric_code'}, inplace=True)
df_population_UN.rename(columns={'countryName': 'UN_population_countryName'}, inplace=True)

In [20]:
df_population_UN['UN_population_Value_2015_millions'] = df_population_UN['UN_population_Value_2015_millions'].str.replace(',', '')
df_population_UN['UN_population_Value_2015_millions'] = df_population_UN['UN_population_Value_2015_millions'].astype(float)

In [21]:
df_population_UN.shape

(228, 3)

In [22]:
df_population_UN.sample(5)

Unnamed: 0,country_numeric_code,UN_population_countryName,UN_population_Value_2015_millions
141,524,Nepal,28.66
74,254,French Guiana,0.27
49,188,Costa Rica,4.81
112,428,Latvia,1.99
193,740,Suriname,0.55


## Surface per countries in thousand of km2, from the United Nations
### http://data.un.org/_Docs/SYB/CSV/SYB61_T02_Population,%20Surface%20Area%20and%20Density.csv

In [23]:
df_surface_UN = pd.read_csv('UN_surface_areas.csv', encoding = "ISO-8859-1")

In [24]:
df_surface_UN.rename(columns={'countryCode': 'country_numeric_code'}, inplace=True)
df_surface_UN.rename(columns={'countryName': 'UN_surface_countryName'}, inplace=True)

In [25]:
df_surface_UN['Surface area (thousand km2)'] = df_surface_UN['Surface area (thousand km2)'].str.replace(',', '')
df_surface_UN['Surface area (thousand km2)'] = df_surface_UN['Surface area (thousand km2)'].astype(float)

In [26]:
df_surface_UN.shape

(229, 4)

In [27]:
df_surface_UN.sample(5)

Unnamed: 0,country_numeric_code,UN_surface_countryName,Surface area (thousand km2),Year_Surface area
141,516,Namibia,824.0,2015
84,304,Greenland,2166.0,2015
90,624,Guinea-Bissau,36.0,2015
151,580,Northern Mariana Islands,0.0,2015
107,400,Jordan,89.0,2015


## Intersect df_visa_free and df_iso on 'country to'

In [28]:
# copy df_visa_free, rename the columns from centadata and hauntedhouse
df_visa_free2 = df_visa_free.copy()

In [29]:
df_visa_free2.rename(columns={'countryto_alpha_2code': 'country_alpha_2code'}, inplace=True)

In [30]:
# Intersect on 'countryto_alpha_2code'
df = pd.merge(df_visa_free2, df_iso, on=['country_alpha_2code'], how='left')

In [31]:
df.rename(columns={'country_alpha_2code': 'countryto_alpha_2code'}, inplace=True)
df.rename(columns={'Country name': 'countryto_Country name'}, inplace=True)
df.rename(columns={'country_alpha_3code': 'countryto_country_alpha_3code'}, inplace=True)
df.rename(columns={'country_numeric_code': 'countryto_country_numeric_code'}, inplace=True)

In [32]:
df.sample(10)

Unnamed: 0,passportfrom_alpha_2code,countryto_alpha_2code,visafree,countryto_Country name,countryto_country_alpha_3code,countryto_country_numeric_code
35573,CO,BW,0,Botswana,BWA,72.0
5383,VU,FJ,1,Fiji,FJI,242.0
8040,ZA,AS,0,American Samoa,ASM,16.0
43584,AT,NE,0,Niger,NER,562.0
19472,NZ,GE,1,Georgia,GEO,268.0
25774,MW,IE,0,Ireland,IRL,372.0
38902,KH,GT,0,Guatemala,GTM,320.0
35619,CL,CG,0,Congo,COG,178.0
37777,EE,JO,1,Jordan,JOR,400.0
2001,MC,BF,0,Burkina Faso,BFA,854.0


## Intersect df_visa_free and df_iso on 'passport'

In [33]:
df_iso.rename(columns={'country_alpha_2code': 'passportfrom_alpha_2code'}, inplace=True)

In [34]:
# Intersect on 'passportfrom_alpha_2code'
df = pd.merge(df, df_iso, on=['passportfrom_alpha_2code'], how='right')

In [35]:
df_iso.rename(columns={'passportfrom_alpha_2code': 'country_alpha_2code'}, inplace=True)

In [36]:
df.rename(columns={'Country name': 'passportfrom_Country name'}, inplace=True)

In [37]:
df = df.drop(['country_alpha_3code'], axis=1)
df = df.drop(['country_numeric_code'], axis=1)

## Intersect df and df_GDP_UN

In [38]:
df_GDP_UN2 = df_GDP_UN.copy()
df_GDP_UN2.rename(columns={'country_numeric_code': 'countryto_country_numeric_code'}, inplace=True)

In [39]:
df2 = pd.merge(df, df_GDP_UN2, on=['countryto_country_numeric_code'], how='right')

In [40]:
df2.rename(columns={'UN_GDP_currentPrices2016': 'countryto_UN_GDP_currentPrices2016'}, inplace=True)

In [41]:
df2.sample(10)

Unnamed: 0,passportfrom_alpha_2code,countryto_alpha_2code,visafree,countryto_Country name,countryto_country_alpha_3code,countryto_country_numeric_code,passportfrom_Country name,countryto_UN_GDP_currentPrices2016
17128,BJ,ID,1.0,Indonesia,IDN,360.0,Benin,932259
17759,GB,IL,1.0,Israel,ISR,376.0,United Kingdom,317748
38574,FJ,AE,0.0,United Arab Emirates,ARE,784.0,Fiji,348744
18010,OM,IT,0.0,Italy,ITA,380.0,Oman,1858913
30968,TO,WS,1.0,Samoa,WSM,882.0,Tonga,822
32547,TZ,SG,1.0,Singapore,SGP,702.0,"Tanzania, United Republic of",296946
20808,KI,LY,0.0,Libyan Arab Jamahiriya,LBY,434.0,Kiribati,42960
36877,TT,TT,0.0,Trinidad and Tobago,TTO,780.0,Trinidad and Tobago,24086
27615,TG,NO,0.0,Norway,NOR,578.0,Togo,371069
20767,PK,LY,0.0,Libyan Arab Jamahiriya,LBY,434.0,Pakistan,42960


## Intersect df2 and df_population_UN

In [42]:
df_population_UN2 = df_population_UN.copy()
df_population_UN2.rename(columns={'country_numeric_code': 'countryto_country_numeric_code'}, inplace=True)

In [43]:
df3 = pd.merge(df2, df_population_UN2, on=['countryto_country_numeric_code'], how='right')

In [44]:
df3 = df3.drop(['UN_population_countryName'], axis=1)

In [45]:
df3.rename(columns={'UN_population_Value_2015_millions': 'countryto_UN_population_Value_2015_millions'}, inplace=True)

In [46]:
df3.sample(10)

Unnamed: 0,passportfrom_alpha_2code,countryto_alpha_2code,visafree,countryto_Country name,countryto_country_alpha_3code,countryto_country_numeric_code,passportfrom_Country name,countryto_UN_GDP_currentPrices2016,countryto_UN_population_Value_2015_millions
31956,TZ,SC,1.0,Seychelles,SYC,690.0,"Tanzania, United Republic of",1434.0,0.09
5066,HT,BW,0.0,Botswana,BWA,72.0,Haiti,15566.0,2.21
14241,ZA,GH,1.0,Ghana,GHA,288.0,South Africa,42794.0,27.58
25307,NP,MM,0.0,Myanmar,MMR,104.0,Nepal,65698.0,52.4
29909,DO,PR,0.0,Puerto Rico,PRI,630.0,Dominican Republic,105035.0,3.67
28005,UY,PW,1.0,Palau,PLW,585.0,Uruguay,310.0,0.02
28132,FI,PW,1.0,Palau,PLW,585.0,Finland,310.0,0.02
23237,BE,MR,1.0,Mauritania,MRT,478.0,Belgium,4667.0,4.18
28781,ZM,PY,0.0,Paraguay,PRY,600.0,Zambia,27165.0,6.64
24024,BZ,MD,0.0,"Moldova, Republic of",MDA,498.0,Belize,6773.0,4.07


## Intersect df3 and df_surface_UN

In [47]:
df_surface_UN2 = df_surface_UN.copy()
df_surface_UN2.rename(columns={'country_numeric_code': 'countryto_country_numeric_code'}, inplace=True)

In [48]:
df4 = pd.merge(df3, df_surface_UN2, on=['countryto_country_numeric_code'], how='right')

In [49]:
df4 = df4.drop(['UN_surface_countryName'], axis=1)

In [50]:
df4.rename(columns={'Surface area (thousand km2)': 'countryto_Surface area (thousand km2)'}, inplace=True)
df4.rename(columns={'Year_Surface area': 'countryto_Year_Surface area'}, inplace=True)

In [51]:
df4.sample(5)

Unnamed: 0,passportfrom_alpha_2code,countryto_alpha_2code,visafree,countryto_Country name,countryto_country_alpha_3code,countryto_country_numeric_code,passportfrom_Country name,countryto_UN_GDP_currentPrices2016,countryto_UN_population_Value_2015_millions,countryto_Surface area (thousand km2),countryto_Year_Surface area
17129,BZ,ID,1.0,Indonesia,IDN,360.0,Belize,932259.0,258.16,1911.0,2015
38267,SR,AE,0.0,United Arab Emirates,ARE,784.0,Suriname,348744.0,9.15,84.0,2015
5632,LU,BN,1.0,Brunei Darussalam,BRN,96.0,Luxembourg,11400.0,0.42,6.0,2015
1794,YE,AW,0.0,Aruba,ABW,533.0,Yemen,2667.0,0.1,0.0,2015
17415,PG,IQ,0.0,Iraq,IRQ,368.0,Papua New Guinea,160021.0,36.12,435.0,2015


## Reorder the columns

In [52]:
cols = df4.columns.tolist()
cols

['passportfrom_alpha_2code',
 'countryto_alpha_2code',
 'visafree',
 'countryto_Country name',
 'countryto_country_alpha_3code',
 'countryto_country_numeric_code',
 'passportfrom_Country name',
 'countryto_UN_GDP_currentPrices2016',
 'countryto_UN_population_Value_2015_millions',
 'countryto_Surface area (thousand km2)',
 'countryto_Year_Surface area']

In [53]:
df4 = df4[['passportfrom_alpha_2code','passportfrom_Country name','countryto_alpha_2code','countryto_Country name',
  'countryto_country_alpha_3code',
 'countryto_country_numeric_code','countryto_UN_GDP_currentPrices2016',
 'countryto_UN_population_Value_2015_millions',
 'countryto_Surface area (thousand km2)',
 'countryto_Year_Surface area','visafree']]

In [54]:
df4.head()

Unnamed: 0,passportfrom_alpha_2code,passportfrom_Country name,countryto_alpha_2code,countryto_Country name,countryto_country_alpha_3code,countryto_country_numeric_code,countryto_UN_GDP_currentPrices2016,countryto_UN_population_Value_2015_millions,countryto_Surface area (thousand km2),countryto_Year_Surface area,visafree
0,AM,Armenia,AF,Afghanistan,AFG,4.0,20235.0,33.74,653.0,2015,0.0
1,AG,Antigua and Barbuda,AF,Afghanistan,AFG,4.0,20235.0,33.74,653.0,2015,0.0
2,AO,Angola,AF,Afghanistan,AFG,4.0,20235.0,33.74,653.0,2015,0.0
3,AR,Argentina,AF,Afghanistan,AFG,4.0,20235.0,33.74,653.0,2015,0.0
4,DZ,Algeria,AF,Afghanistan,AFG,4.0,20235.0,33.74,653.0,2015,0.0


## Check data types

In [55]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40409 entries, 0 to 40408
Data columns (total 11 columns):
passportfrom_alpha_2code                       40180 non-null object
passportfrom_Country name                      40385 non-null object
countryto_alpha_2code                          40188 non-null object
countryto_Country name                         40385 non-null object
countryto_country_alpha_3code                  40385 non-null object
countryto_country_numeric_code                 40409 non-null float64
countryto_UN_GDP_currentPrices2016             40388 non-null float64
countryto_UN_population_Value_2015_millions    40408 non-null float64
countryto_Surface area (thousand km2)          40409 non-null float64
countryto_Year_Surface area                    40409 non-null int64
visafree                                       40385 non-null float64
dtypes: float64(5), int64(1), object(5)
memory usage: 3.7+ MB


## Pandas pivot_table

In [56]:
df_pivot = pd.pivot_table(df4[df4.visafree == 1],index='passportfrom_Country name',values=['countryto_UN_GDP_currentPrices2016','countryto_UN_population_Value_2015_millions','countryto_Surface area (thousand km2)','visafree'], aggfunc=np.sum, margins=False,dropna=True)
df_pivot.sort_values('visafree', ascending=False)

Unnamed: 0_level_0,countryto_Surface area (thousand km2),countryto_UN_GDP_currentPrices2016,countryto_UN_population_Value_2015_millions,visafree
passportfrom_Country name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Japan,86556.0,64790088.0,5922.04,171.0
"Korea, Republic of (South Korea)",100204.0,55180294.0,4825.28,170.0
Singapore,90882.0,63845672.0,4744.92,170.0
France,79589.0,50540613.0,3312.55,168.0
Germany,80899.0,49471652.0,3267.30,168.0
Sweden,79253.0,52423812.0,3336.27,167.0
Finland,79354.0,52699785.0,3340.55,167.0
Denmark,79649.0,52631388.0,3340.34,167.0
Italy,79390.0,51079375.0,3286.53,167.0
Luxembourg,79358.0,52674381.0,3251.89,166.0


In [58]:
df_pivot

Unnamed: 0_level_0,countryto_Surface area (thousand km2),countryto_UN_GDP_currentPrices2016,countryto_UN_population_Value_2015_millions,visafree
passportfrom_Country name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,5233.0,440912.0,360.17,29.0
Albania,32007.0,23039214.0,1985.87,104.0
Algeria,14779.0,2903233.0,936.58,49.0
Andorra,73672.0,52441052.0,3130.15,149.0
Angola,15505.0,3238540.0,1105.37,48.0
Antigua and Barbuda,38165.0,26820347.0,2320.16,137.0
Argentina,66797.0,31736977.0,2957.13,155.0
Armenia,45208.0,6979278.0,1432.70,60.0
Australia,61600.0,48672020.0,2899.02,161.0
Austria,78493.0,51478500.0,3165.51,165.0


In [60]:
df_pivot_flattened = pd.DataFrame(df_pivot.to_records())
df_pivot_flattened

Unnamed: 0,passportfrom_Country name,countryto_Surface area (thousand km2),countryto_UN_GDP_currentPrices2016,countryto_UN_population_Value_2015_millions,visafree
0,Afghanistan,5233.0,440912.0,360.17,29.0
1,Albania,32007.0,23039214.0,1985.87,104.0
2,Algeria,14779.0,2903233.0,936.58,49.0
3,Andorra,73672.0,52441052.0,3130.15,149.0
4,Angola,15505.0,3238540.0,1105.37,48.0
5,Antigua and Barbuda,38165.0,26820347.0,2320.16,137.0
6,Argentina,66797.0,31736977.0,2957.13,155.0
7,Armenia,45208.0,6979278.0,1432.70,60.0
8,Australia,61600.0,48672020.0,2899.02,161.0
9,Austria,78493.0,51478500.0,3165.51,165.0


In [61]:
df_pivot_flattened['Surface_Rank'] = df_pivot_flattened['countryto_Surface area (thousand km2)'].rank(method='dense', ascending=False).astype(int)
df_pivot_flattened['GDP_Rank'] = df_pivot_flattened['countryto_UN_GDP_currentPrices2016'].rank(method='dense', ascending=False).astype(int)
df_pivot_flattened['Population_Rank'] = df_pivot_flattened['countryto_UN_population_Value_2015_millions'].rank(method='dense', ascending=False).astype(int)
df_pivot_flattened['visafree_countries_Rank'] = df_pivot_flattened['visafree'].rank(method='dense', ascending=False).astype(int)

In [62]:
df_pivot_flattened = df_pivot_flattened[['passportfrom_Country name',
                                         'visafree',
                                         'visafree_countries_Rank',
                                         'countryto_UN_GDP_currentPrices2016',
                                         'GDP_Rank',
                                         'countryto_Surface area (thousand km2)',
                                         'Surface_Rank',
                                         'countryto_UN_population_Value_2015_millions',
                                         'Population_Rank']]

In [67]:
df_pivot_flattened

Unnamed: 0,passportfrom_Country name,visafree,visafree_countries_Rank,countryto_UN_GDP_currentPrices2016,GDP_Rank,countryto_Surface area (thousand km2),Surface_Rank,countryto_UN_population_Value_2015_millions,Population_Rank
0,Afghanistan,29.0,104,440912.0,197,5233.0,196,360.17,195
1,Albania,104.0,52,23039214.0,90,32007.0,108,1985.87,96
2,Algeria,49.0,88,2903233.0,171,14779.0,162,936.58,172
3,Andorra,149.0,19,52441052.0,14,73672.0,30,3130.15,40
4,Angola,48.0,89,3238540.0,166,15505.0,156,1105.37,162
5,Antigua and Barbuda,137.0,27,26820347.0,80,38165.0,94,2320.16,82
6,Argentina,155.0,15,31736977.0,56,66797.0,41,2957.13,51
7,Armenia,60.0,79,6979278.0,120,45208.0,87,1432.7,121
8,Australia,161.0,10,48672020.0,38,61600.0,48,2899.02,55
9,Austria,165.0,6,51478500.0,23,78493.0,18,3165.51,33
