### Libraries

In [209]:
import pandas as pd
import numpy as np 

import warnings
import os

from unidecode import unidecode

import wbgapi as wb

from sklearn.preprocessing import MinMaxScaler

In [84]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
warnings.filterwarnings("ignore")
os.chdir("C:\\Users\\joaos\\Documents\\GitHub\\pred_se_ml\data")

### Shadow economy data from Medina, L., & Schneider, F. (2017)

In [85]:
# Importing shadow economy data from Medina, L., & Schneider, F. (2017)
se_medina = pd.read_excel("shadow_economy_medina.xlsx")
se_medina.head()

Unnamed: 0,Country,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,Albania,43.18,40.18,39.45,40.07,39.18,37.07,37.59,38.16,36.04,35.3,36.04,33.67,32.64,31.72,30.89,29.58,28.53,27.12,26.91,26.1,25.41,25.52,25.68,25.78,26.21
1,Algeria,34.96,36.14,38.16,38.88,37.64,37.68,37.18,38.28,36.11,34.2,33.58,31.9,29.62,27.76,24.93,24.44,24.21,24.07,25.9,25.89,27.37,26.94,25.98,25.74,23.98
2,Angola,50.17,47.8,55.43,50.48,52.47,46.3,50.48,49.21,48.64,48.8,46.14,48.4,48.86,46.81,43.84,41.23,37.13,35.26,36.25,36.54,36.49,36.6,35.92,34.53,35.25
3,Argentina,25.22,24.41,26.59,26.22,27.18,25.32,25.2,24.0,25.83,25.4,26.94,26.19,25.37,24.32,23.21,22.63,21.93,21.87,22.97,21.64,20.8,21.62,21.57,22.02,24.99
4,Armenia,46.65,49.5,48.63,44.66,47.14,47.48,46.41,45.81,46.85,46.6,47.61,44.11,42.08,43.57,41.03,41.38,39.47,35.39,41.04,40.14,38.46,35.52,34.56,34.78,35.96


In [86]:
# Columns' names
se_medina.columns

Index(['Country ',       1991,       1992,       1993,       1994,       1995,
             1996,       1997,       1998,       1999,       2000,       2001,
             2002,       2003,       2004,       2005,       2006,       2007,
             2008,       2009,       2010,       2011,       2012,       2013,
             2014,       2015],
      dtype='object')

In [87]:
# Removing the blank space and renaming the Country column
se_medina = se_medina.rename(columns={'Country ': 'country'})

# Reshaping from wide format to long format
se_medina = se_medina.set_index(se_medina['country']).stack().reset_index(name='se_medina').rename(columns={'level_2': 'year'})
se_medina.head()

Unnamed: 0,country,level_1,se_medina
0,Albania,country,Albania
1,Albania,1991,43.18
2,Albania,1992,40.18
3,Albania,1993,39.45
4,Albania,1994,40.07


In [88]:
# Renaming the column 'level_1' to 'year'
se_medina = se_medina.rename(columns={'level_1':'year'})

# Converting the 'se_medina' column to numeric
se_medina["se_medina"] = pd.to_numeric(se_medina.se_medina, errors = 'coerce')

# Droping 'na' observations
se_medina = se_medina.dropna()

Due to different data sources, the countries names may vary from one source to another, bellow some countries' names going to be renamed for future data joining

In [89]:
se_medina['country'] = [x.replace('CentralAfricanRepublic', 'central_african_republic') for x in se_medina['country']]
se_medina['country'] = [x.replace('Congo, Dem, Rep,', 'congo_dem_rep') for x in se_medina['country']]
se_medina['country'] = [x.replace('GuineaBissau', 'guinea_bissau') for x in se_medina['country']]
se_medina['country'] = [x.replace('Hong Kong SAR, China', 'hong_kong') for x in se_medina['country']]
se_medina['country'] = [x.replace('Syrian Arab, Rep,', 'Syria') for x in se_medina['country']]
se_medina['country'] = [x.replace('Brunei Darussalam', 'brunei') for x in se_medina['country']]
se_medina['country'] = [x.replace('swaziland', 'eswatini') for x in se_medina['country']]

In [90]:
# Selecting only the first name of each country
se_medina['country'] = se_medina['country'].str.split(',').str[0].str.strip()

# Only lowercase letters
se_medina['country'] = se_medina['country'].str.lower()

# Removing accentuation
se_medina['country'] = se_medina['country'].apply(lambda x: unidecode(x))

# replacing whitespace with _ in country
se_medina['country'] = [x.replace(' ', '_') for x in se_medina['country']] 

### Collecting the dependent variables from the World Bank API

In [91]:
# List of the countries 
countries = ['ALB', 'DZA', 'AGO', 'ARG', 'ARM', 'AUS', 'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BLR', 'BEL', 'BLZ', 'BEN', 'BTN', 'BOL', 'BIH', 'BWA', 'BRA', 'BRN',
            'BGR', 'BFA', 'BDI', 'CPV', 'KHM', 'CMR', 'CAN', 'CAF', 'TCD', 'CHL', 'CHN', 'COL', 'COM', 'COD', 'COG', 'CRI', 'CIV', 'HRV', 'CYP', 'CZE', 'DNK',
            'DOM', 'ECU', 'EGY', 'SLV', 'GNQ', 'ERI', 'EST', 'ETH', 'FJI', 'FIN', 'FRA', 'GAB', 'GMB', 'GEO', 'DEU', 'GHA', 'GRC', 'GTM', 'GIN', 'GNB', 'GUY',
            'HTI', 'HND', 'HKG', 'HUN', 'ISL', 'IND', 'IDN', 'IRN', 'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JOR', 'KAZ', 'KEN', 'KOR', 'KWT', 'KGZ', 'LAO', 'LVA',
            'LBN', 'LSO', 'LBR', 'LBY', 'LTU', 'LUX', 'MDG', 'MWI', 'MYS', 'MDV', 'MLI', 'MLT', 'MRT', 'MUS', 'MEX', 'MDA', 'MNG', 'MAR', 'MOZ', 'MMR', 'NAM',
            'NPL', 'NLD', 'NZL', 'NIC', 'NER', 'NGA', 'NOR', 'OMN', 'PAK', 'PNG', 'PRY', 'PER', 'PHL', 'POL', 'PRT', 'QAT', 'ROU', 'RUS', 'RWA', 'SAU', 'SEN',
            'SLE', 'SGP', 'SVK', 'SVN', 'SLB', 'ZAF', 'ESP', 'LKA', 'SUR', 'SWZ', 'SWE', 'CHE', 'SYR', 'TJK', 'TZA', 'THA', 'TGO', 'TTO', 'TUN', 'TUR',
            'UGA', 'UKR', 'ARE', 'GBR', 'USA', 'URY', 'VEN', 'VNM', 'YEM', 'ZMB', 'ZWE']

In [92]:
# List of the variables
series = ['NY.GDP.DEFL.KD.ZG', # Inflation, GDP deflator (annual %)
          'SL.UEM.TOTL.NE.ZS', # Unemployment, total (% of total labor force) (national estimate)
          'NE.TRD.GNFS.ZS', # Trade (% of GDP)
          'BX.KLT.DINV.WD.GD.ZS', # Foreign direct investment, net inflows (% of GDP)
          'NE.CON.GOVT.ZS', # General government final consumption expenditure (% of GDP)
          'IC.REG.PROC', # Start-up procedures to register a business (number)
          'IC.REG.COST.PC.ZS', # Cost of business start-up procedures (% of GNI per capita)
          'IC.REG.DURS', # Time required to start a business (days)
          'IC.PRP.DURS', # Time required to register property (days)
          'IC.TAX.DURS', # Time to prepare and pay taxes (hours)
          'NY.GDP.PCAP.CD' # GDP per capita (current US$)
         ]

In [93]:
# Collecting the data
wb_variables = wb.data.DataFrame(series=series, 
                       economy=countries, 
                       time=range(1991, 2016, 1), 
                       labels=False).reset_index()

In [94]:
wb_variables.head()

Unnamed: 0,economy,series,YR1991,YR1992,YR1993,YR1994,YR1995,YR1996,YR1997,YR1998,YR1999,YR2000,YR2001,YR2002,YR2003,YR2004,YR2005,YR2006,YR2007,YR2008,YR2009,YR2010,YR2011,YR2012,YR2013,YR2014,YR2015
0,AGO,BX.KLT.DINV.WD.GD.ZS,6.388693,3.466081,4.964921,3.837037,8.529489,2.76277,5.36336,17.121191,40.167251,9.623866,24.009075,11.406192,20.081014,9.329239,-3.526657,-0.072001,-1.368762,1.896314,3.136661,-3.851112,-2.704873,-1.143768,-5.380131,2.690006,11.081339
1,AGO,IC.PRP.DURS,,,,,,,,,,,,,,335.0,335.0,335.0,335.0,335.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0
2,AGO,IC.REG.COST.PC.ZS,,,,,,,,,,,,,1316.4,910.0,653.8,498.2,343.7,196.8,151.1,226.6,163.1,143.1,130.1,118.8,17.0
3,AGO,IC.REG.DURS,,,,,,,,,,,,,83.0,83.0,83.0,83.0,83.0,68.0,68.0,66.0,66.0,66.0,66.0,66.0,36.0
4,AGO,IC.REG.PROC,,,,,,,,,,,,,12.0,12.0,12.0,12.0,12.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0


In [95]:
# Selecionando o intervalo de tempo em anos
years = []

# The range goes from 1991 to 2015
for year in range(1991, 2016, 1):
    year = "YR" + str(year)
    years.append(year)
    
# Transform from wide to long format
wb_variables = pd.melt(wb_variables, id_vars=['economy', 'series'], value_vars=years, var_name='year', value_name='value')
wb_variables = wb_variables.pivot(index=['economy', 'year'], columns='series', values='value').reset_index()

In [96]:
# Removing 'YR' in each row of the year's column
wb_variables['year'] = [x.replace('YR', '') for x in wb_variables['year']]

In [97]:
# Renaming the columns
wb_variables = wb_variables.rename(columns={'NY.GDP.DEFL.KD.ZG':'inflation',
                                        'SL.UEM.TOTL.NE.ZS':'unemployment',
                                        'NE.TRD.GNFS.ZS':'exchange',
                                        'BX.KLT.DINV.WD.GD.ZS':'fdi',
                                        'NE.CON.GOVT.ZS':'governement_spending',
                                        'IC.REG.PROC':'business_procedure',
                                        'IC.REG.COST.PC.ZS':'cost_procedures',
                                        'IC.REG.DURS':'business_time',
                                        'IC.PRP.DURS':'property_time',
                                        'IC.TAX.DURS': 'tribute_time',
                                        'NY.GDP.PCAP.CD':'gdp_pc'
                                         })

In [98]:
# Missing data by column
wb_variables.isnull().mean().sort_values(ascending=False)*100

series
tribute_time            59.057325
property_time           56.127389
cost_procedures         52.789809
business_time           52.789809
business_procedure      52.789809
unemployment            38.828025
governement_spending    11.159236
exchange                 8.840764
fdi                      2.471338
inflation                2.394904
gdp_pc                   1.324841
economy                  0.000000
year                     0.000000
dtype: float64

In [99]:
# Saving countries name
countries_name = wb.economy.DataFrame(countries).reset_index()

# Only id and name columns
countries_name = countries_name[['id', 'name']]

# Renaming the id columns
countries_name = countries_name.rename(columns={'id': 'economy'})


In [100]:
# Merging countries_name and wb_variables
wb_variables = pd.merge(wb_variables, countries_name, on='economy')

In [101]:
wb_variables.head()

Unnamed: 0,economy,year,fdi,property_time,cost_procedures,business_time,business_procedure,tribute_time,governement_spending,exchange,inflation,gdp_pc,unemployment,name
0,AGO,1991,6.388693,,,,,,,,106.309982,850.55618,,Angola
1,AGO,1992,3.466081,,,,,,,,476.515751,657.65464,,Angola
2,AGO,1993,4.964921,,,,,,,,917.783468,466.679163,,Angola
3,AGO,1994,3.837037,,,,,,,,2175.978955,329.691784,,Angola
4,AGO,1995,8.529489,,,,,,,,1825.495149,398.120223,,Angola


In [102]:
# Just changing the columns' order
new_order = ['name', 'economy', 'year', 'fdi', 'property_time', 'cost_procedures', 'business_time', 'business_procedure', 
             'tribute_time', 'governement_spending', 'exchange', 'inflation', 'gdp_pc', 'unemployment']

wb_variables = wb_variables[new_order]

In [103]:
wb_variables.head()

Unnamed: 0,name,economy,year,fdi,property_time,cost_procedures,business_time,business_procedure,tribute_time,governement_spending,exchange,inflation,gdp_pc,unemployment
0,Angola,AGO,1991,6.388693,,,,,,,,106.309982,850.55618,
1,Angola,AGO,1992,3.466081,,,,,,,,476.515751,657.65464,
2,Angola,AGO,1993,4.964921,,,,,,,,917.783468,466.679163,
3,Angola,AGO,1994,3.837037,,,,,,,,2175.978955,329.691784,
4,Angola,AGO,1995,8.529489,,,,,,,,1825.495149,398.120223,


### Democracy data

In [104]:
# Importing democracy data
democracy = pd.read_excel("democracy.xlsx", usecols=['country', 'year', 'democ'])
democracy.head()

Unnamed: 0,country,year,democ
0,Afghanistan,1800,1
1,Afghanistan,1801,1
2,Afghanistan,1802,1
3,Afghanistan,1803,1
4,Afghanistan,1804,1


In [105]:
# Selecting the 1991 to 2015 year range
democracy = democracy[(democracy['year'] >= 1991) & (democracy['year'] <= 2015)]
democracy.head()

Unnamed: 0,country,year,democ
191,Afghanistan,1991,0
192,Afghanistan,1992,-77
193,Afghanistan,1993,-77
194,Afghanistan,1994,-77
195,Afghanistan,1995,-77


In [106]:
# Verifying which countries are in wb_variables and democracy 
list1 = wb_variables['name'].unique()
list2 = democracy['country'].unique()

set1 = set(list1)
set2 = set(list2)

equal_elements = set1.intersection(set2)
different_elements = set1.symmetric_difference(set2)

different_elements

{'Afghanistan',
 'Bahamas, The',
 'Belize',
 'Bosnia',
 'Bosnia and Herzegovina',
 'Brunei Darussalam',
 'Cabo Verde',
 'Cape Verde',
 'Congo Brazzaville',
 'Congo Kinshasa',
 'Congo, Dem. Rep.',
 'Congo, Rep.',
 'Congo-Brazzaville',
 "Cote D'Ivoire",
 "Cote d'Ivoire",
 'Cuba',
 'Czech Republic',
 'Czechia',
 'Czechoslovakia',
 'Djibouti',
 'Egypt',
 'Egypt, Arab Rep.',
 'Eswatini',
 'Gambia',
 'Gambia, The',
 'Hong Kong SAR, China',
 'Iceland',
 'Iran',
 'Iran, Islamic Rep.',
 'Iraq',
 'Ivory Coast',
 'Korea North',
 'Korea South',
 'Korea, Rep.',
 'Kosovo',
 'Kyrgyz Republic',
 'Kyrgyzstan',
 'Lao PDR',
 'Laos',
 'Macedonia',
 'Maldives',
 'Malta',
 'Montenegro',
 'Myanmar',
 'Myanmar (Burma)',
 'Panama',
 'Russia',
 'Russian Federation',
 'Serbia',
 'Serbia and Montenegro',
 'Somalia',
 'South Sudan',
 'Sudan',
 'Sudan-North',
 'Swaziland',
 'Syria',
 'Syrian Arab Republic',
 'Taiwan',
 'Timor Leste',
 'Turkey',
 'Turkiye',
 'Turkmenistan',
 'UAE',
 'USSR',
 'United Arab Emirates',


Some of the countries shown above has different names depending on the data source, bellow these countries will be renamed to have the same name

In [107]:
democracy['country'] = [x.replace('Ivory Coast', "Cote d'Ivoire") for x in democracy['country']]
democracy['country'] = [x.replace("Cote D'Ivoire", "Cote d'Ivoire") for x in democracy['country']]
democracy['country'] = [x.replace("Congo-Brazzaville", "Congo Brazzaville") for x in democracy['country']]
democracy['country'] = [x.replace("Congo Brazzaville", "Congo, Rep.") for x in democracy['country']]
democracy['country'] = [x.replace("Congo Kinshasa", "Congo, Dem. Rep.") for x in democracy['country']]
democracy['country'] = [x.replace("Korea South", "Korea, Rep.") for x in democracy['country']]
democracy['country'] = [x.replace("Cape Verde", "Cabo Verde") for x in democracy['country']]
democracy['country'] = [x.replace("Czech Republic", "Czechia") for x in democracy['country']]
democracy['country'] = [x.replace("Egypt", "Egypt, Arab Rep.") for x in democracy['country']]
democracy['country'] = [x.replace("Swaziland", "Eswatini") for x in democracy['country']]
democracy['country'] = [x.replace("Gambia", "Gambia, The") for x in democracy['country']]
democracy['country'] = [x.replace("Iran", "Iran, Islamic Rep.") for x in democracy['country']]
democracy['country'] = [x.replace("Kyrgyzstan", "Kyrgyz Republic") for x in democracy['country']]
democracy['country'] = [x.replace("Laos", "Lao PDR") for x in democracy['country']]
democracy['country'] = [x.replace("Myanmar (Burma)", "Myanmar") for x in democracy['country']]
democracy['country'] = [x.replace("Russia", "Russian Federation") for x in democracy['country']]
democracy['country'] = [x.replace("Syria", "Syrian Arab Republic") for x in democracy['country']]
democracy['country'] = [x.replace("Turkey", "Turkiye") for x in democracy['country']]
democracy['country'] = [x.replace("UAE", "United Arab Emirates") for x in democracy['country']]
democracy['country'] = [x.replace("Venezuela", "Venezuela, RB") for x in democracy['country']]
democracy['country'] = [x.replace("Yemen", "Yemen, Rep.") for x in democracy['country']]
democracy['country'] = [x.replace("Egypt, Arab Rep.", "Egypt, Arab Rep.") for x in democracy['country']]

In [108]:
# Removing countries that are in democracy but not in variables_wb and removing other countries in specific 
countries_to_drop = ['Korea North', 'Afghanistan', 'Bosnia', 'Cuba', 'Czechoslovakia','Djibouti', 'Iraq', 'Kosovo', 'Macedonia',
                    'Maldives', 'Montenegro', 'Panama', 'Serbia and Montenegro', 'Yugoslavia', 'Somalia',
                     'South Sudan', 'Sudan', 'Sudan-North', 'Taiwan', 'Timor Leste', 'USSR', 'Uzbekistan',
                    'Serbia', 'Turkmenistan']

democracy = democracy[~democracy['country'].isin(countries_to_drop)]

In [109]:
# removing countries that are in wb_variables but not in democracy
countries_to_drop2 = ['Bahamas, The', 'Belize', 'Bosnia and Herzegovina', 'Brunei Darussalam', 'Hong Kong SAR, China',
                      'Iceland', 'Maldives', 'Malta']

wb_variables = wb_variables[~wb_variables['name'].isin(countries_to_drop2)]

In [110]:
# Renaming the column country in democracy for a merging with the wb_variables data frame
democracy = democracy.rename(columns={'country':'name'})

# Object to int
wb_variables['year'] = wb_variables['year'].astype('int64')

# Merging wb_variables and democracy
df1 = pd.merge(wb_variables, democracy, left_on=['name', 'year'], right_on=['name', 'year'])

In [111]:
df1.head()

Unnamed: 0,name,economy,year,fdi,property_time,cost_procedures,business_time,business_procedure,tribute_time,governement_spending,exchange,inflation,gdp_pc,unemployment,democ
0,Angola,AGO,1991,6.388693,,,,,,,,106.309982,850.55618,,-88
1,Angola,AGO,1992,3.466081,,,,,,,,476.515751,657.65464,,-77
2,Angola,AGO,1993,4.964921,,,,,,,,917.783468,466.679163,,-88
3,Angola,AGO,1994,3.837037,,,,,,,,2175.978955,329.691784,,-88
4,Angola,AGO,1995,8.529489,,,,,,,,1825.495149,398.120223,,-88


### Tax burden

In [112]:
# Importing the tax burden data
tax_burden = pd.read_csv('tax_burden.csv', usecols=['Name', 'Index Year', 'Tax Burden'])

# Sorting the data
tax_burden = tax_burden.sort_values(['Name', 'Index Year'], ascending=[True, True])

# Renaming the columns
tax_burden = tax_burden.rename(columns={'Name':'name', 
                         'Index Year': 'year',
                         'Tax burden': 'carga_trib'})

# Selecting the range between 1995 and 2015
tax_burden = tax_burden[(tax_burden['year'] >= 1995) & (tax_burden['year'] <= 2015)]

In [113]:
# Checking which different elements (countries) are in the df1 and tax_burden
list1 = df1['name'].unique()
list2 = tax_burden['name'].unique()

set1 = set(list1)
set2 = set(list2)

equal_elements = set1.intersection(set2)
different_elements = set1.symmetric_difference(set2)

different_elements

{'Afghanistan',
 'Bangladesh',
 'Bangladesh ',
 'Barbados',
 'Belize',
 'Bosnia and Herzegovina',
 'Brunei Darussalam',
 'Burma',
 'Cape Verde',
 'Congo, Dem. Rep.',
 'Congo, Rep.',
 'Costa Rica',
 'Costa Rica ',
 "Cote d'Ivoire",
 'Cuba',
 'Czech Republic',
 'Czechia',
 "Côte d'Ivoire ",
 'Democratic Republic of Congo',
 'Djibouti',
 'Dominica',
 'Egypt',
 'Egypt, Arab Rep.',
 'El Salvador',
 'El Salvador ',
 'Eswatini',
 'Gambia, The',
 'Guatemala',
 'Guatemala ',
 'Honduras',
 'Honduras ',
 'Hong Kong',
 'Hungary',
 'Hungary ',
 'Iceland',
 'Iran',
 'Iran, Islamic Rep.',
 'Iraq',
 'Jamaica',
 'Jamaica ',
 'Kiribati',
 'Korea, Rep.',
 'Kosovo',
 'Kyrgyz Republic',
 'Kyrgyz Republic ',
 'Lao PDR',
 'Laos',
 'Liechtenstein',
 'Macau',
 'Macedonia',
 'Malaysia',
 'Malaysia ',
 'Maldives',
 'Malta',
 'Micronesia',
 'Montenegro',
 'Mozambique',
 'Mozambique ',
 'Myanmar',
 'Netherlands',
 'Nicaragua',
 'Nicaragua ',
 'North Korea',
 'Pakistan',
 'Pakistan ',
 'Panama ',
 'Paraguay',
 'Par

Repeting the same process made in the democracy data


In [114]:
# Removing the blank space at the end of the countries' name in each row
tax_burden['name'] = tax_burden['name'].str.rstrip()

In [115]:
tax_burden['name'] = [x.replace('Cape Verde', "Cabo Verde") for x in tax_burden['name']]
tax_burden['name'] = [x.replace('Burma', "Myanmar") for x in tax_burden['name']]
tax_burden['name'] = [x.replace('Turkey', "Turkiye") for x in tax_burden['name']]
tax_burden['name'] = [x.replace('The Netherlands', "Netherlands") for x in tax_burden['name']]
tax_burden['name'] = [x.replace('Democratic Republic of Congo', "Congo, Dem. Rep.") for x in tax_burden['name']]
tax_burden['name'] = [x.replace('Republic of Congo', "Congo, Rep.") for x in tax_burden['name']]
tax_burden['name'] = [x.replace("Côte d'Ivoire", "Cote d'Ivoire") for x in tax_burden['name']]
tax_burden['name'] = [x.replace('Czech Republic', "Czechia") for x in tax_burden['name']]
tax_burden['name'] = [x.replace('Egypt', "Egypt, Arab Rep.") for x in tax_burden['name']]
tax_burden['name'] = [x.replace('Swaziland', "Eswatini") for x in tax_burden['name']]
tax_burden['name'] = [x.replace('The Gambia', "Gambia, The") for x in tax_burden['name']]
tax_burden['name'] = [x.replace('Iran', "Iran, Islamic Rep.") for x in tax_burden['name']]
tax_burden['name'] = [x.replace('South Korea', "Korea, Rep.") for x in tax_burden['name']]
tax_burden['name'] = [x.replace('Laos', "Lao PDR") for x in tax_burden['name']]
tax_burden['name'] = [x.replace('The Philippines', "Philippines") for x in tax_burden['name']]
tax_burden['name'] = [x.replace('Russia', "Russian Federation") for x in tax_burden['name']]
tax_burden['name'] = [x.replace('Slovakia', "Slovak Republic") for x in tax_burden['name']]
tax_burden['name'] = [x.replace('Syria', "Syrian Arab Republic") for x in tax_burden['name']]
tax_burden['name'] = [x.replace('Burma', "Myanmar") for x in tax_burden['name']]
tax_burden['name'] = [x.replace("Venezuela", "Venezuela, RB") for x in tax_burden['name']] 
tax_burden['name'] = [x.replace("Yemen", "Yemen, Rep.") for x in tax_burden['name']]

In [116]:
# Countries that are in tax_burden but not in df1
countries_to_drop3 = ['Afghanistan', 'Barbados', 'Belize', 'Bosnia and Herzegovina', 'Brunei Darussalam', 'Cuba', 'Djibouti', 
                      'Dominica', 'Hong Kong', 'Iceland', 'Iraq', 'Kiribati', 'Kosovo', 'Liechtenstein', 'Macau', 'Macedonia', 
                      'Maldives', 'Malta', 'Micronesia', 'Montenegro', 'North Korea', 'Panama', 'Saint Lucia', 
                      'Saint Vincent and the Grenadines', 'Samoa', 'Serbia', 'Seychelles', 'Somalia', 'Sudan', 
                      'São Tomé and Príncipe', 'Taiwan', 'The Bahamas', 'Timor-Leste', 'Tonga', 'Turkmenistan', 
                      'Uzbekistan', 'Vanuatu']

tax_burden = tax_burden[~tax_burden['name'].isin(countries_to_drop3)]

In [117]:
# Merging df1 and tax_burden
df2 = pd.merge(df1, tax_burden, left_on=['name', 'year'], right_on=['name', 'year'])
df2 = df2.rename(columns={'Tax Burden': 'tax_burden'})

### Diversity of exports

In [118]:
# Importing the data
diversity = pd.read_csv('export_diversity.csv')

# Droping some columns
columns_to_drop = ['Country Code', 'Indicator Name', 'Indicator Code', 'Attribute', 'Unnamed: 29']
diversity = diversity.drop(columns=columns_to_drop)

# Renaming the column
diversity = diversity.rename(columns={'Country Name':'name'})

# Reshaping
diversity = pd.melt(diversity, id_vars='name', var_name='year', value_name='diversity')

# Sorting the data
diversity = diversity.sort_values(['name', 'year'], ascending=[True, True])

Repeting the process made in the previous data sets

In [119]:
list1 = diversity['name'].unique()
list2 = df2['name'].unique()

set1 = set(list1)
set2 = set(list2)

equal_elements = set1.intersection(set2)
different_elements = set1.symmetric_difference(set2)

different_elements

{'Afghanistan, Islamic Republic of',
 'Andorra',
 'Anguilla',
 'Antigua and Barbuda',
 'Armenia',
 'Armenia, Republic of',
 'Aruba',
 'Azerbaijan',
 'Azerbaijan, Republic of',
 'Bahamas, The',
 'Bahrain',
 'Bahrain, Kingdom of',
 'Barbados',
 'Belgium-Luxembourg',
 'Belize',
 'Bermuda',
 'Bosnia and Herzegovina',
 'Brunei Darussalam',
 'Cayman Islands',
 'China',
 'China, P.R.: Hong Kong',
 'China, P.R.: Macao',
 'China, P.R.: Mainland',
 'Congo, Dem. Rep.',
 'Congo, Democratic Republic of',
 'Congo, Rep.',
 'Congo, Republic of',
 'Cuba',
 'Czech Republic',
 'Czechia',
 'Djibouti',
 'Dominica',
 'Egypt',
 'Egypt, Arab Rep.',
 'Eswatini',
 'Faroe Islands',
 'French Territories: French Polynesia',
 'French Territories: New Caledonia',
 'Gibraltar',
 'Greenland',
 'Grenada',
 'Iceland',
 'Iran, Islamic Rep.',
 'Iran, Islamic Republic of',
 'Iraq',
 'Kiribati',
 "Korea, Democratic People's Rep. of",
 'Korea, Rep.',
 'Korea, Republic of',
 'Lao PDR',
 "Lao People's Democratic Republic",
 'M

In [120]:
diversity['name'] = [x.replace('Armenia, Republic of', "Armenia") for x in diversity['name']]
diversity['name'] = [x.replace('Azerbaijan, Republic of', "Azerbaijan") for x in diversity['name']]
diversity['name'] = [x.replace('Bahrain, Kingdom of', "Bahrain") for x in diversity['name']]
diversity['name'] = [x.replace('China, P.R.: Mainland', "China") for x in diversity['name']]
diversity['name'] = [x.replace('Congo, Democratic Republic of', "Congo, Dem. Rep.") for x in diversity['name']]
diversity['name'] = [x.replace('Congo, Republic of', "Congo, Rep.") for x in diversity['name']]
diversity['name'] = [x.replace('Czech Republic', "Czechia") for x in diversity['name']]
diversity['name'] = [x.replace('Egypt', "Egypt, Arab Rep.") for x in diversity['name']]
diversity['name'] = [x.replace('Swaziland', "Eswatini") for x in diversity['name']]
diversity['name'] = [x.replace('Iran, Islamic Republic of', "Iran, Islamic Rep.") for x in diversity['name']]
diversity['name'] = [x.replace('Korea, Republic of', "Korea, Rep.") for x in diversity['name']]
diversity['name'] = [x.replace( "Lao People's Democratic Republic", 'Lao PDR') for x in diversity['name']]
diversity['name'] = [x.replace('Turkey', "Turkiye") for x in diversity['name']]
diversity['name'] = [x.replace('Venezuela, Republica Bolivariana de', "Venezuela, RB") for x in diversity['name']]
diversity['name'] = [x.replace('Yemen, Republic of', "Yemen, Rep.") for x in diversity['name']]

In [121]:
countries_to_drop4 = ['Afghanistan, Islamic Republic of', 'Andorra', 'Anguilla', 'Antigua and Barbuda', 'Aruba', 'Bahamas, The',
                      'Barbados', 'Belgium-Luxembourg', 'Belize', 'Bermuda', 'Bosnia and Herzegovina', 'Brunei Darussalam', 
                      'Cayman Islands', 'China, P.R.: Hong Kong', 'China, P.R.: Macao', 'Cuba', 'Djibouti', 'Dominica', 
                      'Faroe Islands', 'French Territories: French Polynesia', 'French Territories: New Caledonia', 'Gibraltar',
                      'Greenland', 'Grenada', 'Iceland', 'Iraq', 'Kiribati', "Korea, Democratic People's Rep. of", 'Maldives', 'Malta', 'Marshall Islands, Republic of', 'Montserrat', 
                      'Netherlands Antilles', 'North Macedonia, Republic of', 'Panama', 'Samoa', 'Sao Tome and Principe', 
                      'Serbia and Montenegro', 'Seychelles', 'Somalia', 'South Sudan', 'St. Kitts and Nevis', 'St. Lucia', 
                      'St. Vincent and the Grenadines', 'Sudan', 'Tonga', 'Turkmenistan', 'Tuvalu', 'Uzbekistan', 'Vanuatu', 
                      'Virgin Islands, British']

diversity = diversity[~diversity['name'].isin(countries_to_drop4)]

In [122]:
# Converting the year column in int64 since it is not in ideal format
diversity['year'] = diversity['year'].astype('int64')

In [123]:
# Merging df2 and diversity
df3 = pd.merge(df2, diversity, left_on=['name', 'year'], right_on=['name', 'year'])
df3 = df3.rename(columns={'diversity':'diversity'})

In [124]:
df3.head()

Unnamed: 0,name,economy,year,fdi,property_time,cost_procedures,business_time,business_procedure,tribute_time,governement_spending,exchange,inflation,gdp_pc,unemployment,democ,tax_burden,diversity
0,Angola,AGO,1995,8.529489,,,,,,,,1825.495149,398.120223,,-88,61.6,6.002369
1,Angola,AGO,1996,2.76277,,,,,,,,4800.531644,454.375004,,-88,54.6,6.04876
2,Angola,AGO,1997,5.36336,,,,,,,,95.453022,516.127849,,0,52.6,5.991146
3,Angola,AGO,1998,17.121191,,,,,,,,39.359348,423.403332,,0,59.1,5.882152
4,Angola,AGO,1999,40.167251,,,,,,,,557.501113,387.689415,,0,47.9,5.903543


### Quality of exports

In [125]:
# Importing the data
quality = pd.read_excel("quality.xlsx")

# Renaming the country column
quality = quality.rename(columns={'country':'name'})

# Reshaping
quality = pd.melt(quality, id_vars='name', var_name='year', value_name='quality')

In [126]:
# checking the df info
quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3960 entries, 0 to 3959
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   name     3960 non-null   object 
 1   year     3960 non-null   object 
 2   quality  3853 non-null   float64
dtypes: float64(1), object(2)
memory usage: 92.9+ KB


In [127]:
# Transforming the year column into int64
quality['year'] = quality['year'].astype('int64')

In [128]:
list1 = quality['name'].unique()
list2 = df3['name'].unique()

set1 = set(list1)
set2 = set(list2)

equal_elements = set1.intersection(set2)
different_elements = set1.symmetric_difference(set2)

different_elements

{'Antigua and Barbuda',
 'Armenia',
 'Armenia, Republic of',
 'Azerbaijan',
 'Azerbaijan, Republic of',
 'Bahamas, The',
 'Bahrain',
 'Bahrain, Kingdom of',
 'Barbados',
 'Belize',
 'Bhutan',
 'Bosnia and Herzegovina',
 'China',
 'China, P.R.: Hong Kong',
 'China, P.R.: Mainland',
 'Congo, Dem. Rep.',
 'Congo, Democratic Republic of',
 'Congo, Rep.',
 'Congo, Republic of',
 'Croatia',
 'Czech Republic',
 'Czechia',
 'Djibouti',
 'Dominica',
 'Egypt',
 'Egypt, Arab Rep.',
 'Eritrea',
 'Eswatini',
 'Grenada',
 'Guyana',
 'Iceland',
 'Iran, Islamic Rep.',
 'Iran, Islamic Republic of',
 'Iraq',
 'Korea, Rep.',
 'Korea, Republic of',
 'Lao PDR',
 "Lao People's Democratic Republic",
 'Libya',
 'Maldives',
 'Malta',
 'North Macedonia, Republic of',
 'Panama',
 'Papua New Guinea',
 'Sao Tome and Principe',
 'Seychelles',
 'Solomon Islands',
 'St. Kitts and Nevis',
 'St. Lucia',
 'St. Vincent and the Grenadines',
 'Sudan',
 'Swaziland',
 'Turkey',
 'Turkiye',
 'Turkmenistan',
 'Uzbekistan',
 'V

In [129]:
quality['name'] = [x.replace('Armenia, Republic of', "Armenia") for x in quality['name']]
quality['name'] = [x.replace('Azerbaijan, Republic of', "Azerbaijan") for x in quality['name']]
quality['name'] = [x.replace('Bahrain, Kingdom of', "Bahrain") for x in quality['name']]
quality['name'] = [x.replace('China, P.R.: Mainland', "China") for x in quality['name']]
quality['name'] = [x.replace('Congo, Republic of', "Congo, Rep.") for x in quality['name']]
quality['name'] = [x.replace('Congo, Democratic Republic of', "Congo, Dem. Rep.") for x in quality['name']]
quality['name'] = [x.replace('Czech Republic', "Czechia") for x in quality['name']]
quality['name'] = [x.replace('Egypt', "Egypt, Arab Rep.") for x in quality['name']]
quality['name'] = [x.replace('Swaziland', "Eswatini") for x in quality['name']]
quality['name'] = [x.replace('Iran, Islamic Republic of', "Iran, Islamic Rep.") for x in quality['name']]
quality['name'] = [x.replace('Korea, Republic of', "Korea, Rep.") for x in quality['name']]
quality['name'] = [x.replace("Lao People's Democratic Republic", 'Lao PDR') for x in quality['name']]
quality['name'] = [x.replace('Turkey', "Turkiye") for x in quality['name']]
quality['name'] = [x.replace('Venezuela, Republica Bolivariana de', 'Venezuela, RB') for x in quality['name']]
quality['name'] = [x.replace('Yemen, Republic of', 'Yemen, Rep.') for x in quality['name']]

In [130]:
countries_to_drop5 = ['Antigua and Barbuda', 'Bahamas, The', 'Barbados', 'Belize', 'Bhutan', 'Bosnia and Herzegovina', 
                      'China, P.R.: Hong Kong', 'Croatia', 'Djibouti', 'Dominica', 'Eritrea', 'Grenada', 'Guyana', 'Iceland', 
                      'Iraq', 'Libya', 'Maldives', 'Malta', 'North Macedonia, Republic of', 'Panama', 'Papua New Guinea', 
                      'Sao Tome and Principe', 'Seychelles', 'Solomon Islands', 'St. Kitts and Nevis', 'St. Lucia', 
                      'St. Vincent and the Grenadines', 'Sudan', 'Turkmenistan', 'Uzbekistan']

quality = quality[~quality['name'].isin(countries_to_drop5)]

In [131]:

countries_to_drop6 = ['Bhutan', 'Croatia', 'Eritrea', 'Guyana', 'Libya', 'Papua New Guinea', 'Solomon Islands']
df3 = df3[~df3['name'].isin(countries_to_drop6)]

In [132]:
df4 = pd.merge(df3, quality, left_on=['name', 'year'], right_on=['name', 'year'])
df4 = df4.rename(columns={'quality':'quality'})

In [133]:
df4['name'] = [x.replace('Central African Republic', 'central_african_republic') for x in df4['name']]
df4['name'] = [x.replace('Congo, Dem. Rep.', 'congo_dem_rep') for x in df4['name']]
df4['name'] = [x.replace('Guinea-Bissau', 'guinea_bissau') for x in df4['name']]
df4['name'] = [x.replace('Turkiye', 'turkey') for x in df4['name']]
df4['name'] = [x.replace('Syrian Arab Republic', 'syria') for x in df4['name']]
df4['name'] = [x.replace('Czechia', 'czech_republic') for x in df4['name']]
df4['name'] = [x.replace('Lao PDR', 'laos') for x in df4['name']]

In [134]:
# Selecting only the first name of each country
df4['name'] = df4['name'].str.split(',').str[0].str.strip()

# Only lowercase letters
df4['name'] = df4['name'].str.lower()

# removing accentuation
df4['name'] = df4['name'].apply(lambda x: unidecode(x)) 

# replacing blank space by _
df4['name'] = [x.replace(' ', '_') for x in df4['name']]

### Merging the shadow economy data (target variable) with the other variables

In [135]:
list1 = se_medina['country'].unique()
list2 = df4['name'].unique()

set1 = set(list1)
set2 = set(list2)

equal_elements = set1.intersection(set2)
different_elements = set1.symmetric_difference(set2)

different_elements

{'bahamas',
 'belize',
 'bhutan',
 'bosnia_and_herzegovina',
 'brunei',
 'croatia',
 'eritrea',
 'eswatini',
 'guyana',
 'hong_kong',
 'iceland',
 'libya',
 'maldives',
 'malta',
 'papua_new_guinea',
 'solomon_islands',
 'swaziland',
 'taiwan',
 'vietnam'}

In [136]:
list1 = se_medina['country'].unique()
list2 = df4['name'].unique()

set1 = set(list1)
set2 = set(list2)

equal_elements = set1.intersection(set2)
different_elements = set1.symmetric_difference(set2)

# Exibindo elementos diferentes e seus conjuntos
for element in different_elements:
    if element in set1:
        print(f'{element} is in set 1')
    else:
        print(f'{element} is in set 2')

brunei is in set 1
vietnam is in set 1
eritrea is in set 1
papua_new_guinea is in set 1
bahamas is in set 1
eswatini is in set 2
iceland is in set 1
bhutan is in set 1
solomon_islands is in set 1
guyana is in set 1
taiwan is in set 1
libya is in set 1
malta is in set 1
croatia is in set 1
swaziland is in set 1
maldives is in set 1
hong_kong is in set 1
bosnia_and_herzegovina is in set 1
belize is in set 1


In [137]:
# removing the countries that are in se_medina but not in df4
countries_to_drop7 = ['bahamas','belize','bhutan','bosnia_and_herzegovina','brunei','croatia','eritrea','guyana','hong_kong',
                      'iceland','libya','maldives','malta','papua_new_guinea','solomon_islands','taiwan']

se_medina = se_medina[~se_medina['country'].isin(countries_to_drop7)]

In [138]:
# Renaming the rows that write swaziland by eswatini
se_medina['country'] = [x.replace('swaziland', 'eswatini') for x in se_medina['country']]

# renaming the column
df4 = df4.rename(columns={'name':'country'})

In [139]:
# merging df4 and se_medina
df5 = pd.merge(df4, se_medina, left_on=['country', 'year'], right_on=['country', 'year'])

In [140]:
# Droping the economy column since it's unnecesary
df5 = df5.drop('economy', axis=1)

### Dealing with missing data

In [141]:
df5.head()

Unnamed: 0,country,year,fdi,property_time,cost_procedures,business_time,business_procedure,tribute_time,governement_spending,exchange,inflation,gdp_pc,unemployment,democ,tax_burden,diversity,quality,se_medina
0,angola,1995,8.529489,,,,,,,,1825.495149,398.120223,,-88,61.6,6.002369,0.37,52.47
1,angola,1996,2.76277,,,,,,,,4800.531644,454.375004,,-88,54.6,6.04876,0.4,46.3
2,angola,1997,5.36336,,,,,,,,95.453022,516.127849,,0,52.6,5.991146,0.35,50.48
3,angola,1998,17.121191,,,,,,,,39.359348,423.403332,,0,59.1,5.882152,0.34,49.21
4,angola,1999,40.167251,,,,,,,,557.501113,387.689415,,0,47.9,5.903543,0.34,48.64


In [142]:
# Missing data by column (%)
df5.isnull().mean().sort_values(ascending=False)*100

tribute_time            53.687943
property_time           49.645390
cost_procedures         45.638298
business_time           45.638298
business_procedure      45.638298
unemployment            35.141844
governement_spending     8.758865
exchange                 6.773050
tax_burden               4.645390
quality                  1.205674
fdi                      0.638298
inflation                0.638298
gdp_pc                   0.177305
diversity                0.000000
country                  0.000000
democ                    0.000000
year                     0.000000
se_medina                0.000000
dtype: float64

There is a lot of missing data, especially in relation to the time tribute_time, however, if we limit the year range to 2014, the proportion of missing data will decrease

In [143]:
df5 = df5[(df5['year']>=2004) & (df5['year'] <= 2014)]

In [144]:
# porportion of missing data after removing the 2015 year
df5.isnull().mean().sort_values(ascending=False)*100

unemployment            30.496454
tribute_time            15.796260
property_time            8.446164
cost_procedures          8.446164
business_time            8.446164
business_procedure       8.446164
governement_spending     6.963250
exchange                 5.673759
tax_burden               1.289491
fdi                      0.193424
democ                    0.000000
quality                  0.000000
diversity                0.000000
country                  0.000000
gdp_pc                   0.000000
inflation                0.000000
year                     0.000000
se_medina                0.000000
dtype: float64

In [145]:
# Proportion of missing data by country in the column 'unemployment' since it's the variable with the biggest proportion of missing data
missing_by_country = df5.groupby('country')['unemployment'].apply(lambda x: x.isnull().sum() / x.count()).sort_values(ascending=False)
#print(missing_by_country)

In [146]:
# Collecting the names of countries that have a proportion of missing data in the unemployment column above 5%
#countries = missing_by_country[missing_by_country > 5].index.tolist()
#countries

# Removing these countries
#df5 = df5[~df5['country'].isin(countries)]

In [147]:
len(df5['country'].unique())

141

In [148]:
# Replacing missing values ​​with the average for each country for each attribute
mean_by_country = df5.groupby('country').transform('mean')
df5.fillna(mean_by_country, inplace=True)

In [149]:
df5.head(11)

Unnamed: 0,country,year,fdi,property_time,cost_procedures,business_time,business_procedure,tribute_time,governement_spending,exchange,inflation,gdp_pc,unemployment,democ,tax_burden,diversity,quality,se_medina
9,angola,2004,9.329239,335.0,910.0,83.0,12.0,289.0,14.312063,103.579947,33.443595,1254.696126,23.643,2,84.916689,6.330621,0.4,46.81
10,angola,2005,-3.526657,335.0,653.8,83.0,12.0,284.0,16.029037,106.590962,42.374249,1900.723816,12.6408,2,84.916689,6.245515,0.42,43.84
11,angola,2006,-0.072001,335.0,498.2,83.0,12.0,284.0,15.341722,94.625159,17.115665,2597.963585,12.6408,2,84.9,6.295336,0.4,41.23
12,angola,2007,-1.368762,335.0,343.7,83.0,12.0,284.0,15.536935,108.060068,4.308432,3121.348735,12.6408,2,85.0,6.211946,0.45,37.13
13,angola,2008,1.896314,335.0,196.8,68.0,8.0,284.0,16.814612,121.364708,19.365774,4081.717497,12.6408,2,85.1751,6.315526,0.47,35.26
14,angola,2009,3.136661,190.0,151.1,68.0,8.0,284.0,19.898586,122.446144,-16.76214,3123.698898,3.782,2,85.1751,6.278077,0.44,36.25
15,angola,2010,-3.851112,190.0,226.6,66.0,8.0,294.0,17.042346,104.123635,32.270469,3586.66368,9.43,2,85.1,6.255853,0.42,36.54
16,angola,2011,-2.704873,190.0,163.1,66.0,8.0,294.0,18.235859,99.982506,31.77146,4608.155166,16.77,2,84.5,6.25497,0.45,36.49
17,angola,2012,-1.143768,190.0,143.1,66.0,8.0,294.0,17.842633,91.800097,7.25575,5083.826851,12.6408,2,84.1,6.106215,0.46,36.6
18,angola,2013,-5.380131,190.0,130.1,66.0,8.0,294.0,21.621949,86.811933,2.839724,5061.34924,12.6408,2,82.6,6.126807,0.42,35.92


In [150]:
df5.isnull().mean().sort_values(ascending=False)*100

unemployment            4.964539
governement_spending    4.964539
exchange                4.255319
inflation               0.000000
quality                 0.000000
diversity               0.000000
tax_burden              0.000000
democ                   0.000000
gdp_pc                  0.000000
country                 0.000000
year                    0.000000
tribute_time            0.000000
business_procedure      0.000000
business_time           0.000000
cost_procedures         0.000000
property_time           0.000000
fdi                     0.000000
se_medina               0.000000
dtype: float64

After the imputation, 'government spending' and 'exchange' still having missing values, this happens because some countries have the 'inf' as observations and not a usual 

In [151]:
# This is seen with the follow code for governement_spending and after for exchange (just change governement_spending by exchange to see all the countries)
missing_by_country = df5.groupby('country')['governement_spending'].apply(lambda x: x.isnull().sum() / x.count()).sort_values(ascending=False)
#print(missing_by_country)

In [152]:
# Droping these countries
countries_to_drop8 = ['malawi', 'trinidad_and_tobago', 'liberia', 'yemen', "cote_d'ivoire", "nigeria"]
df5 = df5[~df5['country'].isin(countries_to_drop8)]

In [153]:
# No more missing values
df5.isnull().mean().sort_values(ascending=False)

unemployment            0.051852
exchange                0.014815
governement_spending    0.014815
inflation               0.000000
quality                 0.000000
diversity               0.000000
tax_burden              0.000000
democ                   0.000000
gdp_pc                  0.000000
country                 0.000000
year                    0.000000
tribute_time            0.000000
business_procedure      0.000000
business_time           0.000000
cost_procedures         0.000000
property_time           0.000000
fdi                     0.000000
se_medina               0.000000
dtype: float64

### Creating variables

In [156]:
df5.head()

Unnamed: 0,country,year,fdi,property_time,cost_procedures,business_time,business_procedure,tribute_time,governement_spending,exchange,inflation,gdp_pc,unemployment,democ,tax_burden,diversity,quality,se_medina
9,angola,2004,9.329239,335.0,910.0,83.0,12.0,289.0,14.312063,103.579947,33.443595,1254.696126,23.643,closed_anocracy,84.916689,6.330621,0.4,46.81
10,angola,2005,-3.526657,335.0,653.8,83.0,12.0,284.0,16.029037,106.590962,42.374249,1900.723816,12.6408,closed_anocracy,84.916689,6.245515,0.42,43.84
11,angola,2006,-0.072001,335.0,498.2,83.0,12.0,284.0,15.341722,94.625159,17.115665,2597.963585,12.6408,closed_anocracy,84.9,6.295336,0.4,41.23
12,angola,2007,-1.368762,335.0,343.7,83.0,12.0,284.0,15.536935,108.060068,4.308432,3121.348735,12.6408,closed_anocracy,85.0,6.211946,0.45,37.13
13,angola,2008,1.896314,335.0,196.8,68.0,8.0,284.0,16.814612,121.364708,19.365774,4081.717497,12.6408,closed_anocracy,85.1751,6.315526,0.47,35.26


In [155]:
# Renaming the classes in democ column
df5['democ'] = df5['democ'].replace(-88, 'transition')
df5['democ'] = df5['democ'].replace(-77, 'interregnum')
df5['democ'] = df5['democ'].replace(-66, 'interruption')
df5['democ'] = df5['democ'].replace(0, 'autocracy')
df5['democ'] = df5['democ'].replace(1, 'closed_anocracy')
df5['democ'] = df5['democ'].replace(2, 'closed_anocracy')
df5['democ'] = df5['democ'].replace(3, 'open_anocracy')
df5['democ'] = df5['democ'].replace(4, 'open_anocracy')
df5['democ'] = df5['democ'].replace(5, 'open_anocracy')
df5['democ'] = df5['democ'].replace(6, 'open_anocracy')
df5['democ'] = df5['democ'].replace(7, 'democracy')
df5['democ'] = df5['democ'].replace(8, 'democracy')
df5['democ'] = df5['democ'].replace(9, 'democracy')
df5['democ'] = df5['democ'].replace(10, 'full_democracy')

In [175]:
# Creating dummies regarding the democ variables
df6 = pd.get_dummies(df5['democ'], prefix='democ', drop_first='True')
df6 = df6.astype(int)
df6 = pd.concat([df5, df6], axis=1)

In [177]:
#  Calculate the natural logarithm of the 'gdp_pc' column
df6['lgdp_pc'] = np.log(df6['gdp_pc'])

In [180]:
# Droping columns that will not be used
df6 = df6.drop(['country', 'year', 'gdp_pc'], axis=1)

In [181]:
df6.head()

Unnamed: 0,fdi,property_time,cost_procedures,business_time,business_procedure,tribute_time,governement_spending,exchange,inflation,unemployment,democ,tax_burden,diversity,quality,se_medina,democ_closed_anocracy,democ_democracy,democ_full_democracy,democ_interregnum,democ_interruption,democ_open_anocracy,democ_transition,lgdp_pc
9,9.329239,335.0,910.0,83.0,12.0,289.0,14.312063,103.579947,33.443595,23.643,closed_anocracy,84.916689,6.330621,0.4,46.81,1,0,0,0,0,0,0,7.134649
10,-3.526657,335.0,653.8,83.0,12.0,284.0,16.029037,106.590962,42.374249,12.6408,closed_anocracy,84.916689,6.245515,0.42,43.84,1,0,0,0,0,0,0,7.54999
11,-0.072001,335.0,498.2,83.0,12.0,284.0,15.341722,94.625159,17.115665,12.6408,closed_anocracy,84.9,6.295336,0.4,41.23,1,0,0,0,0,0,0,7.862483
12,-1.368762,335.0,343.7,83.0,12.0,284.0,15.536935,108.060068,4.308432,12.6408,closed_anocracy,85.0,6.211946,0.45,37.13,1,0,0,0,0,0,0,8.04602
13,1.896314,335.0,196.8,68.0,8.0,284.0,16.814612,121.364708,19.365774,12.6408,closed_anocracy,85.1751,6.315526,0.47,35.26,1,0,0,0,0,0,0,8.314273


In [197]:

# Renaming the columns for better visualization
rename_dictionary = {'se_medina':'Shadow Economy',
                        'lgdp_pc': 'Log. GDP Per Capita', 
                         'democ_transition': 'Dem. Transition', 
                         'democ_interruption': 'Dem. Interruption',
                        'democ_interregnum':'Dem. Interregnum',
                        'democ_full_democracy': 'Dem. Full',
                        'democ_democracy':'Democracy',
                        'democ_autocracy':'Autocracy',
                        'democ_closed_anocracy':'Closed Anocracy',
                        'democ_open_anocracy': 'Open Anocracy',
                        'quality':'Quality',
                        'diversity':'Diversity',
                        'tax_burden':'Tax Burden',
                        'unemployment':'Unemployment',
                        'inflation':'Inflation',
                        'exchange':'Exchange',
                        'governement_spending':'Government Spending',
                         'tribute_time':'Tribute Time',
                        'business_procedure':'Business Procedure',
                        'business_time':'Business Time',
                        'cost_procedures':'Cost Procedures',
                        'property_time':'Property Time',
                        'fdi':'FDI'}

df6 = df6.rename(columns=rename_dictionary)
df6.head()


Unnamed: 0,FDI,Property Time,Cost Procedures,Business Time,Business Procedure,Tribute Time,Government Spending,Exchange,Inflation,Unemployment,democ,Tax Burden,Diversity,Quality,Shadow Economy,Closed Anocracy,Democracy,Dem. Full,Dem. Interregnum,Dem. Interruption,Open Anocracy,Dem. Transition,Log. GDP Per Capita
9,9.329239,335.0,910.0,83.0,12.0,289.0,14.312063,103.579947,33.443595,23.643,closed_anocracy,84.916689,6.330621,0.4,46.81,1,0,0,0,0,0,0,7.134649
10,-3.526657,335.0,653.8,83.0,12.0,284.0,16.029037,106.590962,42.374249,12.6408,closed_anocracy,84.916689,6.245515,0.42,43.84,1,0,0,0,0,0,0,7.54999
11,-0.072001,335.0,498.2,83.0,12.0,284.0,15.341722,94.625159,17.115665,12.6408,closed_anocracy,84.9,6.295336,0.4,41.23,1,0,0,0,0,0,0,7.862483
12,-1.368762,335.0,343.7,83.0,12.0,284.0,15.536935,108.060068,4.308432,12.6408,closed_anocracy,85.0,6.211946,0.45,37.13,1,0,0,0,0,0,0,8.04602
13,1.896314,335.0,196.8,68.0,8.0,284.0,16.814612,121.364708,19.365774,12.6408,closed_anocracy,85.1751,6.315526,0.47,35.26,1,0,0,0,0,0,0,8.314273


In [198]:
# droping the column 'democ'
df6 = df6.drop('democ', axis=1)

In [206]:
# Changing the order
new_order2 = ['Shadow Economy', 'FDI', 'Property Time', 'Cost Procedures', 'Business Time', 'Business Procedure',
               'Tribute Time', 'Government Spending', 'Exchange', 'Inflation', 'Unemployment', 'Tax Burden', 
               'Diversity', 'Quality', 'Closed Anocracy', 'Democracy', 'Dem. Full', 'Dem. Interregnum', 
               'Dem. Interruption', 'Open Anocracy', 'Dem. Transition', 'Log. GDP Per Capita']

df6 = df6.reindex(columns=new_order2)

In [208]:
df6.head()

Unnamed: 0,Shadow Economy,FDI,Property Time,Cost Procedures,Business Time,Business Procedure,Tribute Time,Government Spending,Exchange,Inflation,Unemployment,Tax Burden,Diversity,Quality,Closed Anocracy,Democracy,Dem. Full,Dem. Interregnum,Dem. Interruption,Open Anocracy,Dem. Transition,Log. GDP Per Capita
9,46.81,9.33,335.0,910.0,83.0,12.0,289.0,14.31,103.58,33.44,23.64,84.92,6.33,0.4,1,0,0,0,0,0,0,7.13
10,43.84,-3.53,335.0,653.8,83.0,12.0,284.0,16.03,106.59,42.37,12.64,84.92,6.25,0.42,1,0,0,0,0,0,0,7.55
11,41.23,-0.07,335.0,498.2,83.0,12.0,284.0,15.34,94.63,17.12,12.64,84.9,6.3,0.4,1,0,0,0,0,0,0,7.86
12,37.13,-1.37,335.0,343.7,83.0,12.0,284.0,15.54,108.06,4.31,12.64,85.0,6.21,0.45,1,0,0,0,0,0,0,8.05
13,35.26,1.9,335.0,196.8,68.0,8.0,284.0,16.81,121.36,19.37,12.64,85.18,6.32,0.47,1,0,0,0,0,0,0,8.31


### Data Padronization

In [217]:
scaler = MinMaxScaler()
final_df = scaler.fit_transform(df6.values)
final_df = pd.DataFrame(final_df, columns=df6.columns)

In [218]:
final_df.head()

Unnamed: 0,Shadow Economy,FDI,Property Time,Cost Procedures,Business Time,Business Procedure,Tribute Time,Government Spending,Exchange,Inflation,Unemployment,Tax Burden,Diversity,Quality,Closed Anocracy,Democracy,Dem. Full,Dem. Interregnum,Dem. Interruption,Open Anocracy,Dem. Transition,Log. GDP Per Capita
0,0.65,0.2,0.48,0.61,0.12,0.58,0.11,0.3,0.2,0.47,0.66,0.78,1.0,0.22,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33
1,0.6,0.16,0.48,0.44,0.12,0.58,0.11,0.34,0.2,0.54,0.35,0.78,0.98,0.24,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.39
2,0.56,0.17,0.48,0.33,0.12,0.58,0.11,0.32,0.17,0.34,0.35,0.78,0.99,0.22,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.44
3,0.49,0.17,0.48,0.23,0.12,0.58,0.11,0.33,0.21,0.23,0.35,0.78,0.98,0.28,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.46
4,0.46,0.18,0.48,0.13,0.1,0.37,0.11,0.36,0.24,0.35,0.35,0.78,1.0,0.3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
