In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns

%matplotlib inline

In [2]:
rgdp_df = pd.read_csv("data/Real_GDP_Growth_original.csv", na_values=['(NA)']).fillna(0)

In [3]:
rgdp_df.head(10)

Unnamed: 0,Real GDP growth (Annual percent change),1980,1981,1982,1983,1984,1985,1986,1987,1988,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Afghanistan,no data,no data,no data,no data,no data,no data,no data,no data,no data,...,14.0,5.7,2.7,1.0,2.2,2.9,2.7,3.0,-3.0,4.5
2,Albania,2.7,5.7,2.9,1.1,2,-1.5,5.6,-0.8,-1.4,...,1.4,1.0,1.8,2.2,3.3,3.8,4.1,2.2,-5.0,8.0
3,Algeria,-5.4,3,6.4,5.4,5.6,5.6,-0.2,-0.7,-1.9,...,3.4,2.8,3.8,3.7,3.2,1.3,1.4,0.7,-5.2,6.2
4,Angola,2.4,-4.4,0,4.2,6,3.5,2.9,4.1,6.1,...,8.5,5.0,4.8,0.9,-2.6,-0.2,-1.2,-1.5,-1.4,2.6
5,Antigua and Barbuda,8.2,3.8,-0.1,5.4,10.2,7.6,11.5,6.6,5.2,...,3.4,-0.6,3.8,3.8,5.5,3.1,7.4,5.3,-10.0,8.0
6,Argentina,0.7,-5.7,-3.1,3.7,2,-7,7.1,2.5,-2,...,-1.0,2.4,-2.5,2.7,-2.1,2.7,-2.5,-2.2,-5.7,4.4
7,Armenia,no data,no data,no data,no data,no data,no data,no data,no data,no data,...,7.1,3.4,3.6,3.3,0.2,7.5,5.2,7.6,-1.5,4.8
8,Aruba,no data,no data,no data,no data,no data,no data,no data,no data,no data,...,-1.4,4.2,0.9,-0.4,0.5,2.3,1.2,0.4,-13.7,12.1
9,Australia,2.9,4.1,0.1,-0.5,6.3,5.5,2.4,4.9,4.3,...,3.8,2.1,2.6,2.3,2.8,2.5,2.7,1.8,-6.7,6.1


In [4]:
# Pulling out unnecessary columns
rgdp_df = rgdp_df[['Real GDP growth (Annual percent change)','2008','2009','2010',
                   '2011','2012','2013','2014','2015','2016','2017','2018']]

# Changing first column name to country
rgdp_df.rename(columns = {'Real GDP growth (Annual percent change)': 'country'}, inplace = True)

# Changing "no data" string to NaN (this was causing issues in other sections)
cols = rgdp_df.columns
rgdp_df[cols].replace('no data', np.NaN, inplace = True)


# rgdp_df.set_index('country', inplace = True)
rgdp_df = rgdp_df.dropna()

In [5]:
# Create a column calculating the trend of the growth rate between 2008 to 2018
temp = rgdp_df[['2008','2009','2010', '2011','2012','2013','2014','2015','2016','2017','2018']]
temp = temp.rename(columns = {"2008":2008, "2009":2009, "2010":2010, "2011":2011, "2012":2012,
                             "2013":2013, "2014":2014, "2015":2015, "2016":2016, "2017":2017, "2018":2018})
temp
# slope = temp.apply(lambda x: np.polyfit(x.dropna().index,x.dropna(),1)[0],axis=1)
# slope

Unnamed: 0,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,0,0,0,0,0,0,0,0,0,0,0
1,3.9,20.6,8.4,6.5,14,5.7,2.7,1,2.2,2.9,2.7
2,7.5,3.4,3.7,2.5,1.4,1,1.8,2.2,3.3,3.8,4.1
3,2.4,1.6,3.6,2.8,3.4,2.8,3.8,3.7,3.2,1.3,1.4
4,11.2,0.9,4.9,3.5,8.5,5,4.8,0.9,-2.6,-0.2,-1.2
...,...,...,...,...,...,...,...,...,...,...,...
224,1.8,-0.8,5.9,3.4,2.2,2.5,3,2.3,2.6,3,2.6
225,5.8,3.9,7.1,5.1,4.7,5.2,5.1,3.2,1.4,3,3.3
226,3,-0.1,5.4,4.3,3.5,3.5,3.6,3.5,3.4,3.9,3.6
227,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Polyfit was not working, need to convert testing column values
temp[2009].str.isnumeric()

# Found that columns were needed to be converted to float types instead of objects
cols = temp.columns[temp.dtypes.eq('object')]
temp[cols] = temp[cols].apply(pd.to_numeric, errors='coerce')
# temp.dtypes

In [7]:
# Now that I've converted to float types, polyfit now works
slope = temp.apply(lambda x: np.polyfit(x.dropna().index, x.dropna(),1)[0],axis=1)
slope

0      0.000000
1     -1.070000
2     -0.152727
3     -0.047273
4     -0.889091
         ...   
224    0.071818
225   -0.332727
226    0.104545
227    0.000000
228    0.000000
Length: 229, dtype: float64

In [8]:
rgdp_df.loc[:,'bf_gdp_growth_rate'] = slope
rgdp_df

Unnamed: 0,country,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,bf_gdp_growth_rate
0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000
1,Afghanistan,3.9,20.6,8.4,6.5,14,5.7,2.7,1,2.2,2.9,2.7,-1.070000
2,Albania,7.5,3.4,3.7,2.5,1.4,1,1.8,2.2,3.3,3.8,4.1,-0.152727
3,Algeria,2.4,1.6,3.6,2.8,3.4,2.8,3.8,3.7,3.2,1.3,1.4,-0.047273
4,Angola,11.2,0.9,4.9,3.5,8.5,5,4.8,0.9,-2.6,-0.2,-1.2,-0.889091
...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,Other advanced economies,1.8,-0.8,5.9,3.4,2.2,2.5,3,2.3,2.6,3,2.6,0.071818
225,Sub-Saharan Africa,5.8,3.9,7.1,5.1,4.7,5.2,5.1,3.2,1.4,3,3.3,-0.332727
226,World,3,-0.1,5.4,4.3,3.5,3.5,3.6,3.5,3.4,3.9,3.6,0.104545
227,0,0,0,0,0,0,0,0,0,0,0,0,0.000000


In [9]:
# converting years in a single column with datetime format
years = [str(x) for x in range(2008, 2019)] 
rgdp2_df = rgdp_df.melt(id_vars = ['country','bf_gdp_growth_rate'], value_vars = years, var_name = 'year',
                          value_name = 'Real_GDP_growth')
rgdp2_df['year'] = pd.to_datetime(rgdp2_df['year'], format = '%Y')
rgdp2_df.set_index('country')
rgdp2_df

Unnamed: 0,country,bf_gdp_growth_rate,year,Real_GDP_growth
0,0,0.000000,2008-01-01,0
1,Afghanistan,-1.070000,2008-01-01,3.9
2,Albania,-0.152727,2008-01-01,7.5
3,Algeria,-0.047273,2008-01-01,2.4
4,Angola,-0.889091,2008-01-01,11.2
...,...,...,...,...
2514,Other advanced economies,0.071818,2018-01-01,2.6
2515,Sub-Saharan Africa,-0.332727,2018-01-01,3.3
2516,World,0.104545,2018-01-01,3.6
2517,0,0.000000,2018-01-01,0


In [10]:
# Set the index to country
# rgdp2_df.set_index('country', inplace = True)

# Reorder the columns
column_titles = ['country', 'year','Real_GDP_growth','bf_gdp_growth_rate']
rgdp2_df = rgdp2_df.reindex(columns=column_titles)
rgdp2_df

Unnamed: 0,country,year,Real_GDP_growth,bf_gdp_growth_rate
0,0,2008-01-01,0,0.000000
1,Afghanistan,2008-01-01,3.9,-1.070000
2,Albania,2008-01-01,7.5,-0.152727
3,Algeria,2008-01-01,2.4,-0.047273
4,Angola,2008-01-01,11.2,-0.889091
...,...,...,...,...
2514,Other advanced economies,2018-01-01,2.6,0.071818
2515,Sub-Saharan Africa,2018-01-01,3.3,-0.332727
2516,World,2018-01-01,3.6,0.104545
2517,0,2018-01-01,0,0.000000


In [12]:
gini_df = pd.read_csv("data/final_gini_table_with_new_variables.csv")
gini_df['year'] = pd.to_datetime(gini_df['year'], format = '%Y-%m-%d')
rgdp2_df['Real_GDP_growth'] = rgdp2_df['Real_GDP_growth'].apply(pd.to_numeric, errors='coerce')
# rgdp2_df.dtypes
# rgdp2_df['year'] = pd.to_datetime(rgdp_df['year'], format = '%Y-%m-%d')
# rgdp_df.dtypes
# gini_df.set_index(['country','year'], inplace = True)
# rgdp2_df.dtypes

In [13]:
merger = rgdp2_df[['country', 'year','Real_GDP_growth', 'bf_gdp_growth_rate']]
gdp_growth_df = gini_df.merge(merger, how = 'left', on = ['country', 'year'])
gdp_growth_df

Unnamed: 0.1,Unnamed: 0,country,code3,code2,region,sub_region,gini_trend,country_class,year,gini,trade_ratio,gdp,trade,Real_GDP_growth,bf_gdp_growth_rate
0,0,Afghanistan,AFG,AF,Asia,Southern Asia,-0.40,developing,2009-01-01,,49.663426,1.243909e+10,6.177677e+09,20.6,-1.070000
1,1,Armenia,ARM,AM,Asia,Western Asia,0.10,developing,2009-01-01,31.76,57.274236,8.647937e+09,4.953040e+09,-14.1,0.596364
2,2,Australia,AUS,AU,Oceania,Australia and New Zealand,0.24,developed,2009-01-01,,45.797904,9.278052e+11,4.249153e+11,1.9,0.012727
3,3,Austria,AUT,AT,Europe,Western Europe,-0.78,developed,2009-01-01,34.48,87.062231,4.001723e+11,3.483989e+11,-3.8,0.243636
4,4,Bangladesh,BGD,BD,Asia,Southern Asia,0.23,developing,2009-01-01,,40.092796,1.024778e+11,4.108621e+10,5.3,0.235455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,1145,Uruguay,URY,UY,Americas,South America,-0.62,developing,2018-01-01,39.10,39.992244,5.959689e+10,2.383413e+10,1.6,-0.569091
1146,1146,Venezuela,VEN,VE,Americas,South America,0.23,developing,2018-01-01,,,,,-19.6,-2.284545
1147,1147,Vietnam,VNM,VN,Asia,South-eastern Asia,0.32,developing,2018-01-01,42.40,208.306666,2.452137e+11,5.107965e+11,7.1,0.145455
1148,1148,West Bank and Gaza,PSE,PS,Asia,Western Asia,-0.38,,2018-01-01,,79.600982,1.461590e+10,1.163440e+10,,


In [14]:
gdp_growth_df.to_csv("data/real_gdp_growth_new.csv")