In [15]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import csv

### The Education Index

The UN maintains and publishes an education index which is used by them to compute the Human Development Index. Their dataset, though not perfect, seems much better than the one we got from the worldbank website; especially if we consider it from 1999 onwards. Given that the purpose of the HDI is humanitarian, it suggests to me that this is probably as good a dataset as we’ll find for African countries. It can be downloaded here http://hdr.undp.org/en/indicators/103706. It is defined by the following formula:

$$ EI = \frac{ \frac{EYS}{18} + \frac{MYS}{15} }{2} $$

where $EYS$ is expected years of schooling and $MYS$ is average number of education years students over the age of 25 have actually received.

In [89]:
# ei = pd.read_csv("data/Education_index.csv") 

# the commented line above was throwing parsing errors, I think it's because the csv for the EI
# is weirdly formatted.
# We parse it manually below, using python's csv module.
with open("data/Education_index.csv",  encoding='latin-1') as csvfile:
    ei = csv.reader(csvfile)
    raw = [row[1:] for row in list(ei)[6:-1]]
    # all the even (excluding 0th column) columns are empty, i'll remove them below
    formatted_rows = []
    for row in raw:
        formatted_rows.append([row[0]] + row[1::2])
    
# Maybe there is a simpler way to do this, but this also works.

In [85]:
# Adds column names and converts the list of data to a pandas DataFrame object
cols = ['country'] + list(range(1990, 2020))
education = pd.DataFrame(formatted_rows, columns=cols)

In [83]:
# This is what the Education Index table looks like from 1999 to 2019
education

Unnamed: 0,country,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Afghanistan,0.122,0.133,0.145,0.156,0.168,0.179,0.190,0.202,0.213,...,0.372,0.374,0.390,0.398,0.403,0.405,0.406,0.408,0.413,0.414
1,Albania,0.583,0.588,0.557,0.542,0.541,0.550,0.557,0.569,0.579,...,0.671,0.714,0.739,0.749,0.758,0.753,0.745,0.747,0.743,0.746
2,Algeria,0.385,0.395,0.405,0.414,0.424,0.431,0.443,0.458,0.473,...,0.626,0.644,0.639,0.639,0.652,0.659,0.660,0.665,0.668,0.672
3,Andorra,..,..,..,..,..,..,..,..,..,...,0.670,0.671,0.724,0.714,0.725,0.718,0.722,0.713,0.720,0.720
4,Angola,..,..,..,..,..,..,..,..,..,...,0.398,0.423,0.435,0.447,0.460,0.472,0.487,0.498,0.500,0.500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,Sub-Saharan Africa,0.285,0.290,0.297,0.304,0.310,0.319,0.326,0.332,0.340,...,0.422,0.429,0.437,0.449,0.452,0.457,0.460,0.465,0.466,0.471
202,Least Developed Countries,0.223,0.228,0.233,0.239,0.244,0.251,0.259,0.268,0.277,...,0.388,0.396,0.403,0.408,0.411,0.419,0.424,0.430,0.431,0.437
203,Small Island Developing States,0.466,0.471,0.477,0.483,0.485,0.492,0.500,0.506,0.512,...,0.607,0.613,0.604,0.609,0.615,0.621,0.624,0.628,0.627,0.633
204,Organization for Economic Co-operation and Dev...,0.679,0.688,0.681,0.709,0.720,0.729,0.735,0.728,0.747,...,0.818,0.824,0.828,0.835,0.841,0.846,0.850,0.851,0.852,0.852


## Worldbank data

The rest of our data comes from Worldbank's database. The Solow model

In [88]:
wb = pd.read_csv("data/b43c7fbd-11a9-4900-bce1-8748c6f68ad1_Data.csv") # worldbank data

In [None]:
X = df[''] # placeholder for regressors df
np.log(X) # natural log of all regressors (we don't log the constant)
X = sm.add_constant(X)
Y = df[''] # placeholder for dependent variable df
np.log(Y) # natural log of regressand

In [None]:
# estimate OLS
# compute heteroskedasticity-robust standard errors ('HC3')
model = sm.OLS(Y,X).fit(cov_type = 'HC3')

In [None]:
model.summary()
# result.bse
# result.t_test(...)