In [1]:
# Data manipulation imports
import pandas as pd
import numpy as np

# Graphing imports
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer, KNNImputer #imputer imports
from copy import copy, deepcopy #copy imports

# Modeling imports for imputing
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the four raw per-country tables from the datasets folder
# (paths are relative to the notebook's working directory).
df_education, df_gdp, df_happiness, df_life_expectancy = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/education-by-country.csv',
        'datasets/GDP-by_country.csv',
        'datasets/happiness-by-country.csv',
        'datasets/life-expectancy-by_country.csv',
    )
)

In [3]:
# Preview the first five rows of the raw education table.
df_education.head(n=5)

Unnamed: 0,Entity,Code,Year,Political Regime (OWID based on Polity IV and Wimmer & Min),"Average Total Years of Schooling for Adult Population (Lee-Lee (2016), Barro-Lee (2018) and UNDP (2018))",Year.1,"Total population (Gapminder, HYDE & UN)",Continent
0,Abkhazia,OWID_ABK,2015.0,,,,,Asia
1,Afghanistan,AFG,1816.0,-6.0,0.35,1970.0,3296506.0,
2,Afghanistan,AFG,1817.0,-6.0,0.35,1970.0,3302044.0,
3,Afghanistan,AFG,1818.0,-6.0,0.35,1970.0,3308390.0,
4,Afghanistan,AFG,1819.0,-6.0,0.35,1970.0,3315547.0,


In [4]:
# Keep only the 2015 rows, shorten the verbose source column names to
# snake_case, and retain just the columns used downstream.
# NOTE(review): the raw preview shows the schooling value 0.35 for Afghanistan
# on the 1816-1819 rows (where 'Year.1' is 1970) and again on the 2015 row, so
# the schooling column may be dated by 'Year.1' rather than 'Year' -- confirm
# that these figures really describe 2015.
df_education = (
    df_education
    .loc[lambda frame: frame['Year'].eq(2015)]
    .rename(columns={
        'Entity': 'country',
        'Political Regime (OWID based on Polity IV and Wimmer & Min)': 'auto_demo',
        'Average Total Years of Schooling for Adult Population (Lee-Lee (2016), Barro-Lee (2018) and UNDP (2018))': 'average_years_of_school',
        'Total population (Gapminder, HYDE & UN)': 'total_population',
    })
    .loc[:, ['country', 'average_years_of_school', 'total_population', 'auto_demo']]
)
df_education.head()

Unnamed: 0,country,average_years_of_school,total_population,auto_demo
0,Abkhazia,,,
177,Afghanistan,0.35,34414000.0,-1.0
338,Africa,,1182439000.0,
343,Akrotiri and Dhekelia,,,
542,Albania,3.31,2891000.0,9.0


In [5]:
# Preview the first five rows of the raw GDP table.
df_gdp.head(n=5)

Unnamed: 0,Entity,Code,Year,Output-side real GDP per capita (gdppc_o) (PWT 9.1 (2019)),Political Regime (OWID based on Polity IV and Wimmer & Min),"Total population (Gapminder, HYDE & UN)",Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,1816,,-6.0,3296506.0,
2,Afghanistan,AFG,1817,,-6.0,3302044.0,
3,Afghanistan,AFG,1818,,-6.0,3308390.0,
4,Afghanistan,AFG,1819,,-6.0,3315547.0,


Keep only the 'Output-side real GDP per capita (gdppc_o) (PWT 9.1 (2019))' column for 2015, renamed to snake_case (`gdp_per_cap`).

In [6]:
# Restrict to 2015, rename the key and GDP columns to snake_case,
# and drop every other column.
df_gdp = (
    df_gdp
    .loc[lambda frame: frame['Year'].eq(2015)]
    .rename(columns={
        'Entity': 'country',
        'Output-side real GDP per capita (gdppc_o) (PWT 9.1 (2019))': 'gdp_per_cap',
    })
    .loc[:, ['country', 'gdp_per_cap']]
)
df_gdp.head()

Unnamed: 0,country,gdp_per_cap
0,Abkhazia,
177,Afghanistan,
338,Africa,
343,Akrotiri and Dhekelia,
389,Albania,11228.951


In [7]:
# Preview the raw happiness table. The factor columns (Economy, Family,
# Freedom, ...) are small non-negative numbers rather than raw units.
# NOTE(review): they do not look z-scored (no negative values appear in the
# preview) -- confirm how the source scales them.
df_happiness.head(n=5)

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


Keep only the 'Happiness Score', 'Freedom' and 'Trust (Government Corruption)' columns, renamed to snake_case (`happiness`, `freedom`, `less_corruption`).

In [8]:
# Rename the key, score, freedom and corruption columns to snake_case
# and keep only those four.
df_happiness = (
    df_happiness
    .rename(columns={
        'Country': 'country',
        'Happiness Score': 'happiness',
        'Freedom': 'freedom',
        'Trust (Government Corruption)': 'less_corruption',
    })
    .loc[:, ['country', 'happiness', 'freedom', 'less_corruption']]
)
df_happiness.head()

Unnamed: 0,country,happiness,freedom,less_corruption
0,Switzerland,7.587,0.66557,0.41978
1,Iceland,7.561,0.62877,0.14145
2,Denmark,7.527,0.64938,0.48357
3,Norway,7.522,0.66973,0.36503
4,Canada,7.427,0.63297,0.32957


In [9]:
# Preview the first five rows of the raw life-expectancy table.
df_life_expectancy.head(n=5)

Unnamed: 0,Entity,Code,Year,Life expectancy
0,Afghanistan,AFG,1950,27.638
1,Afghanistan,AFG,1951,27.878
2,Afghanistan,AFG,1952,28.361
3,Afghanistan,AFG,1953,28.852
4,Afghanistan,AFG,1954,29.35


Keep only the 'Life expectancy' column for 2015, renamed to snake_case (`life_expectancy`).

In [10]:
# Restrict to 2015, rename the key and value columns to snake_case,
# and drop every other column.
df_life_expectancy = (
    df_life_expectancy
    .query('Year == 2015')
    .rename(columns={'Entity': 'country', 'Life expectancy': 'life_expectancy'})
    .loc[:, ['country', 'life_expectancy']]
)
df_life_expectancy.head()

Unnamed: 0,country,life_expectancy
65,Afghanistan,63.377
137,Africa,61.607
207,Albania,78.025
280,Algeria,76.09
350,American Samoa,73.588


## Merge Dataframes
Merge all of the different datasets into one

In [11]:
# Join the four per-country tables on the shared `country` key.
# - how='inner' is the pandas default, spelled out here because it matters:
#   only countries present in every table survive, so a country whose name is
#   spelled differently between sources is dropped without warning.
# - validate='one_to_one' makes pandas raise MergeError if `country` repeats in
#   either side of a join, instead of silently duplicating rows.
gov = (
    df_life_expectancy
    .merge(df_gdp, on='country', how='inner', validate='one_to_one')
    .merge(df_education, on='country', how='inner', validate='one_to_one')
    .merge(df_happiness, on='country', how='inner', validate='one_to_one')
)
gov.head()

Unnamed: 0,country,life_expectancy,gdp_per_cap,average_years_of_school,total_population,auto_demo,happiness,freedom,less_corruption
0,Afghanistan,63.377,,0.35,34414000.0,-1.0,3.575,0.23414,0.09719
1,Albania,78.025,11228.951,3.31,2891000.0,9.0,4.959,0.35733,0.06413
2,Algeria,76.09,12077.444,0.78,39728000.0,2.0,5.605,0.28579,0.17383
3,Angola,59.398,5530.3374,,27884000.0,-2.0,4.033,0.10384,0.07122
4,Argentina,76.068,16414.078,5.92,43075000.0,9.0,6.574,0.44974,0.08484


## Data overview
Get an overview of the merged data before imputing

In [12]:
# Column dtypes and non-null counts of the merged table, to spot columns with gaps.
gov.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 148 entries, 0 to 147
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   country                  148 non-null    object 
 1   life_expectancy          148 non-null    float64
 2   gdp_per_cap              146 non-null    float64
 3   average_years_of_school  123 non-null    float64
 4   total_population         148 non-null    float64
 5   auto_demo                141 non-null    float64
 6   happiness                148 non-null    float64
 7   freedom                  148 non-null    float64
 8   less_corruption          148 non-null    float64
dtypes: float64(8), object(1)
memory usage: 11.6+ KB


In [13]:
# Count missing values per column (gaps are in gdp, school, and auto_demo).
gov.isna().sum()

country                     0
life_expectancy             0
gdp_per_cap                 2
average_years_of_school    25
total_population            0
auto_demo                   7
happiness                   0
freedom                     0
less_corruption             0
dtype: int64

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
gov.describe()

Unnamed: 0,life_expectancy,gdp_per_cap,average_years_of_school,total_population,auto_demo,happiness,freedom,less_corruption
count,148.0,146.0,123.0,148.0,141.0,148.0,148.0,148.0
mean,72.320358,18461.909873,3.802195,48384810.0,4.666667,5.403851,0.432827,0.144996
std,7.917526,18044.43841,2.663081,161524500.0,5.973354,1.159816,0.152598,0.120819
min,50.881,690.02197,0.01,330000.0,-10.0,2.839,0.0,0.0
25%,66.7045,4839.963525,1.355,4799000.0,1.0,4.542,0.331247,0.06301
50%,74.2825,12381.3665,3.26,11025000.0,7.0,5.277,0.438215,0.10722
75%,77.86725,25832.07825,5.565,34891000.0,9.0,6.29575,0.555402,0.182365
max,84.043,98941.203,10.69,1406848000.0,10.0,7.587,0.66973,0.55191


## Impute Missing Data and Export
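I will simply impute with the median, since there are only 34 missing values in the whole dataset (2 in `gdp_per_cap`, 25 in `average_years_of_school`, 7 in `auto_demo`); every other column is complete, so no rows need to be dropped.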

In [15]:
# Fill the gaps in each incomplete column with that column's own median.
# Series.median() skips NaN by default, so no explicit dropna() is needed.
for col in ('gdp_per_cap', 'average_years_of_school', 'auto_demo'):
    gov[col] = gov[col].fillna(gov[col].median())

# Total number of missing cells left in the table.
gov.isna().sum().sum()

0

In [16]:
# Write the cleaned, imputed table to disk without the positional index column.
gov.to_csv(path_or_buf='gov.csv', index=False)