In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import warnings

warnings.filterwarnings('ignore')
sns.set(style="whitegrid")

postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'lifeexpectancy'
table_name = 'lifeexpectancy'

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

lifeexpectancy_df = pd.read_sql_query('select * from lifeexpectancy',con=engine)

# no need for an open connection, 
# as we're only doing a single query
engine.dispose()

In [2]:
# What does the data look like?
lifeexpectancy_df.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [3]:
# What types of data are in the df?
lifeexpectancy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
Country                            2938 non-null object
Year                               2938 non-null int64
Status                             2938 non-null object
Life expectancy                    2928 non-null float64
Adult Mortality                    2928 non-null float64
infant deaths                      2938 non-null int64
Alcohol                            2744 non-null float64
percentage expenditure             2938 non-null float64
Hepatitis B                        2385 non-null float64
Measles                            2938 non-null int64
 BMI                               2904 non-null float64
under-five deaths                  2938 non-null int64
Polio                              2919 non-null float64
Total expenditure                  2712 non-null float64
Diphtheria                         2919 non-null float64
 HIV/AIDS                          2938 non-null

In [4]:
# how many nulls in the set?
lifeexpectancy_df.isnull().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64

In [5]:
# separate the columns that have nulls for imputation
cols_with_null = []
for col in lifeexpectancy_df.columns:
    if lifeexpectancy_df[col].isnull().sum() > 0:
        cols_with_null.append(col)
print(cols_with_null)

['Life expectancy ', 'Adult Mortality', 'Alcohol', 'Hepatitis B', ' BMI ', 'Polio', 'Total expenditure', 'Diphtheria ', 'GDP', 'Population', ' thinness  1-19 years', ' thinness 5-9 years', 'Income composition of resources', 'Schooling']


In [6]:
# determine the % of nulls in each column
lifeexpectancy_df.isnull().sum()*100/lifeexpectancy_df.isnull().count()

Country                             0.000000
Year                                0.000000
Status                              0.000000
Life expectancy                     0.340368
Adult Mortality                     0.340368
infant deaths                       0.000000
Alcohol                             6.603131
percentage expenditure              0.000000
Hepatitis B                        18.822328
Measles                             0.000000
 BMI                                1.157250
under-five deaths                   0.000000
Polio                               0.646698
Total expenditure                   7.692308
Diphtheria                          0.646698
 HIV/AIDS                           0.000000
GDP                                15.248468
Population                         22.191967
 thinness  1-19 years               1.157250
 thinness 5-9 years                 1.157250
Income composition of resources     5.684139
Schooling                           5.547992
dtype: flo

In [7]:
# imputation will be done by country
countries = lifeexpectancy_df["Country"].unique()

In [8]:
lifeexpectancy_df.Country.value_counts()

Sierra Leone                                            16
Croatia                                                 16
Slovakia                                                16
Central African Republic                                16
Democratic Republic of the Congo                        16
Mali                                                    16
Guinea                                                  16
Mozambique                                              16
Denmark                                                 16
Suriname                                                16
South Africa                                            16
Thailand                                                16
Solomon Islands                                         16
Grenada                                                 16
Saint Vincent and the Grenadines                        16
Slovenia                                                16
Australia                                               

In [9]:
# make a copy in case something bad happens!
le2 = lifeexpectancy_df.copy()

# imputation using each country's mean value
for col in cols_with_null:
    for country in countries:
        le2.loc[le2["Country"] == country, col].fillna(le2[le2["Country"] == country][col].mean(), inplace=True)

In [10]:
# how much effect did imputation have?
for col in cols_with_null:
    print("Statistics for columns: {}".format(col))
    print(pd.concat([lifeexpectancy_df[col], le2[col]], axis=1).describe())

Statistics for columns: Life expectancy 
       Life expectancy   Life expectancy 
count       2928.000000       2928.000000
mean          69.224932         69.224932
std            9.523867          9.523867
min           36.300000         36.300000
25%           63.100000         63.100000
50%           72.100000         72.100000
75%           75.700000         75.700000
max           89.000000         89.000000
Statistics for columns: Adult Mortality
       Adult Mortality  Adult Mortality
count      2928.000000      2928.000000
mean        164.796448       164.796448
std         124.292079       124.292079
min           1.000000         1.000000
25%          74.000000        74.000000
50%         144.000000       144.000000
75%         228.000000       228.000000
max         723.000000       723.000000
Statistics for columns: Alcohol
           Alcohol      Alcohol
count  2744.000000  2744.000000
mean      4.602861     4.602861
std       4.052413     4.052413
min       0.010000   