In [1]:
#Import dependencies
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
from scipy.stats import linregress

In [2]:
# Load the regional life-expectancy CSV from the Resources folder
read_file = os.path.join("Resources", "life_expectancy_data_region.csv")
life = pd.read_csv(filepath_or_buffer=read_file)
life

Unnamed: 0,Country,Year,Region,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Eastern Mediterranean,Developing,65.0,263.0,62,0.01,71.279624,65.0,...,6.0,8.16,65.0,0.1,584.259210,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Eastern Mediterranean,Developing,59.9,271.0,64,0.01,73.523582,62.0,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Eastern Mediterranean,Developing,59.9,268.0,66,0.01,73.219243,64.0,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.470,9.9
3,Afghanistan,2012,Eastern Mediterranean,Developing,59.5,272.0,69,0.01,78.184215,67.0,...,67.0,8.52,67.0,0.1,669.959000,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Eastern Mediterranean,Developing,59.2,275.0,71,0.01,7.097109,68.0,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,Zimbabwe,2004,Africa,Developing,44.3,723.0,27,4.36,0.000000,68.0,...,67.0,7.13,65.0,33.6,454.366654,12777511.0,9.4,9.4,0.407,9.2
2934,Zimbabwe,2003,Africa,Developing,44.5,715.0,26,4.06,0.000000,7.0,...,7.0,6.52,68.0,36.7,453.351155,12633897.0,9.8,9.9,0.418,9.5
2935,Zimbabwe,2002,Africa,Developing,44.8,73.0,25,4.43,0.000000,73.0,...,73.0,6.53,71.0,39.8,57.348340,125525.0,1.2,1.3,0.427,10.0
2936,Zimbabwe,2001,Africa,Developing,45.3,686.0,25,1.72,0.000000,76.0,...,76.0,6.16,75.0,42.1,548.587312,12366165.0,1.6,1.7,0.427,9.8


In [3]:
# Count the distinct countries and regions first, then report both figures
country_count = life["Country"].nunique()
region = life["Region"].nunique()

print(f"There are a total of {country_count} unique countries in our dataset.")
print(f"The {country_count} countries are sorted into {region} different regions, as defined by the WHO.")

There are a total of 193 unique countries in our dataset.
The 193 countries are sorted into 6 different regions, as defined by the WHO.


In [4]:
# List the column labels so columns can be picked and renamed by their exact names.
# Several raw labels carry stray leading/trailing spaces (e.g. 'Life expectancy ').
life.columns

Index(['Country', 'Year', 'Region', 'Status', 'Life expectancy ',
       'Adult Mortality', 'infant deaths', 'Alcohol', 'percentage expenditure',
       'Hepatitis B', 'Measles ', ' BMI ', 'under-five deaths ', 'Polio',
       'Total expenditure', 'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling'],
      dtype='object')

In [6]:
# Give the two columns used in this analysis clean display names.
# The raw life-expectancy label ends with a space, so the key must include it.
life_df = life.rename(
    {
        "Life expectancy ": "Life Expectancy",
        "Total expenditure": "Total Expenditure (%)",
    },
    axis="columns",
)
life_df

Unnamed: 0,Country,Year,Region,Status,Life Expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,...,Polio,Total Expenditure (%),Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Eastern Mediterranean,Developing,65.0,263.0,62,0.01,71.279624,65.0,...,6.0,8.16,65.0,0.1,584.259210,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Eastern Mediterranean,Developing,59.9,271.0,64,0.01,73.523582,62.0,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Eastern Mediterranean,Developing,59.9,268.0,66,0.01,73.219243,64.0,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.470,9.9
3,Afghanistan,2012,Eastern Mediterranean,Developing,59.5,272.0,69,0.01,78.184215,67.0,...,67.0,8.52,67.0,0.1,669.959000,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Eastern Mediterranean,Developing,59.2,275.0,71,0.01,7.097109,68.0,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,Zimbabwe,2004,Africa,Developing,44.3,723.0,27,4.36,0.000000,68.0,...,67.0,7.13,65.0,33.6,454.366654,12777511.0,9.4,9.4,0.407,9.2
2934,Zimbabwe,2003,Africa,Developing,44.5,715.0,26,4.06,0.000000,7.0,...,7.0,6.52,68.0,36.7,453.351155,12633897.0,9.8,9.9,0.418,9.5
2935,Zimbabwe,2002,Africa,Developing,44.8,73.0,25,4.43,0.000000,73.0,...,73.0,6.53,71.0,39.8,57.348340,125525.0,1.2,1.3,0.427,10.0
2936,Zimbabwe,2001,Africa,Developing,45.3,686.0,25,1.72,0.000000,76.0,...,76.0,6.16,75.0,42.1,548.587312,12366165.0,1.6,1.7,0.427,9.8


In [7]:
# Confirm the renamed labels ('Life Expectancy', 'Total Expenditure (%)') are now in the column index
life_df.columns

Index(['Country', 'Year', 'Region', 'Status', 'Life Expectancy',
       'Adult Mortality', 'infant deaths', 'Alcohol', 'percentage expenditure',
       'Hepatitis B', 'Measles ', ' BMI ', 'under-five deaths ', 'Polio',
       'Total Expenditure (%)', 'Diphtheria ', ' HIV/AIDS', 'GDP',
       'Population', ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling'],
      dtype='object')

In [9]:
# Keep only the identifying columns plus the two measures being compared
# (Life Expectancy and Total Expenditure), one row per country-year
trim_df = life_df.loc[:, ['Region', 'Country', 'Year', 'Status',
                          'Life Expectancy', 'Total Expenditure (%)']]
trim_df

Unnamed: 0,Region,Country,Year,Status,Life Expectancy,Total Expenditure (%)
0,Eastern Mediterranean,Afghanistan,2015,Developing,65.0,8.16
1,Eastern Mediterranean,Afghanistan,2014,Developing,59.9,8.18
2,Eastern Mediterranean,Afghanistan,2013,Developing,59.9,8.13
3,Eastern Mediterranean,Afghanistan,2012,Developing,59.5,8.52
4,Eastern Mediterranean,Afghanistan,2011,Developing,59.2,7.87
...,...,...,...,...,...,...
2933,Africa,Zimbabwe,2004,Developing,44.3,7.13
2934,Africa,Zimbabwe,2003,Developing,44.5,6.52
2935,Africa,Zimbabwe,2002,Developing,44.8,6.53
2936,Africa,Zimbabwe,2001,Developing,45.3,6.16


In [10]:
# Count the non-null values in each column of the trimmed df;
# a column with a lower count than the others contains missing values
trim_df.count()

Region                   2938
Country                  2938
Year                     2938
Status                   2938
Life Expectancy          2928
Total Expenditure (%)    2712
dtype: int64

In [11]:
# Drop every row that is missing a value in any of the kept columns
trim_df = trim_df.dropna(axis=0, how="any")

In [12]:
# Re-check the non-null counts: after dropping incomplete rows every column should report the same number
trim_df.count()

Region                   2702
Country                  2702
Year                     2702
Status                   2702
Life Expectancy          2702
Total Expenditure (%)    2702
dtype: int64

In [None]:
# agg_test = trim_df[['Region', ]]

In [14]:
# Keep only the rows whose Total Expenditure (%) is strictly greater than zero
spending_df = trim_df[trim_df["Total Expenditure (%)"].gt(0)]
spending_df

Unnamed: 0,Region,Country,Year,Status,Life Expectancy,Total Expenditure (%)
0,Eastern Mediterranean,Afghanistan,2015,Developing,65.0,8.16
1,Eastern Mediterranean,Afghanistan,2014,Developing,59.9,8.18
2,Eastern Mediterranean,Afghanistan,2013,Developing,59.9,8.13
3,Eastern Mediterranean,Afghanistan,2012,Developing,59.5,8.52
4,Eastern Mediterranean,Afghanistan,2011,Developing,59.2,7.87
...,...,...,...,...,...,...
2933,Africa,Zimbabwe,2004,Developing,44.3,7.13
2934,Africa,Zimbabwe,2003,Developing,44.5,6.52
2935,Africa,Zimbabwe,2002,Developing,44.8,6.53
2936,Africa,Zimbabwe,2001,Developing,45.3,6.16


In [18]:
# Per-region summary statistics of Total Expenditure (%).
# Select the column *before* aggregating: calling .mean()/.median()/... on the whole
# grouped frame also tries to aggregate the string columns (Country, Status), which
# raises TypeError on pandas >= 2.0 and does needless work on older versions.
spending_by_region = spending_df.groupby("Region")["Total Expenditure (%)"]

spending_mean = spending_by_region.mean()
spending_med = spending_by_region.median()
spending_var = spending_by_region.var()
spending_sd = spending_by_region.std()
spending_sem = spending_by_region.sem()

# One row per region, one column per statistic
spending_sumdf = pd.DataFrame({
    "$ Mean": spending_mean,
    "$ Median": spending_med,
    "$ Variance": spending_var,
    "$ Std Dev": spending_sd,
    "$ SEM": spending_sem
})

spending_sumdf

Unnamed: 0_level_0,$ Mean,$ Median,$ Variance,$ Std Dev,$ SEM
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Africa,5.355743,4.97,4.644868,2.155196,0.081869
Americas,6.667253,6.31,5.569767,2.360035,0.106076
Eastern Mediterranean,4.950034,4.6,4.482303,2.117145,0.122643
Europe,6.900732,7.13,6.038778,2.457393,0.089672
South-East Asia,4.053667,3.755,4.441943,2.107592,0.172084
Western Pacific,5.542444,4.87,6.692235,2.586935,0.145757


In [19]:
# Per-region summary statistics of Life Expectancy.
# Select the column *before* aggregating: calling .mean()/.median()/... on the whole
# grouped frame also tries to aggregate the string columns (Country, Status), which
# raises TypeError on pandas >= 2.0 and does needless work on older versions.
le_by_region = spending_df.groupby("Region")["Life Expectancy"]

le_mean = le_by_region.mean()
le_med = le_by_region.median()
le_var = le_by_region.var()
le_sd = le_by_region.std()
le_sem = le_by_region.sem()

# One row per region, one column per statistic
le_sumdf = pd.DataFrame({
    "LE Mean": le_mean,
    "LE Median": le_med,
    "LE Variance": le_var,
    "LE Std Dev": le_sd,
    "LE SEM": le_sem
})

le_sumdf

Unnamed: 0_level_0,LE Mean,LE Median,LE Variance,LE Std Dev,LE SEM
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Africa,57.204185,56.7,54.475893,7.380779,0.280373
Americas,73.378384,73.8,19.667973,4.434859,0.199332
Eastern Mediterranean,70.493624,73.0,34.491441,5.872941,0.34021
Europe,76.033555,76.3,31.219806,5.587469,0.20389
South-East Asia,68.419333,67.65,19.72157,4.440897,0.362598
Western Pacific,72.097143,72.2,46.55652,6.823234,0.384446


In [20]:
# Put the expenditure and life-expectancy summaries side by side, matched on Region
le_vs_spending = spending_sumdf.merge(le_sumdf, how="outer", on="Region")
le_vs_spending

Unnamed: 0_level_0,$ Mean,$ Median,$ Variance,$ Std Dev,$ SEM,LE Mean,LE Median,LE Variance,LE Std Dev,LE SEM
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Africa,5.355743,4.97,4.644868,2.155196,0.081869,57.204185,56.7,54.475893,7.380779,0.280373
Americas,6.667253,6.31,5.569767,2.360035,0.106076,73.378384,73.8,19.667973,4.434859,0.199332
Eastern Mediterranean,4.950034,4.6,4.482303,2.117145,0.122643,70.493624,73.0,34.491441,5.872941,0.34021
Europe,6.900732,7.13,6.038778,2.457393,0.089672,76.033555,76.3,31.219806,5.587469,0.20389
South-East Asia,4.053667,3.755,4.441943,2.107592,0.172084,68.419333,67.65,19.72157,4.440897,0.362598
Western Pacific,5.542444,4.87,6.692235,2.586935,0.145757,72.097143,72.2,46.55652,6.823234,0.384446


In [None]:
# Average each country's Life Expectancy and Total Expenditure (%) over all of its rows.
# - The column is named 'Life Expectancy' (see the rename cell); the previous
#   'Life Expectancy (yrs)' label does not exist and raised KeyError.
# - The two numeric columns are selected before .mean() so the string column Status
#   is never aggregated (TypeError on pandas >= 2.0).
global_df = (
    spending_df
    .groupby(['Region', 'Country'])[['Life Expectancy', 'Total Expenditure (%)']]
    .mean()
    .reset_index()  # bring Region/Country back as ordinary columns for plotting
)
global_df.head()

In [None]:
# Same six summary statistics for both per-country averages
global_df.agg(dict.fromkeys(
    ['Life Expectancy', 'Total Expenditure (%)'],
    ['min', 'max', 'mean', 'median', 'std', 'var'],
))


In [None]:
# Scatter of per-country averages (all countries), coloured by region
plt.figure(figsize = (15,10))
sns.scatterplot(data=global_df, x="Total Expenditure (%)", y="Life Expectancy", hue="Region")
plt.title("Average Life Expectancy (yrs) vs. Government Health Expenditure (%) (2000-15)", fontweight="bold")
plt.xlabel("Government Health Expenditure (% total budget)")
plt.ylabel("Life Expectancy (years)")

# perform regression on life expectancy vs health expenditure across all countries
(slope, intercept, rvalue, pvalue, stderr) = linregress(global_df["Total Expenditure (%)"],global_df["Life Expectancy"])
# draw the fitted line from the unrounded coefficients so it is not shifted by rounding
regress_values = (global_df["Total Expenditure (%)"]* slope) + intercept
plt.plot(global_df["Total Expenditure (%)"], regress_values, "r-")

# round only for the text shown on the figure
slope = round(slope, 2)
intercept = round(intercept, 2)
rvalue = round(rvalue, 2)
line_eq = f"y = {slope}*x + {intercept}"
plt.annotate(line_eq,(14,62),color="red", fontweight = "bold")
r_eq = f"r = {rvalue}"
plt.annotate(r_eq, (14, 65), color="red", fontweight = "bold")

# plt.savefig("Images/DevelopedNations.png")
plt.show()

In [None]:
# Rows whose Status is "Developed"
dev_df = spending_df[spending_df["Status"].eq("Developed")]
dev_df

In [None]:
# Average each developed country's Life Expectancy and Total Expenditure (%) over all of its rows.
# - The column is named 'Life Expectancy' (see the rename cell); the previous
#   'Life Expectancy (yrs)' label does not exist and raised KeyError.
# - The two numeric columns are selected before .mean() so the string column Status
#   is never aggregated (TypeError on pandas >= 2.0).
dev_grp = (
    dev_df
    .groupby(['Region', 'Country'])[['Life Expectancy', 'Total Expenditure (%)']]
    .mean()
    .reset_index()  # bring Region/Country back as ordinary columns for plotting
)
dev_grp.head()

In [None]:
# Scatter of per-country averages for developed countries, coloured by region
plt.figure(figsize = (12,10))
sns.scatterplot(data=dev_grp, x="Total Expenditure (%)", y="Life Expectancy", hue="Region")
# dev_grp holds the Status == "Developed" rows, so the title must say "Developed"
plt.title("Average Life Expectancy (yrs) vs. Government Health Expenditure (%) in Developed Countries (2000-15)", fontweight="bold")
plt.xlabel("Government Health Expenditure (% total budget)")
plt.ylabel("Life Expectancy (years)")

# perform regression on life expectancy vs health expenditure in developed nations
(slope, intercept, rvalue, pvalue, stderr) = linregress(dev_grp["Total Expenditure (%)"],dev_grp["Life Expectancy"])
# draw the fitted line from the unrounded coefficients so it is not shifted by rounding
regress_values = (dev_grp["Total Expenditure (%)"]* slope) + intercept
plt.plot(dev_grp["Total Expenditure (%)"], regress_values, "r-")

# round only for the text shown on the figure
slope = round(slope, 2)
intercept = round(intercept, 2)
rvalue = round(rvalue, 2)
line_eq = f"y = {slope}*x + {intercept}"
plt.annotate(line_eq,(12,74),color="red", fontweight = "bold")
r_eq = f"r = {rvalue}"
plt.annotate(r_eq, (12, 74.5), color="red", fontweight = "bold")

# plt.savefig("Images/DevelopedNations.png")
plt.show()

In [None]:
# Rows whose Status is "Developing"
undev_df = spending_df[spending_df["Status"].eq("Developing")]
undev_df

In [None]:
# Average each developing country's Life Expectancy and Total Expenditure (%) over all of its rows.
# - The column is named 'Life Expectancy' (see the rename cell); the previous
#   'Life Expectancy (yrs)' label does not exist and raised KeyError.
# - The two numeric columns are selected before .mean() so the string column Status
#   is never aggregated (TypeError on pandas >= 2.0).
undev_grp = (
    undev_df
    .groupby(['Region', 'Country'])[['Life Expectancy', 'Total Expenditure (%)']]
    .mean()
    .reset_index()  # bring Region/Country back as ordinary columns for plotting
)
undev_grp.head()

In [None]:
# Map a 'Life Expectancy (yrs)' label to 'Life Expectancy'; rename leaves the frame
# unchanged when no column carries the old label
undev_grp = undev_grp.rename({'Life Expectancy (yrs)': 'Life Expectancy'}, axis='columns')

In [None]:
# List the distinct regions present in the per-country developing averages
undev_grp['Region'].unique()

In [None]:
# Numeric-only view of the developing-country averages (Region/Country columns dropped)
undev_grp2 = (
    undev_grp
    .rename(columns={'Life Expectancy (yrs)': 'Life Expectancy'})
    .loc[:, ['Life Expectancy', 'Total Expenditure (%)']]
)
undev_grp2

In [None]:
# Scatter of per-country averages for developing countries, coloured by region
plt.figure(figsize = (15,10))
sns.scatterplot(data=undev_grp, x="Total Expenditure (%)", y="Life Expectancy", hue="Region")
plt.title("Average Life Expectancy (yrs) vs. Government Health Expenditure (%) in Developing Countries (2000-15)", fontweight="bold", fontsize=18)
plt.xlabel("Government Health Expenditure (% total budget)", fontsize=16)
plt.ylabel("Life Expectancy (years)", fontsize=16)

# perform regression on life expectancy vs health expenditure in developing nations
(slope, intercept, rvalue, pvalue, stderr) = linregress(undev_grp["Total Expenditure (%)"],undev_grp["Life Expectancy"])
# draw the fitted line from the unrounded coefficients so it is not shifted by rounding
regress_values = (undev_grp["Total Expenditure (%)"]* slope) + intercept
plt.plot(undev_grp["Total Expenditure (%)"], regress_values, "r-")

# round only for the text shown on the figure
slope = round(slope, 2)
intercept = round(intercept, 2)
rvalue = round(rvalue, 2)
line_eq = f"y = {slope}*x + {intercept}"
plt.annotate(line_eq,(9,50),color="red", fontweight = "bold")
r_eq = f"r = {rvalue}"
plt.annotate(r_eq, (9, 45), color="red", fontweight = "bold")

# savefig does not create missing folders, so make sure the target exists first
os.makedirs("Images", exist_ok=True)
# NOTE(review): the file name says "Developed" but this figure shows developing countries;
# path left unchanged in case other documents link to it -- confirm and rename if safe
plt.savefig("Images/Developed_1.png")
plt.show()


In [None]:
# List the distinct regions that appear among the developing-country rows
undev_df["Region"].unique()

In [None]:
# Quick matplotlib scatter of the developing-country averages.
# The column is named 'Life Expectancy'; indexing with 'Life Expectancy (yrs)' raised
# KeyError. The "(yrs)" suffix is kept only in the axis label.
life_exp = undev_grp['Life Expectancy']
expen = undev_grp['Total Expenditure (%)']
plt.scatter(life_exp, expen)
plt.xlabel("Life Expectancy (yrs)")
plt.ylabel("Average Health Expenditure (%)")
plt.show()

In [None]:
# Kept so this cell still runs on its own; redundant when seaborn is imported in the top import cell
import seaborn as sns
plt.figure(figsize = (15,10))
# Expenditure goes on x and life expectancy on y so the data match the axis labels below
# (the two were previously swapped, so each axis carried the other's label)
sns.scatterplot(data=undev_grp, x="Total Expenditure (%)", y="Life Expectancy", hue="Region", palette="deep")
plt.title("Life Expectancy (yrs) vs. Government Health Expenditure (%) in Developing Countries", fontweight="bold")
plt.xlabel("Government Health Expenditure (% total budget)")
plt.ylabel("Life Expectancy (years)")

In [None]:
# Per-country averages for developed countries, indexed by (Region, Country).
# - The column is named 'Life Expectancy'; selecting 'Life Expectancy (yrs)' raised KeyError.
# - The two numeric columns are selected before .mean() so the string column Status
#   is never aggregated (TypeError on pandas >= 2.0).
# Note: this rebinds dev_grp with Region/Country left in the index (no reset_index).
dev_grp = dev_df.groupby(['Region', 'Country'])[['Life Expectancy', 'Total Expenditure (%)']].mean()
dev_grp

In [None]:
# Distinct regions in the per-country developing averages
undev_grp["Region"].unique()

In [None]:
# Distinct regions that appear among the developed-country rows
dev_df["Region"].unique()

In [None]:
# Distinct country names among the developed-country rows
dev_df["Country"].unique()

In [None]:
# Distinct regions that appear among the developing-country rows
undev_df["Region"].unique()

In [None]:
# Distinct developing-country names and how many of them there are
names, count = undev_df["Country"].unique(), undev_df["Country"].nunique()
print(names, count)

In [None]:
# NOTE(review): seaborn is imported here, mid-notebook, although plotting cells further up
# already call sns.scatterplot -- imports belong in the first cell
import seaborn as sns
# Preview the first rows of the filtered expenditure data
spending_df.head()

In [None]:
# Follow a single developing nation (Malawi) across the years
malawi_df = undev_df[undev_df["Country"].eq("Malawi")]
malawi_df

In [None]:
# plot gov expenditure and life expectancy over the years for Malawi (developing country)
fig, ax = plt.subplots(figsize =(8,6))

# twin object for dual y-axes on the same plot
ax2 = ax.twinx()

# plot first y axis
# (the column is named "Life Expectancy"; "Life Expectancy (yrs)" raised KeyError)
line1 = ax.plot(malawi_df["Year"], malawi_df["Life Expectancy"], color = "blue", marker = "x", label = "Life Expectancy")

#set x axis label
ax.set_xlabel("Year")
# set first y axis label
ax.set_ylabel("Life Expectancy (yrs)")

# plot second y axis
line2 = ax2.plot(malawi_df["Year"], malawi_df["Total Expenditure (%)"], color = "red", marker = "o", label="Expenditure")
# set second y axis label
ax2.set_ylabel("Total Expenditure (%)")

ax.set_title("Malawi Government Health Spending & Life Expectancy (2000-14)")

ax.legend()
ax2.legend(loc = "upper center")

# apply the layout BEFORE saving so the saved image matches the one displayed
plt.tight_layout()
# savefig does not create missing folders, so make sure the target exists first
os.makedirs("Catie_Images", exist_ok=True)
#save fig and display fig
plt.savefig("Catie_Images/MalawiExpenditure.png")
plt.show()


In [None]:
# Rows for China, taken from the developing-country subset
china_df = undev_df[undev_df["Country"].eq("China")]
china_df

In [None]:
# plot gov expenditure and life expectancy over the years for China (developing country)
fig, ax = plt.subplots(figsize =(8,6))

# twin object for dual y-axes on the same plot
ax2 = ax.twinx()

# plot first y axis
# (the column is named "Life Expectancy"; "Life Expectancy (yrs)" raised KeyError)
line1 = ax.plot(china_df["Year"], china_df["Life Expectancy"], color = "blue", marker = "x", label = "Life Expectancy")

#set x axis label
ax.set_xlabel("Year")
# set first y axis label
ax.set_ylabel("Life Expectancy (yrs)")

# plot second y axis
line2 = ax2.plot(china_df["Year"], china_df["Total Expenditure (%)"], color = "red", marker = "o", label="Expenditure")
# set second y axis label
ax2.set_ylabel("Total Expenditure (%)")

ax.set_title("Chinese Government Health Spending & Life Expectancy (2000-14)")

ax.legend()
ax2.legend(loc = "upper center")

# apply the layout BEFORE saving so the saved image matches the one displayed
plt.tight_layout()
# savefig does not create missing folders, so make sure the target exists first
os.makedirs("Catie_Images", exist_ok=True)
#save fig and display fig
plt.savefig("Catie_Images/ChinaExpenditure.png")
plt.show()


In [None]:
# Rows for Japan, taken from the developed-country subset
japan_df = dev_df[dev_df["Country"].eq("Japan")]
japan_df

In [None]:
# plot gov expenditure and life expectancy over the years for Japan (developed country)
fig, ax = plt.subplots(figsize =(8,6))

# twin object for dual y-axes on the same plot
ax2 = ax.twinx()

# plot first y axis
# (the column is named "Life Expectancy"; "Life Expectancy (yrs)" raised KeyError)
line1 = ax.plot(japan_df["Year"], japan_df["Life Expectancy"], color = "blue", marker = "x", label = "Life Expectancy")

#set x axis label
ax.set_xlabel("Year")
# set first y axis label
ax.set_ylabel("Life Expectancy (yrs)")

# plot second y axis
line2 = ax2.plot(japan_df["Year"], japan_df["Total Expenditure (%)"], color = "red", marker = "o", label="Expenditure")
# set second y axis label
ax2.set_ylabel("Total Expenditure (%)")

ax.set_title("Japanese Government Health Spending & Life Expectancy (2000-14)")

ax.legend()
ax2.legend(loc = "upper center")

# apply the layout BEFORE saving so the saved image matches the one displayed
plt.tight_layout()
# savefig does not create missing folders, so make sure the target exists first
os.makedirs("Catie_Images", exist_ok=True)
#save fig and display fig
plt.savefig("Catie_Images/JapanExpenditure.png")
plt.show()
