In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress

In [None]:
# Location of the raw life-expectancy CSV, relative to this notebook.
life_expectancy_path = "../life_expectancy.csv"

# Read the raw data once; the analysis below works on a copy of this frame.
life_expectancy_data = pd.read_csv(filepath_or_buffer=life_expectancy_path)

# Show the loaded frame as the cell output.
life_expectancy_data

In [None]:
# Format strings keyed by column name. Every summary-statistic column (in both
# its long and its short spelling) is rendered with two decimals; GDP gets three.
style_dict = {
    **dict.fromkeys(
        [
            "Mean", "Median", "Variance", "Standard Deviation", "Standard Error",
            "mean", "median", "var", "std", "sem",
        ],
        "{:.2f}",
    ),
    'GDP': "{:.3f}",
    "percentage expenditure": "{:.2f}",
}

In [None]:
# Work on an independent (deep) copy so the frame read from disk stays untouched.
life_exp_df = life_expectancy_data.copy(deep=True)

In [None]:
# List the column names of the working copy. Some of them carry stray
# leading/trailing spaces (e.g. 'Life expectancy ', ' BMI ').
life_exp_df.columns

In [None]:
# Number of distinct values in the Country column. dropna=False makes a missing
# value count as one entry, matching what len(Series.unique()) reports.
life_exp_df["Country"].nunique(dropna=False)

# 193 Countries

In [None]:
# Columns kept for the analysis, spelled exactly as they appear in life_exp_df
# (a few include stray leading/trailing spaces).
columns_to_keep = [
    'Country', 'Year', 'Status', 'Life expectancy ',
    'Alcohol', ' BMI ', 'Total expenditure', 'GDP', 'Population',
    'Income composition of resources', 'Schooling',
]

# Take an independent copy of just those columns and trim the whitespace from
# every column name.
reduced_df = life_exp_df[columns_to_keep].copy().rename(columns=str.strip)

#reduced_df

In [None]:
# Countries to remove from reduced_df.
drop_countries = [
    'San Marino', 'Marshall Islands', 'Cook Islands', 'Saint Kitts and Nevis',
    'Niue', 'Palau', 'Dominica', 'Monaco', 'Nauru', 'Tuvalu',
]

# Collect the row labels of every listed country with a single membership test,
# then drop them from reduced_df in place.
rows_to_drop = reduced_df.index[reduced_df['Country'].isin(drop_countries)]
reduced_df.drop(index=rows_to_drop, inplace=True)

#reduced_df['Country'].value_counts() #all 16
#183 countries

In [None]:
## Dropped 10 Countries - missing life expectancy data
#### San Marino, Marshall Islands, Cook Islands, Saint Kitts and Nevis, Niue, Palau, Dominica, Monaco, Nauru, Tuvalu

In [None]:
# Number of non-null Schooling observations per country
# (count() skips missing values).
reduced_df.groupby("Country")['Schooling'].count()

#183 countries

In [None]:
# Columns needed for the schooling analysis.
school_columns = ['Country', 'Year', 'Status', 'Life expectancy', 'BMI', 'GDP', 'Schooling']

# Take an explicit copy (as is done when reduced_df itself is built) so that any
# in-place modification of school_df acts on an independent frame instead of on
# a slice of reduced_df, which would raise pandas' SettingWithCopyWarning.
school_df = reduced_df[school_columns].copy()

school_df

In [None]:
# Countries excluded from the schooling analysis.
# NOTE(review): presumably these lack Schooling data — confirm against the
# per-country counts in the previous cell.
drop_countries_schooling = ['Czechia',"Côte d'Ivoire","Democratic People's Republic of Korea", 
                            "Democratic Republic of the Congo", "Republic of Korea", "Republic of Moldova", "Somalia",
                            "United Kingdom of Great Britain and Northern Ireland","United Republic of Tanzania",
                            "United States of America"]

# Filter all listed countries out with a single membership test and rebind
# school_df to an explicit copy. This replaces a loop of in-place drops on a
# column slice of reduced_df (which raises SettingWithCopyWarning) and leaves
# the cell safe to re-run.
school_df = school_df.loc[~school_df['Country'].isin(drop_countries_schooling)].copy()

In [None]:
# Number of non-null Schooling observations per remaining country
# (count() skips missing values).
school_df.groupby("Country")['Schooling'].count()

#173 countries

## "Fig. 1: Does More School = More Life?"

In [None]:
# Fig. 1: per-country mean schooling vs. mean life expectancy, with a
# least-squares regression line and its equation.
fig, ax = plt.subplots(figsize=(12, 6))

# Aggregate both measures in one groupby so the two series share the same
# country index, and drop any country whose mean is undefined for either
# measure (pearsonr and linregress cannot work with NaN).
country_means = (
    school_df.groupby("Country")[["Schooling", "Life expectancy"]]
    .mean()
    .dropna()
)
school_mean = country_means["Schooling"]
life_exp_mean = country_means["Life expectancy"]

# Pearson correlation between the two per-country averages.
correlation_school_life = st.pearsonr(life_exp_mean, school_mean)
print(f"The correlation between average years in school and average life expectancy is {round(correlation_school_life[0],2)}")

# Simple linear regression: life expectancy (y) as a function of schooling (x).
x_values = school_mean
y_values = life_exp_mean
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
# Anchor the equation in axes-fraction coordinates so it always lands inside
# the plot area: an annotation anchored at a data point outside the visible
# axes is clipped by matplotlib and never shown.
ax.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="purple")
ax.set_xlabel("Average Schooling (years)")
ax.set_ylabel("Average Life Expectancy (years)")
ax.set_title("Fig. 1: Does More School = More Life?")
plt.show()

## "Fig. 2: Does More Development = More School?"

In [None]:
# Per-country averages (kept available for any later cell that reads them).
school_mean = school_df.groupby("Country")["Schooling"].mean()
life_exp_mean = school_df.groupby("Country")["Life expectancy"].mean()

# Mean life expectancy and mean schooling per development status.
status_means = school_df.groupby("Status")[["Life expectancy", "Schooling"]].mean()

# Each list now holds the measure its name (and its bar label) says it does;
# previously the two were computed from each other's column, so the chart
# labels were swapped.
Life = list(status_means["Life expectancy"])
School = list(status_means["Schooling"])
# Take the row labels from the groupby result instead of hard-coding them, so
# labels and values cannot get out of step.
index = list(status_means.index)
df = pd.DataFrame({'Years of Life': Life,
                    'Years of Schooling': School}, index=index)

ax = df.plot.barh()
ax.set_xlabel("Years")
ax.set_title("Fig. 2: Does More Development = More School?")
plt.show()