In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from ipywidgets import interact, interactive, interact_manual
import ipywidgets as w

In [None]:
# Figure defaults are configured in their own cell on purpose: per
# https://github.com/jupyter/notebook/issues/3385 this cannot live in the
# same cell as the imports.
import matplotlib as mpl

mpl.rcParams.update({
    'figure.dpi': 300.0,          # default: 72.0
    'figure.figsize': [8., 4.5],  # default: [6., 4.]
})

In [None]:
# Registry of raw indicator tables, keyed by a short indicator name; filled by the loading cells.
df_tmp = {}

# Project

Life expectancy

http://gapm.io/ilex

In [None]:
# Load the life-expectancy table; the first CSV column becomes the row index.
df_tmp['life_expectancy'] = pd.read_csv('life_expectancy_years.csv', index_col=[0])

Population

http://gapm.io/dpop

In [None]:
# Load the total-population table; the first CSV column becomes the row index.
df_tmp['population'] = pd.read_csv('population_total.csv', index_col=[0])

GDP per capita (inflation adjusted)

http://gapm.io/dgdppc

In [None]:
# Load the GDP-per-capita table; the first CSV column becomes the row index.
df_tmp['income'] = pd.read_csv('income_per_person_gdppercapita_ppp_inflation_adjusted.csv', index_col=[0])

Surface area

https://data.worldbank.org/indicator/AG.SRF.TOTL.K2

In [None]:
# Load the surface-area table; the first CSV column becomes the row index.
df_tmp['area'] = pd.read_csv('surface_area_sq_km.csv', index_col=[0])

CO2 emissions (tonnes per person)

https://cdiac.ess-dive.lbl.gov/

In [None]:
#df_tmp['co2'] = pd.read_csv('co2_emissions_tonnes_per_person.csv', index_col=[0])

Access to basic safe water source (percent of population)

https://mdgs.un.org/unsd/mdg/Data.aspx

In [None]:
# The heading gives this indicator in percent; dividing by 100 stores it as a fraction.
df_tmp['h2o'] = pd.read_csv('at_least_basic_water_source_overall_access_percent.csv', index_col=[0]) / 100

Current health expenditure (percent of GDP)

https://data.worldbank.org/indicator/SH.XPD.CHEX.GD.ZS

In [None]:
# The heading gives this indicator in percent of GDP; dividing by 100 stores it as a fraction.
df_tmp['exp_health'] = pd.read_csv('sh_xpd_chex_gd_zs.csv', index_col=[0]) / 100

Infant mortality rate (death between birth and age 1 per 1000 live births)

https://data.worldbank.org/indicator/SP.DYN.IMRT.IN

In [None]:
# Load the infant-mortality table as-is (no rescaling); the first CSV column becomes the row index.
df_tmp['infant_mortality'] = pd.read_csv('sp_dyn_imrt_in.csv', index_col=[0])

Literacy rate, adult total (percent of people ages 15 and above)

https://data.worldbank.org/indicator/SE.ADT.LITR.ZS

In [None]:
# The heading gives this indicator in percent; dividing by 100 stores it as a fraction.
df_tmp['literacy'] = pd.read_csv('se_adt_litr_zs.csv', index_col=[0]) / 100

Polio immunization coverage among 1-year-olds (%)

https://apps.who.int/gho/data/node.imr.WHS4_544

In [None]:
# The column names are taken from the second line of the file (header=[1]).
# The column order is reversed (presumably the export lists the newest year
# first -- verify against the file) and whitespace around the labels is
# stripped.  Values are kept as loaded, i.e. not divided by 100.
df_tmp['polio'] = (
    pd.read_csv(
        'WHS4_544.csv',
        header=[1], skiprows=0, skipinitialspace=True, index_col=[0],
    )
    .iloc[:, ::-1]
    .rename(columns=lambda label: label.strip())
)

BCG immunization coverage among 1-year-olds (%)

https://apps.who.int/gho/data/node.imr.WHS4_543

In [None]:
# The column names are taken from the second line of the file (header=[1]).
# The column order is reversed (presumably the export lists the newest year
# first -- verify against the file) and whitespace around the labels is
# stripped.  Values are kept as loaded, i.e. not divided by 100.
df_tmp['bcg'] = (
    pd.read_csv(
        'WHS4_543.csv',
        header=[1], skiprows=0, skipinitialspace=True, index_col=[0],
    )
    .iloc[:, ::-1]
    .rename(columns=lambda label: label.strip())
)

Hepatitis B (HepB3) immunization coverage among 1-year-olds (%)

https://apps.who.int/gho/data/node.imr.WHS4_117

In [None]:
# The column names are taken from the second line of the file (header=[1]).
# The column order is reversed (presumably the export lists the newest year
# first -- verify against the file) and whitespace around the labels is
# stripped.  Values are kept as loaded, i.e. not divided by 100.
df_tmp['hepB'] = (
    pd.read_csv(
        'WHS4_117.csv',
        header=[1], skiprows=0, skipinitialspace=True, index_col=[0],
    )
    .iloc[:, ::-1]
    .rename(columns=lambda label: label.strip())
)

Alcohol consumption per capita in litres of pure alcohol

In [None]:
#alcohol = pd.read_csv(
#    'Recorded alcohol per capita consumption.csv',
#    header=[1], skiprows=[], skipinitialspace=True, index_col=[0,1,2]
#).iloc[:, ::-1]
#df_tmp['alcohol'] = alcohol.xs(' All types', level=2).droplevel(level=1).rename(columns=lambda x: x.strip()).drop_duplicates()

In [None]:
# Clean up every indicator table, then merge them into one wide frame `df`
# whose columns have two levels: (indicator name, year).
#
# Produces: df (all years), years (list of year strings for which every
# indicator has a column), df_max (latest such year, int) and df_latest
# (the df_max cross-section, one column per indicator).

# The data sets come from different providers and spell some countries
# differently; map the variants onto one name so the rows line up on join.
COUNTRY_ALIASES = {
    'Czech Republic': 'Czechia',
    'United States': 'United States of America',
    'Iran (Islamic Republic of)': 'Iran',
    'Vietnam': 'Viet Nam',
    'Russian Federation': 'Russia',
    'Bolivia (Plurinational State of)': 'Bolivia',
    'Democratic Republic of the Congo': 'Congo, Dem. Rep.',
    'Congo, Rep.': 'Congo',
    'Syrian Arab Republic': 'Syria',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'Slovak Republic': 'Slovakia',
    'Democratic People\'s Republic of Korea': 'North Korea',
    'Republic of Korea': 'South Korea',
    'Saint Kitts and Nevis': 'St. Kitts and Nevis',
    'Micronesia (Federated States of)': 'Micronesia, Fed. Sts.',
    # Was 'Micronesia, Fed. Sts' (no trailing period), which produced a
    # second, separate row instead of merging with the spelling above.
    'Micronesia': 'Micronesia, Fed. Sts.',
    'Brunei Darussalam': 'Brunei',
    'Cote d\'Ivoire': 'Côte d\'Ivoire',
    'Saint Lucia': 'St. Lucia',
    'Saint Vincent and the Grenadines': 'St. Vincent and the Grenadines',
    'Republic of Moldova': 'Moldova',
    'Republic of North Macedonia': 'Macedonia, FYR',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    # NOTE(review): presumably a misspelling present in one of the source
    # files -- verify, otherwise this entry never matches.
    'Unted Kingdom': 'United Kingdom',
    'United Republic of Tanzania': 'Tanzania',
    'Swaziland': 'Eswatini',
    'Lao People\'s Democratic Republic': 'Lao',
    'Cabo Verde': 'Cape Verde',
    'Kyrgyz Republic': 'Kyrgyzstan'
}

for name in list(df_tmp):
    frame = df_tmp[name].replace('', np.nan)
    # drop old data: every column up to and including '1950'
    frame = frame.drop(columns=frame.columns.to_series()[:'1950'])
    # drop predictions: every column from '2019' onwards
    frame = frame.drop(columns=frame.columns.to_series()['2019':])
    # rename countries, because we use different data sets
    frame = frame.rename(index=COUNTRY_ALIASES)
    # Fill gaps between two known values of the SAME country along the year
    # axis.  Doing this per indicator (before the join) and with axis=1
    # avoids blending values of neighbouring countries or of different
    # indicators, which the default axis=0 on the joined frame did.
    frame = frame.interpolate(method='linear', axis=1, limit_area='inside')
    df_tmp[name] = frame

# join data into one big table
df = pd.concat(list(df_tmp.values()), axis=1, keys=list(df_tmp.keys()), sort=False)

# fill whatever is still missing with the column mean (mean over countries
# for that indicator and year)
df = df.fillna(df.mean())

# a year is "complete" when every indicator contributes a column for it
years = [
    year for year in range(2000, 2019)
    if len(df.xs(str(year), axis=1, level=1).columns) == len(df_tmp)
]
print('We have complete set of data for years:')
print(years)

if not years:
    raise ValueError('No year between 2000 and 2018 is covered by every indicator')

df_max = max(years)
df_latest = df.xs(str(df_max), axis=1, level=1)
print(f'Picking latest: {df_max}')

# the column labels are strings, so keep the years as strings from here on
years = list(map(str, years))

In [None]:
# Per-indicator summary statistics for the selected year (df_max).
df_latest.describe()

In [None]:
# Spot check: all indicator values of a single country for the selected year.
df_latest.loc['Poland']

In [None]:
%matplotlib notebook

fig, ax = plt.subplots()

tmp = df_latest.loc[df_latest['population'].notnull()]

pop = tmp['population']
pop = 1024 * (pop  - pop.min()) / (pop.max() - pop.min())

sc = ax.scatter(tmp['life_expectancy'], tmp['income'], s=pop, alpha=0.5)

ax.set_xlabel(r'Health Expenditure (% of GDP)', fontsize=15)
ax.set_ylabel(r'Life Expectancy', fontsize=15)

ax.grid(True)
fig.tight_layout()

annot = ax.annotate("", xy=(0,0), xytext=(0,20),textcoords="offset points",
                    bbox=dict(boxstyle="round", fc="w"),
                    arrowprops=dict(arrowstyle="->"))
annot.set_visible(False)

def update_annot(ind):
    pos = sc.get_offsets()[ind["ind"][0]]
    annot.xy = pos
    text = ", ".join([tmp.index[n] for n in ind["ind"]])
    annot.set_text(text)
    #annot.get_bbox_patch().set_facecolor(cmap(norm(c[ind["ind"][0]])))
    annot.get_bbox_patch().set_alpha(0.4)

def hover(event):
    vis = annot.get_visible()
    if event.inaxes == ax:
        cont, ind = sc.contains(event)
        if cont:
            update_annot(ind)
            annot.set_visible(True)
            fig.canvas.draw_idle()
        else:
            if vis:
                annot.set_visible(False)
                fig.canvas.draw_idle()

fig.canvas.mpl_connect("motion_notify_event", hover)

plt.show()

In [None]:
# Switch back from the interactive `notebook` backend to static inline figures.
%matplotlib inline

In [None]:
# Bar chart of the correlation between life expectancy and every indicator
# (including life expectancy itself) in the df_latest cross-section.
corr = df_latest.corr()['life_expectancy']

fig, ax = plt.subplots()
corr.plot.bar(ax=ax)

fig.tight_layout()
plt.show()

In [None]:
#tmp = df.loc[:, ('population', years)].droplevel(level=0, axis=1)
#display(tmp)
#display(df.loc[df.loc[:, ('life_expectancy', years)].idxmin(), ('life_expectancy', years)])

In [None]:
# Fit a linear regression that predicts life expectancy from the indicators
# listed in `inputs`, using one row per country from the df_latest
# cross-section, and report the mean squared error on a 20 % hold-out set.
inputs = ['h2o', 'income', 'exp_health', 'literacy', 'polio', 'bcg', 'hepB']

x = df_latest[inputs].values
y = df_latest['life_expectancy'].values

# Disabled alternative: stack the rows of every complete year instead of
# using only the latest one.
#x = np.concatenate([df.loc[:, (inputs, i)].values for i in years])
#y = np.concatenate([df.loc[:, ('life_expectancy', i)].values for i in years])

# A fixed random_state makes the split, and therefore the reported error and
# the fitted coefficients, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
# scikit-learn's signature is (y_true, y_pred)
print("Mean squared error equals: {0}".format(mean_squared_error(y_test, y_pred)))

In [None]:
# Interactive predictor: one slider per model input, plus a country/year
# picker that loads the recorded values of that country into the sliders.

def _mean_slider(column, description, upper=1.0, step=0.01):
    """Return a FloatSlider on [0, upper] that starts at the mean of df_latest[column]."""
    return w.FloatSlider(
        min=0.0, max=upper, step=step,
        continuous_update=False,
        value=df_latest[column].mean(),
        description=description
    )

# indicators stored as fractions (their source values were divided by 100)
h2o = _mean_slider('h2o', 'Access to fresh water')
exp_health = _mean_slider('exp_health', 'Health expenditure (% GDP)')
literacy = _mean_slider('literacy', 'Literacy')

# The immunization tables are loaded in percent and never divided by 100, so
# these sliders must span 0-100.  With the former 0-1 range every value was
# clamped to 1.0 and the model was fed the wrong inputs.
polio = _mean_slider('polio', 'Polio immunization', upper=100.0, step=1.0)
bcg = _mean_slider('bcg', 'BCG immunization', upper=100.0, step=1.0)
hepB = _mean_slider('hepB', 'HepB immunization', upper=100.0, step=1.0)

# IntSlider takes integers; convert the float statistics explicitly
income = w.IntSlider(
    min=0, max=int(df_latest['income'].max() * 1.5),
    continuous_update=False,
    value=int(df_latest['income'].mean()),
    description='GDP'
)

# Dropdown has no `continuous_update` trait, so it is not passed here
country = w.Dropdown(
    options=list(df_latest.index),
    value='Poland'
)

# `years` holds year labels as strings; the slider needs integers
_year_numbers = [int(label) for label in years]
year = w.IntSlider(
    min=min(_year_numbers), max=max(_year_numbers),
    continuous_update=False,
    value=max(_year_numbers)
)

@interact(country=country, year=year)
def country_selector(country, year):
    """Load the indicator values of `country` in `year` into the input sliders."""
    data = df.loc[country].xs(str(year), level=1)
    h2o.value = data['h2o']
    income.value = data['income']
    exp_health.value = data['exp_health']
    literacy.value = data['literacy']
    polio.value = data['polio']
    bcg.value = data['bcg']
    hepB.value = data['hepB']

@interact(
    h2o=h2o,
    income=income,
    exp_health=exp_health,
    literacy=literacy,
    polio=polio,
    bcg=bcg,
    hepB=hepB
)
def predict(**kwargs):
    """Display the life expectancy the model predicts for the slider values.

    The features are ordered according to `inputs` (the order used for
    training), not according to the order of the keyword arguments.
    """
    features = np.array([[kwargs[name] for name in inputs]])
    prediction = model.predict(features)[0]
    display("%.2f" % prediction)