## CMSC320 Final - Analysis of Life Expectancy Data
### Evan Nadelbach and Joseph Zietowski

In [87]:
import pandas as pd
import re
# Raise the row display limit so that long frames are printed without truncation
pd.options.display.max_rows = 1_000_000

In [88]:
# Load the WHO life expectancy data from the local CSV copy.
# Source: https://apps.who.int/gho/data/view.main.SDG2016LEXv?lang=en
df = pd.read_csv(filepath_or_buffer="data/life_expectancy.csv")
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Country,Year,Life expectancy at birth (years) (Both sexes),Life expectancy at birth (years) (Male),Life expectancy at birth (years) (Female),Life expectancy at age 60 (years) (Both sexes),Life expectancy at age 60 (years) (Male),Life expectancy at age 60 (years) (Female),Healthy life expectancy (HALE) at birth (years) (Both sexes),Healthy life expectancy (HALE) at birth (years) (Male),Healthy life expectancy (HALE) at birth (years) (Female),Healthy life expectancy (HALE) at age 60 (years) (Both sexes),Healthy life expectancy (HALE) at age 60 (years) (Male),Healthy life expectancy (HALE) at age 60 (years) (Female)
0,Afghanistan,2016,62.7,61.0,64.5,16.3,15.5,17.1,53.0,52.1,54.1,11.3,10.9,11.7
1,Afghanistan,2015,63.2,61.8,64.7,16.3,15.5,17.1,53.2,52.6,54.1,11.2,10.8,11.6
2,Afghanistan,2014,63.0,61.7,64.4,16.2,15.4,17.0,,,,,,
3,Afghanistan,2013,62.7,61.5,64.1,16.2,15.4,16.9,,,,,,
4,Afghanistan,2012,62.2,60.9,63.6,16.1,15.3,16.8,,,,,,


In [89]:
# Narrow the frame to country, year and the both-sexes life expectancy at birth,
# give that column a shorter name, and discard every row from 2017 onwards.
# The year mask is negated (rather than written as "< 2017") so that exactly the
# same rows are kept as when dropping the rows matching "Year >= 2017".
df = (
    df.loc[:, ["Country", "Year", "Life expectancy at birth (years) (Both sexes)"]]
    .rename(columns={"Life expectancy at birth (years) (Both sexes)": "Life expectancy"})
    .loc[lambda frame: ~(frame["Year"] >= 2017)]
)
df.head()

Unnamed: 0,Country,Year,Life expectancy
0,Afghanistan,2016,62.7
1,Afghanistan,2015,63.2
2,Afghanistan,2014,63.0
3,Afghanistan,2013,62.7
4,Afghanistan,2012,62.2


### Add the BMI data (age-standardized mean BMI in kg/m², adults aged 18+)

In [90]:
# Load the WHO age-standardized mean BMI estimates.
# Source: https://www.who.int/data/gho/data/indicators/indicator-details/GHO/mean-bmi-(kg-m-)-(age-standardized-estimate)
bmi = pd.read_csv("data/bmi.csv")

# drop the unwanted columns
bmi = bmi.drop(columns=["Indicator", "Dim2"])

# drop the unwanted rows: anything before 2000 and the sex-specific (Male/Female) rows.
# The mask is negated instead of being rewritten as ">= 2000" so that rows with a
# missing Period/Dim1 are kept or dropped exactly as before.
# .copy() makes the filtered frame independent, so the column assignment below
# does not trigger a SettingWithCopyWarning.
bmi = bmi[~((bmi["Period"] < 2000) | bmi["Dim1"].isin(["Male", "Female"]))].copy()

# tidy the data in the bmi column:
#   * strip the bracketed part (and the whitespace around it) so only the leading value remains
#   * convert the remaining text to a number so the column can be used in calculations
# The column is assigned back explicitly: an in-place replace on bmi["First Tooltip"]
# is a chained update that does not modify `bmi` under pandas Copy-on-Write.
# The pattern is a raw string so that "\[" is not treated as an (invalid) string escape.
# errors="coerce" turns any value that is still not numeric into NaN instead of raising.
bmi["First Tooltip"] = pd.to_numeric(
    bmi["First Tooltip"]
    .astype(str)
    .str.replace(r"\s*\[.*\]", "", regex=True)
    .str.strip(),
    errors="coerce",
)

# merge the data into our life expectancy dataframe (only country/year pairs present
# in both frames survive the inner join), then remove the helper columns that came
# from the BMI frame and rename the BMI column
df = (
    df.merge(bmi, how="inner", left_on=["Country", "Year"], right_on=["Location", "Period"])
    .drop(columns=["Location", "Period", "Dim1"])
    .rename(columns={"First Tooltip": "BMI"})
)

df.head()

Unnamed: 0,Country,Year,Life expectancy,BMI
0,Afghanistan,2016,62.7,23.4
1,Afghanistan,2015,63.2,23.3
2,Afghanistan,2014,63.0,23.2
3,Afghanistan,2013,62.7,23.0
4,Afghanistan,2012,62.2,22.9


### Add the drinking water data 

In [None]:
# Load the WHO indicator "population using safely managed drinking water services".
# Source: https://www.who.int/data/gho/data/indicators/indicator-details/GHO/population-using-safely-managed-drinking-water-services-(-)
drinking_water = pd.read_csv(filepath_or_buffer="data/drinking_water.csv")

### Add the government expenditure data

In [None]:
# Load the WHO indicator "general government expenditure on health as a percentage
# of total government expenditure".
# Source: https://www.who.int/data/gho/data/indicators/indicator-details/GHO/general-government-expenditure-on-health-as-a-percentage-of-total-government-expenditure
government_expenditure = pd.read_csv(filepath_or_buffer="data/government_expenditure.csv")

### Add the HIV data

In [None]:
# Load the WHO indicator "new HIV infections (per 1000 uninfected population)".
# Source: https://www.who.int/data/gho/data/indicators/indicator-details/GHO/new-hiv-infections-(per-1000-uninfected-population)
HIV = pd.read_csv(filepath_or_buffer="data/HIV.csv")

### Add the homicide data

In [None]:
# Load the WHO indicator "estimates of rates of homicides per 100 000 population".
# Source: https://www.who.int/data/gho/data/indicators/indicator-details/GHO/estimates-of-rates-of-homicides-per-100-000-population
homicides = pd.read_csv(filepath_or_buffer="data/homicides.csv")

### Add the malaria data

In [None]:
# Load the WHO indicator "malaria - number of reported confirmed cases".
# Source: https://www.who.int/data/gho/data/indicators/indicator-details/GHO/malaria---number-of-reported-confirmed-cases
malaria = pd.read_csv(filepath_or_buffer="data/malaria.csv")

### Add the pollution data

In [None]:
# Load the WHO indicator "primary reliance on clean fuels and technologies (proportion)",
# which this notebook uses as its pollution data.
# Source: https://www.who.int/data/gho/data/indicators/indicator-details/GHO/gho-phe-primary-reliance-on-clean-fuels-and-technologies-proportion
pollution = pd.read_csv(filepath_or_buffer="data/pollution.csv")

### Add the suicide data

In [None]:
# Load the WHO indicator "age-standardized suicide rates (per 100 000 population)".
# Source: https://www.who.int/data/gho/data/indicators/indicator-details/GHO/age-standardized-suicide-rates-(per-100-000-population)
suicide = pd.read_csv(filepath_or_buffer="data/suicide.csv")

### Add the tuberculosis data

In [None]:
# Load the WHO indicator "deaths due to tuberculosis among HIV-negative people
# (per 100 000 population)".
# Source: https://www.who.int/data/gho/data/indicators/indicator-details/GHO/deaths-due-to-tuberculosis-among-hiv-negative-people-(per-100-000-population)
tuberculosis = pd.read_csv(filepath_or_buffer="data/tuberculosis.csv")

### Add the under-five deaths data

In [None]:
# Load the WHO indicator "number of under-five deaths (thousands)".
# Source: https://www.who.int/data/gho/data/indicators/indicator-details/GHO/number-of-under-five-deaths-(thousands)
under_five_deaths = pd.read_csv(filepath_or_buffer="data/under-five_deaths.csv")