In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint

from IPython.display import Markdown, display
from IPython.display import HTML

def printmd(string):
    display(Markdown(string))


In [9]:
all_data = pd.read_csv("./data/WDIData.csv")

In [8]:
all_data

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,Unnamed: 65
0,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,16.511262,16.966046,17.411737,17.808234,18.179760,18.558937,18.949636,19.333168,,
1,Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,32.224027,32.046478,31.323579,33.312163,38.380433,39.754201,42.168241,43.640661,,
2,Africa Eastern and Southern,AFE,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,20.525353,19.461383,17.790698,16.553470,23.907897,24.624725,26.813900,28.841150,,
3,Africa Eastern and Southern,AFE,"Access to electricity, urban (% of urban popul...",EG.ELC.ACCS.UR.ZS,,,,,,,...,66.303599,66.496010,65.828988,66.926692,68.722184,71.085418,71.994933,73.589886,,
4,Africa Eastern and Southern,AFE,Account ownership at a financial institution o...,FX.OWN.TOTL.ZS,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
383833,Zimbabwe,ZWE,Women who believe a husband is justified in be...,SG.VAW.REFU.ZS,,,,,,,...,,,,14.500000,,,,,,
383834,Zimbabwe,ZWE,Women who were first married by age 15 (% of w...,SP.M15.2024.FE.ZS,,,,,,,...,,,,3.700000,,,,5.418352,,
383835,Zimbabwe,ZWE,Women who were first married by age 18 (% of w...,SP.M18.2024.FE.ZS,,,,,,,...,,,33.500000,32.400000,,,,33.658057,,
383836,Zimbabwe,ZWE,Women's share of population ages 15+ living wi...,SH.DYN.AIDS.FE.ZS,,,,,,,...,58.900000,59.200000,59.400000,59.500000,59.700000,59.900000,60.000000,60.200000,60.4,


In [10]:
snapshot = all_data[["Country Name", "Indicator Name", "1960", "1970", "1980", "1990", "2000",  "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020"]]
snapshot

Unnamed: 0,Country Name,Indicator Name,1960,1970,1980,1990,2000,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Africa Eastern and Southern,Access to clean fuels and technologies for coo...,,,,,11.469146,15.658101,16.097824,16.511262,16.966046,17.411737,17.808234,18.179760,18.558937,18.949636,19.333168,
1,Africa Eastern and Southern,Access to electricity (% of population),,,,,20.086007,28.233373,28.381836,32.224027,32.046478,31.323579,33.312163,38.380433,39.754201,42.168241,43.640661,
2,Africa Eastern and Southern,"Access to electricity, rural (% of rural popul...",,,,,8.491610,16.241064,15.295950,20.525353,19.461383,17.790698,16.553470,23.907897,24.624725,26.813900,28.841150,
3,Africa Eastern and Southern,"Access to electricity, urban (% of urban popul...",,,,,56.083452,62.520824,65.470615,66.303599,66.496010,65.828988,66.926692,68.722184,71.085418,71.994933,73.589886,
4,Africa Eastern and Southern,Account ownership at a financial institution o...,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
383833,Zimbabwe,Women who believe a husband is justified in be...,,,,,,,16.900000,,,,14.500000,,,,,
383834,Zimbabwe,Women who were first married by age 15 (% of w...,,,,,,,3.900000,,,,3.700000,,,,5.418352,
383835,Zimbabwe,Women who were first married by age 18 (% of w...,,,,,,,30.500000,,,33.500000,32.400000,,,,33.658057,
383836,Zimbabwe,Women's share of population ages 15+ living wi...,,,,53.2,56.300000,58.500000,58.700000,58.900000,59.200000,59.400000,59.500000,59.700000,59.900000,60.000000,60.200000,60.4


In [None]:
fig, ax = plt.subplots(figsize=(20,10))     
cols = snapshot.columns 
colours = ['#1f497d', '#ffffff'] 
sns.heatmap(snapshot[cols].isnull(), cmap=sns.color_palette(colours))
ax.set_xticklabels(ax.get_xticklabels(),rotation =80)
ax.set_title("Missing Data Distribution")

plt.show()

### 2018 - Seems to have the most data for the more current years

In [12]:
data_2018 = all_data[["Country Name", "Indicator Name", "2018"]]
data = data_2018.rename(columns={'Country Name': 'Country', 'Indicator Name': 'Indicator'})
num_countries = data.Country.nunique()
printmd(f"**Number of countries:** {num_countries}")

**Number of countries:** 266

### Possible indicators for us to use in the project.

For specifics about the indicators, go here: 
https://datacatalog.worldbank.org/search/dataset/0037712/World-Development-Indicators

I tried to use indicators that were percentage of population or per 1,000, so it's relative rather than absolute.

In [132]:
# Economic and access (water, electricity, fuel for cooking) indicators 
cooking_tech =  data[data.Indicator.str.contains("cooking")].reset_index(drop=True)
access_electricity = data[data.Indicator.str.contains("Access to electricity \(% of population\)")].reset_index(drop=True)
income_per_capita = data[data.Indicator.str.contains("net national income per capita")].reset_index(drop=True)
education_expenditure_prim = data[data.Indicator.str.contains("Current education expenditure, primary")].reset_index(drop=True)
drinking_water_services  = data[data.Indicator.str.contains("People using at least basic drinking water services \(% of population")].reset_index(drop=True)
basic_sanitation = data[data.Indicator.str.contains("People using at least basic sanitation services \(% of population\)")].reset_index(drop=True)
nurses_midwives = data[data.Indicator.str.contains("Nurses and midwives")].reset_index(drop=True)
physicians = data[data.Indicator.str.contains("Physicians")].reset_index(drop=True)
undernourishment = data[data.Indicator.str.contains("Prevalence of undernourishment \(% of population")].reset_index(drop=True)
sev_food_insecurity_all = data[data.Indicator.str.contains("Prevalence of severe food insecurity in the population")].reset_index(drop=True)
mod_sev_food_insecurity_all = data[data.Indicator.str.contains("moderate or severe food insecurity in the population")].reset_index(drop=True)
births_skilled_staff = data[data.Indicator.str.contains("Births attended by skilled health staff \(% of total\)")].reset_index(drop=True)

# health indicators
adolescent_fertility_rate = data[data.Indicator.str.contains("Adolescent fertility rate")].reset_index(drop=True)
adolescents_out_of_school = data[data.Indicator.str.contains("Adolescents out of school \(% of lower secondary school age\)")].reset_index(drop=True)
health_expediture = data[data.Indicator.str.contains("Current health expenditure per capita \(current")].reset_index(drop=True)
fertility_rate = data[data.Indicator.str.contains("Fertility rate, total \(births per woman\)")].reset_index(drop=True)
measles_immunizaton = data[data.Indicator.str.contains("measles")].reset_index(drop=True)
life_expectancy_at_birth_female = data[data.Indicator.str.contains("birth, female")].reset_index(drop=True)
life_expectancy_at_birth_male = data[data.Indicator.str.contains("birth, male")].reset_index(drop=True)
life_expectancy_at_birth_total = data[data.Indicator.str.contains("birth, total")].reset_index(drop=True)
mortality_rate_infant = data[data.Indicator.str.contains("Mortality rate, infant \(")].reset_index(drop=True)
mortality_rate_infant_male = data[data.Indicator.str.contains("Mortality rate, infant, male")].reset_index(drop=True)
mortality_rate_infant_female = data[data.Indicator.str.contains("Mortality rate, infant, female")].reset_index(drop=True)
mortality_rate_under_5 = data[data.Indicator.str.contains("Mortality rate, under-5 \(")].reset_index(drop=True)
mortality_rate_under_5_female = data[data.Indicator.str.contains("Mortality rate, under-5, female")].reset_index(drop=True)
mortality_rate_under_5_male = data[data.Indicator.str.contains("Mortality rate, under-5, male")].reset_index(drop=True)
anemia_children = data[data.Indicator.str.contains("anemia among childre")].reset_index(drop=True)
stunting_height_age = data[data.Indicator.str.contains("Prevalence of stunting, height for age \(modeled")].reset_index(drop=True)
overweight_children_under_5 = data[data.Indicator.str.contains("overweight \(modeled")].reset_index(drop=True)


# education
children_out_of_school = data[data.Indicator.str.contains("Children out of school")].reset_index(drop=True)
children_out_of_school_prim = data[data.Indicator.str.contains("Children out of school, primary")].reset_index(drop=True)
compulsory_education = data[data.Indicator.str.contains("Compulsory education")].reset_index(drop=True)
lower_secondary_school_starting_age = data[data.Indicator.str.contains("secondary school starting age")].reset_index(drop=True)
primary_edu_duration = data[data.Indicator.str.contains("Primary education, duration \(years]\)")].reset_index(drop=True)
primary_starting_age = data[data.Indicator.str.contains("Primary school starting age \(years\)")].reset_index(drop=True)
probability_dying_10_14 = data[data.Indicator.str.contains("Probability of dying among adolescents ages 10-14 years \(per 1,000\)")].reset_index(drop=True)
probability_dying_15_19 = data[data.Indicator.str.contains("Probability of dying among adolescents ages 15-19 years \(per 1,000\)")].reset_index(drop=True)
probability_dying_5_9 = data[data.Indicator.str.contains("Probability of dying among children ages 5-9 years \(per 1,000\)")].reset_index(drop=True)
school_enrollment_preprimary =data[data.Indicator.str.contains("School enrollment, preprimary \(% gross\)")].reset_index(drop=True)

school_enrollment_primary = data[data.Indicator.str.contains("School enrollment, primary \(% gross\)")].reset_index(drop=True)
school_enrollment_secondary = data[data.Indicator.str.contains("School enrollment, secondary \(% gross\)")].reset_index(drop=True)

pre_primary_duration = data[data.Indicator.str.contains("Preprimary education, duration \(years\)")].reset_index(drop=True)
primary_ed_duration = data[data.Indicator.str.contains("Primary education, duration")].reset_index(drop=True)


# Additional Indicators we can add are listed at the bottom
population_14 = data[data.Indicator.str.contains("Population ages 0-14 \(% of total ")].reset_index(drop=True)



### How much data is missing per indicator

Below I checked the output of each indicator to see what percentage of data is missing. For example, for "Access to clearn fuels and technologies for cooking" - (cooking_tech), there is no data for 11% of the countries in the dataset. I think a good line in the sand is to focus on indicators for which at least 20% of the countries have data. Thoughts? **There are a few that don't meet this criteria below.



In [133]:
undernourishment

Unnamed: 0,Country,Indicator,2018
0,Africa Eastern and Southern,Prevalence of undernourishment (% of population),23.405499
1,Africa Western and Central,Prevalence of undernourishment (% of population),12.420730
2,Arab World,Prevalence of undernourishment (% of population),13.607920
3,Caribbean small states,Prevalence of undernourishment (% of population),6.992515
4,Central Europe and the Baltics,Prevalence of undernourishment (% of population),2.605544
...,...,...,...
261,Virgin Islands (U.S.),Prevalence of undernourishment (% of population),
262,West Bank and Gaza,Prevalence of undernourishment (% of population),
263,"Yemen, Rep.",Prevalence of undernourishment (% of population),45.400000
264,Zambia,Prevalence of undernourishment (% of population),


In [136]:
# Economic and access (water, electricity, fuel for cooking) indicators 
indicators = [cooking_tech,
    access_electricity,
    income_per_capita,
    education_expenditure_prim,
    drinking_water_services,
    basic_sanitation,
    nurses_midwives,
    physicians,
    births_skilled_staff,
    undernourishment,
    sev_food_insecurity_all,
    mod_sev_food_insecurity_all,
    adolescent_fertility_rate,
    adolescents_out_of_school,
    health_expediture ,
    fertility_rate ,
    measles_immunizaton ,
    life_expectancy_at_birth_female,
    life_expectancy_at_birth_male, 
    life_expectancy_at_birth_total,
    mortality_rate_infant,
    mortality_rate_infant_male,
    mortality_rate_infant_female,
    mortality_rate_under_5,
    mortality_rate_under_5_female,
    mortality_rate_under_5_male,
    anemia_children,
    stunting_height_age,
    overweight_children_under_5,


    children_out_of_school,
    children_out_of_school_prim ,
    compulsory_education,
    lower_secondary_school_starting_age,
    primary_edu_duration,
    primary_starting_age,
    probability_dying_10_14,
    probability_dying_15_19,
    probability_dying_5_9,
    school_enrollment_preprimary,

    school_enrollment_primary,
    school_enrollment_secondary,

    pre_primary_duration,
    primary_ed_duration,
    population_14]



In [135]:
for i in indicators:
    pct_missing = np.mean(i["2018"].isnull())
    printmd(f" **{i.Indicator[0]}:**  {round(pct_missing*100)}%")



 **Access to clean fuels and technologies for cooking (% of population):**  11%

 **Access to electricity (% of population):**  1%

 **Adjusted net national income per capita (annual % growth):**  25%

 **Current education expenditure, primary (% of total expenditure in primary public institutions):**  68%

 **People using at least basic drinking water services (% of population):**  6%

 **People using at least basic sanitation services (% of population):**  8%

 **Nurses and midwives (per 1,000 people):**  54%

 **Physicians (per 1,000 people):**  64%

 **Births attended by skilled health staff (% of total):**  59%

 **Prevalence of undernourishment (% of population):**  21%

 **Prevalence of severe food insecurity in the population (%):**  52%

 **Prevalence of moderate or severe food insecurity in the population (%):**  52%

 **Adolescent fertility rate (births per 1,000 women ages 15-19):**  9%

 **Adolescents out of school (% of lower secondary school age):**  40%

 **Current health expenditure per capita (current US$):**  12%

 **Fertility rate, total (births per woman):**  7%

 **Immunization, measles (% of children ages 12-23 months):**  10%

 **Life expectancy at birth, female (years):**  8%

 **Life expectancy at birth, male (years):**  8%

 **Life expectancy at birth, total (years):**  8%

 **Mortality rate, infant (per 1,000 live births):**  9%

 **Mortality rate, infant, male (per 1,000 live births):**  9%

 **Mortality rate, infant, female (per 1,000 live births):**  9%

 **Mortality rate, under-5 (per 1,000 live births):**  9%

 **Mortality rate, under-5, female (per 1,000 live births):**  9%

 **Mortality rate, under-5, male (per 1,000 live births):**  9%

 **Prevalence of anemia among children (% of children ages 6-59 months):**  11%

 **Prevalence of stunting, height for age (modeled estimate, % of children under 5):**  25%

 **Prevalence of overweight (modeled estimate, % of children under 5):**  25%

 **Children out of school (% of primary school age):**  48%

 **Children out of school, primary:**  49%

 **Compulsory education, duration (years):**  8%

 **Lower secondary school starting age (years):**  21%

 **Primary school starting age (years):**  21%

 **Probability of dying among adolescents ages 10-14 years (per 1,000):**  9%

 **Probability of dying among adolescents ages 15-19 years (per 1,000):**  9%

 **Probability of dying among children ages 5-9 years (per 1,000):**  9%

 **School enrollment, preprimary (% gross):**  27%

 **School enrollment, primary (% gross):**  27%

 **School enrollment, secondary (% gross):**  36%

 **Preprimary education, duration (years):**  7%

 **Primary education, duration (years):**  3%

 **Population ages 0-14 (% of total population):**  9%

In [None]:
for col in cooking_tech.columns:
    print(col)

In [None]:
for col in cooking_tech.columns:
    pct_missing = np.mean(cooking_tech[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")

In [None]:
for col in access_electricity.columns:
    pct_missing = np.mean(access_electricity[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")

In [None]:
for col in income_per_capita.columns:
    pct_missing = np.mean(income_per_capita[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")

In [None]:
for col in adolescent_fertility_rate.columns:
    pct_missing = np.mean(adolescent_fertility_rate[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")

In [None]:
for col in adolescents_out_of_school.columns:
    pct_missing = np.mean(adolescents_out_of_school[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")

In [None]:
for col in children_out_of_school.columns:
    pct_missing = np.mean(children_out_of_school[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")

In [None]:
for col in children_out_of_school_prim.columns:
    pct_missing = np.mean(children_out_of_school_prim[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")

In [None]:
for col in compulsory_education.columns:
    pct_missing = np.mean(compulsory_education[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")

In [None]:
for col in education_expenditure_prim.columns:
    pct_missing = np.mean(education_expenditure_prim[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")


In [None]:
for col in health_expediture.columns:
    pct_missing = np.mean(health_expediture[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")


In [None]:
for col in fertility_rate.columns:
    pct_missing = np.mean(fertility_rate[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")


In [None]:
for col in measles_immunizaton.columns:
    pct_missing = np.mean(measles_immunizaton[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")

In [None]:
for col in life_expectancy_at_birth_female.columns:
    pct_missing = np.mean(life_expectancy_at_birth_female[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")

In [None]:
for col in life_expectancy_at_birth_male.columns:
    pct_missing = np.mean(life_expectancy_at_birth_male[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")

In [None]:
for col in life_expectancy_at_birth_total.columns:
    pct_missing = np.mean(life_expectancy_at_birth_total[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")
    

In [None]:
for col in lower_secondary_school_starting_age.columns:
    pct_missing = np.mean(lower_secondary_school_starting_age[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")
    

In [None]:
for col in mortality_rate_infant.columns:
    pct_missing = np.mean(mortality_rate_infant[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")
    

In [None]:
for col in mortality_rate_infant_male.columns:
    pct_missing = np.mean(mortality_rate_infant[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")
    

In [None]:
for col in mortality_rate_infant_female.columns:
    pct_missing = np.mean(mortality_rate_infant[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")
    

In [None]:
for col in mortality_rate_under_5.columns:
    pct_missing = np.mean(mortality_rate_under_5[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")

In [None]:
for col in mortality_rate_under_5_female.columns:
    pct_missing = np.mean(mortality_rate_under_5_female[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")

In [None]:
for col in mortality_rate_under_5_male.columns:
    pct_missing = np.mean(mortality_rate_under_5_male[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")

In [None]:

for col in nurses_midwives.columns:
    pct_missing = np.mean(nurses_midwives[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")

In [None]:
for col in drinking_water_services.columns:
    pct_missing = np.mean(drinking_water_services[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")

In [None]:
for col in basic_sanitation.columns:
    pct_missing = np.mean(basic_sanitation[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")

In [None]:
for col in physicians.columns:
    pct_missing = np.mean(physicians[col].isnull())
    print(f"{col} Percentage of data missing - {round(pct_missing*100)}%")


### Some additional indicators to consider: 

Population ages 00-04, female (% of female population)

Population ages 00-04, male (% of male population)

Population ages 0-14 (% of total population)


Population ages 0-14, female (% of female population)

Population ages 0-14, male (% of male population)

Population ages 05-09, female (% of female population)

Population ages 05-09, male (% of male population)

Population ages 10-14, female (% of female population)

Population ages 10-14, male (% of male population)

Population ages 15-19, female (% of female population)

Population ages 15-19, male (% of male population)


Preprimary education, duration (years)

Prevalence of anemia among children (% of children ages 6-59 months)

Prevalence of moderate or severe food insecurity in the population (%)

Prevalence of overweight (modeled estimate, % of children under 5)

Prevalence of overweight, weight for height (% of children under 5)

Prevalence of severe food insecurity in the population (%)

Prevalence of stunting, height for age (modeled estimate, % of children under 5)

Prevalence of undernourishment (% of population)

Primary education, duration (years)

Primary education, pupils

Primary school starting age (years)

Probability of dying among adolescents ages 10-14 years (per 1,000)

Probability of dying among adolescents ages 15-19 years (per 1,000)

Probability of dying among children ages 5-9 years (per 1,000)


Gross or net of these: 

School enrollment, preprimary (% gross)

School enrollment, preprimary, female (% gross)

School enrollment, preprimary, male (% gross)

School enrollment, primary (% gross)

School enrollment, primary (% net)

School enrollment, primary (gross), gender parity index (GPI)

School enrollment, primary, female (% gross)

School enrollment, primary, female (% net)

School enrollment, primary, male (% gross)

School enrollment, primary, male (% net)

School enrollment, primary, private (% of total primary)

School enrollment, secondary (% gross)

School enrollment, secondary (% net)

School enrollment, secondary, female (% gross)

School enrollment, secondary, female (% net)

School enrollment, secondary, male (% gross)

School enrollment, secondary, male (% net)
