# Data Wrangling for datasets | ADSE 2025

### Correlate Anemia Prevalence, Poverty level, and Malnutrition prevalence

In [None]:
import pandas as pd
import numpy as np
import os
import sys
import scipy
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt

#### Anemia Prevalence, 5 and under

In [None]:
Anemia_U5 = pd.read_csv(r'C:\Users\Leon Pascual\Documents\GitHub\CPE311_Computational-Thinking-with-Python\Final\ADSE\datasets\API_SH.ANM.CHLD.ZS_DS2_en_csv_v2_104143\PVA_U5.csv', skiprows=4)
Anemia_U5.head()

In [None]:
# drop empty columns
Anemia_U5 = Anemia_U5.drop(['1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999'], axis=1)

In [None]:
# Step 1: Melt the DataFrame
whole_melted = Anemia_U5.melt(
    id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"], 
    var_name="Year", 
    value_name="Value"
)

# Step 2: Keep only numeric year rows
whole_melted = whole_melted[whole_melted["Year"].str.isdigit()]

# Step 3: Convert Year to integer
whole_melted["Year"] = whole_melted["Year"].astype(int)

# Step 4: Sort the values (optional but useful)
whole_melted = whole_melted.sort_values(by=["Year", "Country Name"])

# Final result
whole_melted


In [None]:
# remove rows w/ missing values
whole_melted = whole_melted.dropna()
whole_melted

In [None]:
# 1. Compute yearly averages from whole_melted
yearly_averages = whole_melted.groupby('Year', as_index=False)['Value'].mean()

# 2. Build your “average” rows
avg_rows = pd.DataFrame({
    'Country Name': 'Average',
    'Country Code': 'AVG',
    'Indicator Name': 'Prevalence of anemia among children (% of children ages 6-59 months)',
    'Indicator Code': 'SH.ANM.CHLD.ZS',
    'Year': yearly_averages['Year'],
    'Value': yearly_averages['Value']
})

# 3. Append to the original melted DataFrame
whole_with_averages = pd.concat([whole_melted, avg_rows], ignore_index=True)

# 4. Round the 'Value' column to two decimals
whole_with_averages['Value'] = whole_with_averages['Value'].round(2)

# 5. Sort by Year (and Country if you like)
whole_with_averages = whole_with_averages.sort_values(by=['Year', 'Country Name'])

# 6. (Optional) View just the average rows
whole_with_averages.query("`Country Name` == 'Average'")


In [None]:
whole_with_averages['Year_Date'] = pd.to_datetime(whole_with_averages['Year'].astype(str) + '-01-01')


In [None]:
whole_with_averages.to_csv(r'C:\Users\Leon Pascual\Documents\GitHub\CPE311_Computational-Thinking-with-Python\Final\ADSE\datasets\exported\whole_with_averages.csv', index=False)

In [None]:
southeast_asia = [
    "Brunei Darussalam",
    "Cambodia",
    "Indonesia",
    "Lao PDR",
    "Malaysia",
    "Myanmar",
    "Philippines",
    "Singapore",
    "Thailand",
    "Timor-Leste",
    "Viet Nam"
]
# filter SEA countries
SEA_Anemia_U5 = Anemia_U5.query("`Country Name` in @southeast_asia")
SEA_Anemia_U5.head(15)

In [None]:
southeast_asia = [
    "Brunei Darussalam",
    "Cambodia",
    "Indonesia",
    "Lao PDR",
    "Malaysia",
    "Myanmar",
    "Philippines",
    "Singapore",
    "Thailand",
    "Timor-Leste",
    "Viet Nam"
]
# filter out SEA countries
World_Anemia_U5 = Anemia_U5.query("`Country Name` not in @southeast_asia")
World_Anemia_U5.head(15)

In [None]:
# melt dataframe to merge column values together
SEA_Anemia_U5_melted = SEA_Anemia_U5.melt(id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"], 
                                   var_name="Year", 
                                   value_name="Value"
                                   )

SEA_Anemia_U5_melted = SEA_Anemia_U5_melted[SEA_Anemia_U5_melted["Year"].str.isdigit()]
SEA_Anemia_U5_melted["Year"] = SEA_Anemia_U5_melted["Year"].astype(int)

SEA_Anemia_U5_melted.sort_values(by=["Year","Country Name"])
SEA_Anemia_U5_melted

In [None]:
# Step 1: Melt the wide-format dataframe to long format
World_Anemia_U5_melted = World_Anemia_U5.melt(
    id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"],
    var_name="Year",
    value_name="Value"
)

# Step 2: Keep only rows where Year is a number
World_Anemia_U5_melted = World_Anemia_U5_melted[World_Anemia_U5_melted["Year"].str.isdigit()]

# Step 3: Convert Year to integer
World_Anemia_U5_melted["Year"] = World_Anemia_U5_melted["Year"].astype(int)

# Step 4: Sort by Year and Country Name (optional, but useful)
World_Anemia_U5_melted = World_Anemia_U5_melted.sort_values(by=["Year", "Country Name"])

# View result
World_Anemia_U5_melted.head()


In [None]:
# remove rows w/ missing values
SEA_Anemia_U5_melted = SEA_Anemia_U5_melted.dropna()
SEA_Anemia_U5_melted

In [None]:
# remove rows w/ missing values
World_Anemia_U5_melted = World_Anemia_U5_melted.dropna()
World_Anemia_U5_melted

In [None]:
SEA_Anemia_U5_melted.dtypes

In [None]:
yearly_averages = SEA_Anemia_U5_melted.groupby('Year', as_index=False)['Value'].mean()

avg_rows = pd.DataFrame({
    'Country Name': 'Average',
    'Country Code': 'AVG',
    'Indicator Name': 'Prevalence of anemia among children (% of children ages 6-59 months)',
    'Indicator Code': 'SH.ANM.CHLD.ZS',
    'Year': yearly_averages['Year'],
    'Value': yearly_averages['Value']
})

df_with_averages = pd.concat([SEA_Anemia_U5_melted, avg_rows], ignore_index=True)

# Round the Value column
df_with_averages['Value'] = df_with_averages['Value'].round(2)

# Optional: sort by year
df_with_averages = df_with_averages.sort_values(by='Year')

df_with_averages.query("`Country Name` == 'Average'")


In [None]:
SEA_Anemia_U5_melted[SEA_Anemia_U5_melted['Year'] == 2019].sort_values('Value', ascending=False).head(1)

In [None]:
# Step 1: Compute yearly averages
yearly_averages = World_Anemia_U5_melted.groupby('Year', as_index=False)['Value'].mean()

# Step 2: Create rows labeled as "Average"
avg_rows = pd.DataFrame({
    'Country Name': 'Average',
    'Country Code': 'AVG',
    'Indicator Name': 'Prevalence of anemia among children (% of children ages 6-59 months)',
    'Indicator Code': 'SH.ANM.CHLD.ZS',
    'Year': yearly_averages['Year'],
    'Value': yearly_averages['Value']
})

# Step 3: Append the average rows to the original DataFrame
World_Anemia_U5_with_averages = pd.concat([World_Anemia_U5_melted, avg_rows], ignore_index=True)

# Step 4: Round the 'Value' column to 2 decimal places
World_Anemia_U5_with_averages['Value'] = World_Anemia_U5_with_averages['Value'].round(2)

# Step 5: Sort (optional)
World_Anemia_U5_with_averages = World_Anemia_U5_with_averages.sort_values(by='Year')

# Step 6: View only the average rows (optional)
World_Anemia_U5_with_averages.query("`Country Name` == 'Average'")


In [None]:
df_with_averages['Year_Date'] = pd.to_datetime(df_with_averages['Year'].astype(str) + '-01-01')


In [None]:
World_Anemia_U5_with_averages['Year_Date'] = pd.to_datetime(World_Anemia_U5_with_averages['Year'].astype(str) + '-01-01')


In [None]:
SEA_Anemia_U5_with_averages = df_with_averages
SEA_Anemia_U5_with_averages.head(15)

In [None]:

World_Anemia_U5_with_averages.head(15)

In [None]:
SEA_Anemia_U5_with_averages.to_csv(r'C:\Users\Leon Pascual\Documents\GitHub\CPE311_Computational-Thinking-with-Python\Final\ADSE\datasets\exported\SEA_Anemia_U5_with_averages.csv', index=False)

In [None]:
World_Anemia_U5_with_averages.to_csv(r'C:\Users\Leon Pascual\Documents\GitHub\CPE311_Computational-Thinking-with-Python\Final\ADSE\datasets\exported\World_Anemia_U5_with_averages.csv', index=False)

#### Anemia Prevalence, women of reproductive age

In [None]:
Anemia_Women = pd.read_csv(r'C:\Users\Leon Pascual\Documents\GitHub\CPE311_Computational-Thinking-with-Python\Final\ADSE\datasets\PVA_WRA_PREG.csv')
Anemia_Women.head()

In [None]:
southeast_asia = [
    "Brunei Darussalam",
    "Cambodia",
    "Indonesia",
    "Lao PDR",
    "Malaysia",
    "Myanmar",
    "Philippines",
    "Singapore",
    "Thailand",
    "Timor-Leste",
    "Viet Nam"
]
# filter SEA countries
SEA_Anemia_Women = Anemia_Women.query("`Location` in @southeast_asia")

In [None]:
# rename columns
SEA_Anemia_Women = SEA_Anemia_Women.rename(columns={
                                                    'Dim1': 'Sex',
                                                    'Dim2': 'Pregnancy Status',
                                                })
# sort values
SEA_Anemia_Women.sort_values(by=['Period','Location','Pregnancy Status'], ascending=True)


In [None]:
SEA_Anemia_Women.columns


In [76]:
SEA_Anemia_Women = SEA_Anemia_Women[['Indicator', 'SpatialDimValueCode', 'Location', 'Period', 'Sex', 'Pregnancy Status', 'FactValueNumeric']]
SEA_Anemia_Women

Unnamed: 0,Indicator,SpatialDimValueCode,Location,Period,Sex,Pregnancy Status,FactValueNumeric
5,Prevalence of anaemia in women of reproductive...,PHL,Philippines,2023,Female,Non-pregnant women (aged 15-49 years),11.6
6,Prevalence of anaemia in women of reproductive...,PHL,Philippines,2023,Female,Women of reproductive age (aged 15-49 years),12.0
90,Prevalence of anaemia in women of reproductive...,SGP,Singapore,2023,Female,Non-pregnant women (aged 15-49 years),16.6
91,Prevalence of anaemia in women of reproductive...,SGP,Singapore,2023,Female,Women of reproductive age (aged 15-49 years),16.6
101,Prevalence of anaemia in women of reproductive...,SGP,Singapore,2023,Female,Pregnant women (aged 15-49 years),16.9
...,...,...,...,...,...,...,...
13842,Prevalence of anaemia in women of reproductive...,MMR,Myanmar,2000,Female,Women of reproductive age (aged 15-49 years),43.6
13853,Prevalence of anaemia in women of reproductive...,KHM,Cambodia,2000,Female,Non-pregnant women (aged 15-49 years),45.6
13859,Prevalence of anaemia in women of reproductive...,KHM,Cambodia,2000,Female,Women of reproductive age (aged 15-49 years),46.4
13913,Prevalence of anaemia in women of reproductive...,MMR,Myanmar,2000,Female,Pregnant women (aged 15-49 years),55.5


In [None]:
SEA_Anemia_Women['Year_Date'] = pd.to_datetime(SEA_Anemia_Women_NPW['Period'].astype(str) + '-01-01')

In [75]:
SEA_Anemia_Women[SEA_Anemia_Women['Pregnancy Status'] == 'Women of reproductive age (aged 15-49 years)'].sort_values('FactValueNumeric', ascending=False).head(1)

Unnamed: 0,Indicator,SpatialDimValueCode,Location,Period,Sex,Pregnancy Status,FactValueNumeric,Year_Date
13859,Prevalence of anaemia in women of reproductive...,KHM,Cambodia,2000,Female,Women of reproductive age (aged 15-49 years),46.4,NaT


In [None]:
SEA_Anemia_Women['Pregnancy Status'].unique()

array(['Non-pregnant women (aged 15-49 years)',
       'Women of reproductive age (aged 15-49 years)',
       'Pregnant women (aged 15-49 years)'], dtype=object)

In [84]:
SEA_Anemia_Women['Indicator'].unique()

array(['Prevalence of anaemia in women of reproductive age (aged 15-49) (%)'],
      dtype=object)

In [78]:
# non-pregnant
SEA_Anemia_Women_ALL = SEA_Anemia_Women[SEA_Anemia_Women['Pregnancy Status'] == 'Women of reproductive age (aged 15-49 years)']
SEA_Anemia_Women_ALL.sort_values(by=['Period','Location',], ascending=True)

Unnamed: 0,Indicator,SpatialDimValueCode,Location,Period,Sex,Pregnancy Status,FactValueNumeric
13448,Prevalence of anaemia in women of reproductive...,BRN,Brunei Darussalam,2000,Female,Women of reproductive age (aged 15-49 years),18.2
13859,Prevalence of anaemia in women of reproductive...,KHM,Cambodia,2000,Female,Women of reproductive age (aged 15-49 years),46.4
13705,Prevalence of anaemia in women of reproductive...,IDN,Indonesia,2000,Female,Women of reproductive age (aged 15-49 years),31.7
13753,Prevalence of anaemia in women of reproductive...,MYS,Malaysia,2000,Female,Women of reproductive age (aged 15-49 years),33.3
13842,Prevalence of anaemia in women of reproductive...,MMR,Myanmar,2000,Female,Women of reproductive age (aged 15-49 years),43.6
...,...,...,...,...,...,...,...
6,Prevalence of anaemia in women of reproductive...,PHL,Philippines,2023,Female,Women of reproductive age (aged 15-49 years),12.0
91,Prevalence of anaemia in women of reproductive...,SGP,Singapore,2023,Female,Women of reproductive age (aged 15-49 years),16.6
142,Prevalence of anaemia in women of reproductive...,THA,Thailand,2023,Female,Women of reproductive age (aged 15-49 years),20.7
357,Prevalence of anaemia in women of reproductive...,TLS,Timor-Leste,2023,Female,Women of reproductive age (aged 15-49 years),29.7


In [80]:
# remove rows w/ missing values
SEA_Anemia_Women_ALL = SEA_Anemia_Women_ALL.dropna()


In [86]:
# 1. Compute yearly averages from whole_melted
AW_yearly_averages = SEA_Anemia_Women_ALL.groupby('Period', as_index=False)['FactValueNumeric'].mean()

# 2. Build your “average” rows
avg_rows = pd.DataFrame({
    'Location': 'Average',
    'SpatialDimValueCode': 'AVG',
    'Indicator': 'Prevalence of anaemia in women of reproductive age (aged 15-49) (%)',

    'Period': AW_yearly_averages['Period'],
    'Value': AW_yearly_averages['FactValueNumeric']
})

# 3. Append to the original melted DataFrame
AW_with_averages = pd.concat([SEA_Anemia_Women_ALL, avg_rows], ignore_index=True)

# 4. Round the 'Value' column to two decimals
AW_with_averages['FactValueNumeric'] = AW_with_averages['FactValueNumeric'].round(2)

# 5. Sort by Year (and Country if you like)
AW_with_averages = AW_with_averages.sort_values(by=['Period', 'Location'])

# 6. (Optional) View just the average rows
AW_with_averages.query("`Location` == 'Average'")


Unnamed: 0,Indicator,SpatialDimValueCode,Location,Period,Sex,Pregnancy Status,FactValueNumeric,Value
240,Prevalence of anaemia in women of reproductive...,AVG,Average,2000,,,,29.16
241,Prevalence of anaemia in women of reproductive...,AVG,Average,2001,,,,28.74
242,Prevalence of anaemia in women of reproductive...,AVG,Average,2002,,,,28.38
243,Prevalence of anaemia in women of reproductive...,AVG,Average,2003,,,,28.04
244,Prevalence of anaemia in women of reproductive...,AVG,Average,2004,,,,27.72
245,Prevalence of anaemia in women of reproductive...,AVG,Average,2005,,,,27.4
246,Prevalence of anaemia in women of reproductive...,AVG,Average,2006,,,,27.12
247,Prevalence of anaemia in women of reproductive...,AVG,Average,2007,,,,26.82
248,Prevalence of anaemia in women of reproductive...,AVG,Average,2008,,,,26.48
249,Prevalence of anaemia in women of reproductive...,AVG,Average,2009,,,,26.13


In [87]:
AW_with_averages['Year_Date'] = pd.to_datetime(AW_with_averages['Period'].astype(str) + '-01-01')
AW_with_averages.head(15)
AW_with_averages.to_csv(r'C:\Users\Leon Pascual\Documents\GitHub\CPE311_Computational-Thinking-with-Python\Final\ADSE\datasets\exported\AW_with_averages.csv', index=False)

In [None]:
SEA_Anemia_Women_NPW.dtypes

In [88]:
# pregnant
SEA_Anemia_Women_PW = SEA_Anemia_Women[SEA_Anemia_Women['Pregnancy Status'] == 'Pregnant women (aged 15-49 years)']
SEA_Anemia_Women_PW.sort_values(by=['Period','Location',], ascending=True)

Unnamed: 0,Indicator,SpatialDimValueCode,Location,Period,Sex,Pregnancy Status,FactValueNumeric
13674,Prevalence of anaemia in women of reproductive...,BRN,Brunei Darussalam,2000,Female,Pregnant women (aged 15-49 years),30.5
13922,Prevalence of anaemia in women of reproductive...,KHM,Cambodia,2000,Female,Pregnant women (aged 15-49 years),56.6
13813,Prevalence of anaemia in women of reproductive...,IDN,Indonesia,2000,Female,Pregnant women (aged 15-49 years),38.6
13785,Prevalence of anaemia in women of reproductive...,MYS,Malaysia,2000,Female,Pregnant women (aged 15-49 years),35.6
13913,Prevalence of anaemia in women of reproductive...,MMR,Myanmar,2000,Female,Pregnant women (aged 15-49 years),55.5
...,...,...,...,...,...,...,...
212,Prevalence of anaemia in women of reproductive...,PHL,Philippines,2023,Female,Pregnant women (aged 15-49 years),24.0
101,Prevalence of anaemia in women of reproductive...,SGP,Singapore,2023,Female,Pregnant women (aged 15-49 years),16.9
207,Prevalence of anaemia in women of reproductive...,THA,Thailand,2023,Female,Pregnant women (aged 15-49 years),23.8
375,Prevalence of anaemia in women of reproductive...,TLS,Timor-Leste,2023,Female,Pregnant women (aged 15-49 years),30.7


In [89]:
# remove rows w/ missing values
SEA_Anemia_Women_PW = SEA_Anemia_Women_PW.dropna()


In [90]:
# 1. Compute yearly averages from whole_melted
PW_yearly_averages = SEA_Anemia_Women_PW.groupby('Period', as_index=False)['FactValueNumeric'].mean()

# 2. Build your “average” rows
avg_rows = pd.DataFrame({
    'Location': 'Average',
    'SpatialDimValueCode': 'AVG',
    'Indicator': 'Prevalence of anaemia in women of reproductive age (aged 15-49) (%)',

    'Period': PW_yearly_averages['Period'],
    'FactValueNumeric': PW_yearly_averages['FactValueNumeric']
})

# 3. Append to the original melted DataFrame
PW_with_averages = pd.concat([SEA_Anemia_Women_PW, avg_rows], ignore_index=True)

# 4. Round the 'Value' column to two decimals
PW_with_averages['FactValueNumeric'] = PW_with_averages['FactValueNumeric'].round(2)

# 5. Sort by Year (and Country if you like)
PW_with_averages = PW_with_averages.sort_values(by=['Period', 'Location'])

# 6. (Optional) View just the average rows
PW_with_averages.query("`Location` == 'Average'")


Unnamed: 0,Indicator,SpatialDimValueCode,Location,Period,Sex,Pregnancy Status,FactValueNumeric
240,Prevalence of anaemia in women of reproductive...,AVG,Average,2000,,,38.87
241,Prevalence of anaemia in women of reproductive...,AVG,Average,2001,,,38.22
242,Prevalence of anaemia in women of reproductive...,AVG,Average,2002,,,37.58
243,Prevalence of anaemia in women of reproductive...,AVG,Average,2003,,,36.94
244,Prevalence of anaemia in women of reproductive...,AVG,Average,2004,,,36.28
245,Prevalence of anaemia in women of reproductive...,AVG,Average,2005,,,35.68
246,Prevalence of anaemia in women of reproductive...,AVG,Average,2006,,,35.07
247,Prevalence of anaemia in women of reproductive...,AVG,Average,2007,,,34.46
248,Prevalence of anaemia in women of reproductive...,AVG,Average,2008,,,33.85
249,Prevalence of anaemia in women of reproductive...,AVG,Average,2009,,,33.25


In [None]:
PW_with_averages['Year_Date'] = pd.to_datetime(PW_with_averages['Period'].astype(str) + '-01-01')
PW_with_averages.head(15)
PW_with_averages.to_csv(r'C:\Users\Leon Pascual\Documents\GitHub\CPE311_Computational-Thinking-with-Python\Final\ADSE\datasets\exported\PW_with_averages.csv', index=False)

In [None]:
SEA_Anemia_Women_NPW[SEA_Anemia_Women_NPW['FactValueNumeric'] == 37.6]

In [None]:
idx

In [None]:
GINI = pd.read_csv(r'C:\Users\Leon Pascual\Documents\GitHub\CPE311_Computational-Thinking-with-Python\Final\ADSE\datasets\API_SI.POV.GINI_DS2_en_csv_v2_131994\GINI Index.csv', skiprows=4)
GINI.head()