<h1> Final Project 1 Jupyter Notebook: India, Pakistan, and Bangladesh </h1>
<h2> Group 1: Dorothy Thomas </h2>
<br/>
<p>Authors: Rishi Boddu, Kita Hu, Sage Tulabing, Leanna Baltonado</p>
<br/>
<p>In this Jupyter Notebook, we introduce population functions, population pyramid functions, and more in order to analyze the populations of India, Pakistan, and Bangladesh. We primarily use the World Bank population estimates, accessed through the <code>wbdata</code> package, to compile population counts for each age group, sex, year, and country. We cite the information from WB Data: https://wbdata.readthedocs.io. We primarily aim to explore these populations using graphs and visualizations.</p>

In [67]:
# installations and importing data

%pip install plotly
%pip install wbdata
%pip install eep153_tools
%pip install python_gnupg
%pip install -U gspread_pandas

from __future__ import annotations
import time
from functools import lru_cache
from typing import Literal
import wbdata
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt

In [68]:
# Display the 'id' field of the first country record returned for the query "India".
wbdata.get_countries(query="India")[0]['id']

In [69]:
# Display the 'id' field of the first country record returned for the query "Pakistan".
wbdata.get_countries(query="Pakistan")[0]['id']

In [70]:
# Display the 'id' field of the first country record returned for the query "Bangladesh".
wbdata.get_countries(query="Bangladesh")[0]['id']

In [71]:
# Restrict the indicator listing to a single World Bank source rather than
# calling wbdata.get_indicators() with no filter.
SOURCE = 40 # "Population estimates and projections"

indicators = wbdata.get_indicators(source=SOURCE)
indicators

<h1> [A] Population Dataframe</h1>

<h4> Helper Functions </h4>

In [72]:
#Helper Functions

# Values accepted for the ``sex`` argument of the functions in this notebook.
Sex = Literal["female", "male", "total"]

# Age-bin code -> inclusive (low, high) age bounds.
# The closed bins are the sixteen 5-year groups 0-4, 5-9, ..., 75-79; the
# open-ended top bin "80UP" is added afterwards with an upper bound of 200.
BIN_TO_RANGE = {
    f"{lo:02d}{lo + 4:02d}": (lo, lo + 4)
    for lo in range(0, 80, 5)
}
BIN_TO_RANGE["80UP"] = (80, 200)

# Bin codes from youngest to oldest (dicts keep insertion order).
AGE_BINS_5Y = list(BIN_TO_RANGE)

def bins_for_age_range(start_age: int, end_age: int) -> list[str]:
    """Return the codes of every age bin that overlaps [start_age, end_age].

    Bins are taken from BIN_TO_RANGE in its stored order; a bin is kept when
    its inclusive (low, high) bounds intersect the requested inclusive range.

    >>> bins_for_age_range(start_age=0, end_age=9)
    ['0004', '0509']
    """
    overlapping = []
    for code, (lo, hi) in BIN_TO_RANGE.items():
        # Two inclusive intervals intersect when each starts before the other ends.
        if hi >= start_age and lo <= end_age:
            overlapping.append(code)
    return overlapping

def wb_indicator_5y(age_code: str, sex: str) -> str:
    """Build the population indicator code for one age bin and one sex.

    Parameters
    ----------
    age_code : str
        Age-bin code such as "0004" or "80UP"; inserted into the result as is.
    sex : str
        "female" or "male" (surrounding whitespace and letter case are ignored).

    Returns
    -------
    str
        "SP.POP.{age_code}.FE" for females, "SP.POP.{age_code}.MA" for males.

    Raises
    ------
    TypeError
        If ``sex`` is not a string.
    ValueError
        If ``sex`` is anything other than "female" or "male". Previously every
        value other than exactly "female" (e.g. "total", "Female", a typo)
        silently produced the male indicator code.

    >>> wb_indicator_5y(age_code='0004', sex='female')
    'SP.POP.0004.FE'
    """
    if not isinstance(sex, str):
        raise TypeError("sex must be a string")

    sex_codes = {"female": "FE", "male": "MA"}
    normalized = sex.strip().lower()
    if normalized not in sex_codes:
        raise ValueError("sex must be 'female' or 'male' for a 5-year indicator")
    return f"SP.POP.{age_code}.{sex_codes[normalized]}"

<h4> Validation Functions </h4>

In [73]:
#validation functions 
def _validate_years(years: tuple[int, int]) -> None:
    if (not isinstance(years, tuple)) or len(years) != 2:
        raise TypeError("years must be a tuple of (start_year, end_year)")
    y0, y1 = years
    if not (isinstance(y0, int) and isinstance(y1, int)):
        raise TypeError("years must contain integers")
    if y0 > y1:
        raise ValueError("years must satisfy start_year <= end_year")
    if y0 < 1960 or y1 > 2024:
        raise ValueError("years must be between 1960 and 2024, inclusive")


def _validate_area(area: str) -> str:
    if not isinstance(area, str):
        raise TypeError("area must be a string")
    area = area.strip()
    if not area:
        raise ValueError("area must be a non-empty string")
    return area


def _validate_sex(sex: str) -> Sex:
    if not isinstance(sex, str):
        raise TypeError("sex must be a string")
    sex = sex.strip().lower()
    if sex not in ("female", "male", "total"):
        raise ValueError("sex must be 'female', 'male', or 'total'")
    return sex  # type: ignore[return-value]


def _validate_age_range(age_range: tuple[int, int]) -> tuple[int, int]:
    if (not isinstance(age_range, tuple)) or len(age_range) != 2:
        raise TypeError("age_range must be a tuple (low, high)")
    low, high = age_range
    if not (isinstance(low, int) and isinstance(high, int)):
        raise TypeError("age_range bounds must be integers")
    if low < 0 or high < 0:
        raise ValueError("age_range bounds must be non-negative")
    if low > high:
        raise ValueError("age_range must satisfy low <= high")

    # Exact 5-year boundary validation:
    # valid ranges are unions of 5-year bins, so:
    #   low must be multiple of 5
    #   high must be 4 mod 5
    if (low % 5) != 0 or (high % 5) != 4:
        raise ValueError(
            "age_range must align to 5-year bins, e.g. (0,4), (0,9), (5,14), (15,19)"
        )

    # Optional: enforce max bound if your bins top out (e.g., 100+)
    # You can skip this if AGE_BINS_5Y already handles it robustly.
    return low, high

<h4> The Actual Population Dataframe Function </h4>

In [74]:
@lru_cache(maxsize=128)
def _population_df_cached(years: tuple[int, int], area: str, chunk: int) -> pd.DataFrame:
    """Download and assemble the population frame for already-validated arguments.

    Results are memoised on (years, area, chunk). The cached DataFrame object
    is shared between cache hits, so it must never be mutated; ``population_df``
    hands out copies for that reason.
    """
    # The name is escaped and wrapped in ^...$ — presumably so that only an
    # exact name match is returned by the country search.
    countries = wbdata.get_countries(query=f"^{re.escape(area)}$")
    if not countries:
        raise ValueError(f"No country matched area={area!r}")
    country_id = countries[0]["id"]

    max_attempts = 5

    def fetch(sub_bins: list[str]) -> pd.DataFrame:
        """Request the female and male series for ``sub_bins``, retrying on failure."""
        # Map indicator code -> column label, e.g. "SP.POP.0004.FE" -> "female_0004".
        indicators: dict[str, str] = {}
        for b in sub_bins:
            indicators[wb_indicator_5y(b, "female")] = f"female_{b}"
            indicators[wb_indicator_5y(b, "male")] = f"male_{b}"

        last_err: Exception | None = None
        for attempt in range(max_attempts):
            try:
                # Note: skip_cache=True means "do not use wbdata's cache".
                # Our lru_cache handles caching at the function level.
                return wbdata.get_dataframe(
                    indicators,
                    country=country_id,
                    parse_dates=True,
                    skip_cache=True,
                )
            except Exception as e:
                last_err = e
                # Back off 0.5s, 1s, 2s, 4s between attempts. There is no point
                # in waiting after the final attempt, so fail immediately then.
                if attempt < max_attempts - 1:
                    time.sleep(0.5 * (2 ** attempt))
        assert last_err is not None
        raise last_err

    # Request the bins in groups of ``chunk`` and join the pieces column-wise.
    parts: list[pd.DataFrame] = []
    for i in range(0, len(AGE_BINS_5Y), chunk):
        parts.append(fetch(AGE_BINS_5Y[i : i + chunk]))

    df = pd.concat(parts, axis=1)

    # Ensure datetime-like index
    if not isinstance(df.index, pd.DatetimeIndex):
        df.index = pd.to_datetime(df.index.astype(str), errors="coerce")

    y0, y1 = years
    df = df[(df.index.year >= y0) & (df.index.year <= y1)]

    # Consistent column order: female bins then male bins
    df = df[[f"female_{b}" for b in AGE_BINS_5Y] + [f"male_{b}" for b in AGE_BINS_5Y]]

    # Index by plain integer year instead of timestamps.
    df.index = df.index.year
    return df


def population_df(years: tuple[int, int], area: str, chunk: int = 4) -> pd.DataFrame:
    """
    Example: india_pop_df = population_df(years=(2018, 2020), area="India")
    Returns a DataFrame with male and female counts of each 5-year age range.
    Please enter a year between 1960 and 2024.

    Parameters
    ----------
    years : tuple[int, int]
        Inclusive (start_year, end_year), both within 1960-2024.
    area : str
        Country name to look up (surrounding whitespace is ignored).
    chunk : int, default 4
        Number of age bins requested per call to the data service.

    Returns
    -------
    pd.DataFrame
        Indexed by integer year, with columns ``female_{bin}`` for every bin
        followed by ``male_{bin}`` for every bin. Each call returns an
        independent copy, so callers may modify the result without affecting
        the cached data or other callers.

    Raises
    ------
    TypeError
        If ``years`` is not a tuple of two ints or ``area`` is not a string.
    ValueError
        If the years are out of range or reversed, ``area`` is blank or
        matches no country, or ``chunk`` is not a positive integer.

    Notes
    -----
    Arguments are validated *before* the cache lookup, so an unhashable
    ``years`` (e.g. a list) gets the intended validation error. The cache can
    still be managed through ``population_df.cache_clear()`` and
    ``population_df.cache_info()``.
    """
    _validate_years(years)
    area = _validate_area(area)
    if not isinstance(chunk, int) or chunk <= 0:
        raise ValueError("chunk must be a positive integer")

    # Copy so that the memoised frame is never exposed to mutation by callers.
    return _population_df_cached(years, area, chunk).copy()


# Keep the lru_cache management hooks available on the public name.
population_df.cache_clear = _population_df_cached.cache_clear
population_df.cache_info = _population_df_cached.cache_info

<h4> Example Usage </h4>

In [75]:
# India: counts per 5-year age bin and sex for 2018 through 2020
india_pop_df = population_df(years=(2018, 2020), area="India")
india_pop_df

In [76]:
# Pakistan: counts per 5-year age bin and sex for 2018 through 2020
pakistan_pop_df = population_df(years=(2018, 2020), area="Pakistan")
pakistan_pop_df

In [77]:
# Bangladesh: counts per 5-year age bin and sex for 2018 through 2020
bangladesh_pop_df = population_df(years=(2018, 2020), area="Bangladesh")
bangladesh_pop_df

<h1>[A] Population Statistics</h1>

<p> This function answers the question: In [year] how many [people/males/females] aged [low] to [high] were living in [the world/region/country]? </p>

In [78]:
def population_stats(
    year: int,
    sex: str,
    age_range: tuple[int, int],
    area: str,
) -> int:
    """
    Return, as an integer, how many people of the given sex and age range
    lived in ``area`` in ``year``.

    Parameters
    ----------
    year : int
        A single year between 1960 and 2024.
    sex : str
        'female', 'male', or 'total' (females plus males).
    age_range : tuple[int, int]
        Inclusive (low, high) bounds that line up with the 5-year bins,
        e.g. (0, 4), (5, 9), (5, 14), (15, 29).
    area : str
        Country name passed on to ``population_df``.

    Raises
    ------
    TypeError
        If ``year`` is not an int (or another argument has the wrong type).
    ValueError
        If an argument is out of range, no data comes back for the year, or
        the summed count is NaN.

    Example:
    pakistan_pop_stats = population_stats(year= 2000,
                     sex= 'total',
                     age_range= (15, 29),
                     area= 'Pakistan')
    """
    if not isinstance(year, int):
        raise TypeError("year must be an int")
    if not (1960 <= year <= 2024):
        raise ValueError("year must be between 1960 and 2024")

    sex_v = _validate_sex(sex)
    low, high = _validate_age_range(age_range)
    area = _validate_area(area)

    frame = population_df((year, year), area)
    if frame.empty:
        raise ValueError(f"No data returned for {area!r} in {year}")

    # A single-year request: the first row is the requested year.
    year_row = frame.iloc[0]

    def bin_count(code):
        """Value of one age bin for the requested sex."""
        if sex_v == "female":
            return year_row[f"female_{code}"]
        if sex_v == "male":
            return year_row[f"male_{code}"]
        # total
        return year_row[f"female_{code}"] + year_row[f"male_{code}"]

    count = sum(bin_count(code) for code in bins_for_age_range(low, high))

    # Be robust to floats/NaNs coming back from wbdata
    if pd.isna(count):
        raise ValueError("Population count is NaN (missing data for one or more bins)")
    return int(count)

# formatted population
def format_population_stats_sentence(
    year: int,
    sex: str,
    age_range: tuple[int, int],
    area: str,
) -> str:
    """
    Wrap ``population_stats`` and return its result as a sentence that answers:
    In [year] how many [people/males/females] aged [low] to [high]
    were living in [the world/region/country]?

    The sex, age range and area are validated here (and normalised) before
    the count is looked up; the count is printed with thousands separators.
    """
    clean_sex = _validate_sex(sex)
    age_lo, age_hi = _validate_age_range(age_range)
    place = _validate_area(area)

    head_count = population_stats(year, clean_sex, (age_lo, age_hi), place)

    # Noun used in the sentence for each accepted sex value.
    nouns = {"female": "females", "male": "males", "total": "people"}
    noun = nouns[clean_sex]
    return f"In {year}, there were {head_count:,} {noun} aged {age_lo} to {age_hi} living in {place}."

<h4> Example Usage of the Population Function </h4>

In [79]:
# Females aged 15-29 in India in 2000, as a formatted sentence
india_pop_stats = format_population_stats_sentence(
    year=2000,
    sex="female",
    age_range=(15, 29),
    area="India",
)
india_pop_stats

In [80]:
# Everyone aged 15-29 in Pakistan in 2000, as a formatted sentence
pakistan_pop_stats = format_population_stats_sentence(
    year=2000,
    sex="total",
    age_range=(15, 29),
    area="Pakistan",
)
pakistan_pop_stats

In [87]:
# Females aged 0-9 in India in 1974, as a formatted sentence
f = format_population_stats_sentence(
    year=1974,
    sex="female",
    age_range=(0, 9),
    area="India",
)
f

<h1> [A] Unit tests </h1>

In [104]:
# fake wbdata for mock data
def _fake_get_countries(query):
    return [{"id": "IND"}]

def _fake_get_dataframe(indicators, country, parse_dates=True, skip_cache=True):
    """
    Return a predictable DataFrame with exactly the columns requested.
    Your population_df renames indicator codes into columns like female_0004, male_0004, etc.
    """
    idx = pd.to_datetime(["1974-01-01", "1980-01-01"])
    cols = list(indicators.values())
    df = pd.DataFrame(index=idx, columns=cols, dtype=float)

    # female_* = 10, male_* = 20 for all years
    for c in cols:
        df[c] = 10 if c.startswith("female_") else 20

    return df


# tests for population_df

def test_population_df_basic_shape_and_columns():
    """population_df returns integer-year rows and the expected female_/male_ columns.

    wbdata's network functions are swapped for the fakes for the duration of
    the test and restored afterwards, whether or not an assertion fails.
    """
    import wbdata
    old_countries = wbdata.get_countries
    old_dataframe = wbdata.get_dataframe

    wbdata.get_countries = _fake_get_countries
    wbdata.get_dataframe = _fake_get_dataframe

    try:
        # Start from an empty cache so previously fetched real data is not reused.
        if hasattr(population_df, "cache_clear"):
            population_df.cache_clear()

        df = population_df((1974, 1980), "India", chunk=10)

        assert 1974 in df.index, "1974 should be in the DataFrame index"
        assert 1980 in df.index, "1980 should be in the DataFrame index"

        assert "female_0004" in df.columns, "female_0004 column missing"
        assert "male_0004" in df.columns, "male_0004 column missing"
        assert len(df.columns) == 2 * len(AGE_BINS_5Y), "expected one female and one male column per age bin"

        assert np.isfinite(df.loc[1974, "female_0004"]), "female_0004 is not numeric"
        assert np.isfinite(df.loc[1974, "male_0004"]), "male_0004 is not numeric"

    finally:
        wbdata.get_countries = old_countries
        wbdata.get_dataframe = old_dataframe
        # Drop the frames built from fake data; otherwise later cells calling
        # population_df with the same arguments would get mock values from the cache.
        if hasattr(population_df, "cache_clear"):
            population_df.cache_clear()


def test_population_df_rejects_bad_years_type():
    """Passing ``years`` as a list instead of a tuple must raise TypeError."""
    bad_years = [1974, 1980]  # list instead of tuple
    try:
        population_df(bad_years, "India")
    except TypeError:
        pass
    else:
        assert False, "Expected TypeError for non-tuple years"


# tests for format_population_stats_sentence

# helper method
def _extract_number_from_sentence(s):
    """
    Works for your exact format:
    'In 1974, there were 83,636,331 females aged 0 to 9 living in India.'
    """
    m = re.search(r"there were ([\d,]+)", s)
    assert m is not None, f"Could not find a number in sentence: {s}"
    return int(m.group(1).replace(",", ""))

def test_population_sentence_uses_df_correctly():
    """The sentence reports the sum of the right bins for female, male and total.

    wbdata's network functions are swapped for the fakes for the duration of
    the test and restored afterwards, whether or not an assertion fails.
    """
    import wbdata
    old_countries = wbdata.get_countries
    old_dataframe = wbdata.get_dataframe

    wbdata.get_countries = _fake_get_countries
    wbdata.get_dataframe = _fake_get_dataframe

    try:
        # Start from an empty cache so previously fetched real data is not reused.
        if hasattr(population_df, "cache_clear"):
            population_df.cache_clear()

        # for ages 0–9 => bins 0004 and 0509
        # females: 10 + 10 = 20
        # males:   20 + 20 = 40
        # total:   60
        female_sentence = format_population_stats_sentence(1974, "female", (0, 9), "India")
        male_sentence   = format_population_stats_sentence(1974, "male",   (0, 9), "India")
        total_sentence  = format_population_stats_sentence(1974, "total",  (0, 9), "India")

        female_num = _extract_number_from_sentence(female_sentence)
        male_num   = _extract_number_from_sentence(male_sentence)
        total_num  = _extract_number_from_sentence(total_sentence)

        assert female_num == 20, f"Expected 20, got {female_num} | sentence: {female_sentence}"
        assert male_num == 40,   f"Expected 40, got {male_num} | sentence: {male_sentence}"
        assert total_num == 60,  f"Expected 60, got {total_num} | sentence: {total_sentence}"

    finally:
        wbdata.get_countries = old_countries
        wbdata.get_dataframe = old_dataframe
        # The calls above cached a frame of fake values for India in 1974.
        # Clear it so later cells asking for the same year get real data.
        if hasattr(population_df, "cache_clear"):
            population_df.cache_clear()

def test_population_sentence_rejects_bad_sex():
    """An unrecognised ``sex`` value must raise ValueError."""
    try:
        format_population_stats_sentence(1974, "unknown", (0, 4), "India")
    except ValueError:
        pass
    else:
        assert False, "Expected ValueError for invalid sex"


In [105]:
# running tests

def run_unit_tests():
    """Run each unit test in order; print a success line if none of them raises."""
    tests = (
        test_population_df_basic_shape_and_columns,
        test_population_df_rejects_bad_years_type,
        test_population_sentence_uses_df_correctly,
        test_population_sentence_rejects_bad_sex,
    )
    for test in tests:
        test()
    print("✅ All simple unit tests passed!")

run_unit_tests()

✅ All simple unit tests passed!


<h1> [B] Population Pyramids </h1>

In [98]:
# Codes of the 5-year age bins, youngest first; "80UP" is the open-ended top bin.
AGE_BINS_5Y = (
    "0004 0509 1014 1519 2024 2529 3034 3539 "
    "4044 4549 5054 5559 6064 6569 7074 7579 80UP"
).split()

def _age_label(b):
    if b == "80UP":
        return "80+"
    lo = int(b[:2])
    hi = int(b[2:])
    return f"{lo}-{hi}"

def plot_population_pyramid_5y(df, country_name, year, scale=1e6, ax=None):
    """
    Draw a population pyramid (males to the left, females to the right) for one year.

    Parameters
    ----------
    df : DataFrame indexed by year (int), with columns: female_{bin}, male_{bin}
        for every bin in AGE_BINS_5Y.
    country_name : str
        Used in the plot title only.
    year : int
        Row of ``df`` to plot.
    scale : float, default 1e6
        Counts are divided by this before plotting; the x label reads
        "millions" when it is 1e6 and "units" otherwise.
    ax : matplotlib Axes, optional
        Axes to draw on. A new 9x7 figure is created when omitted.

    Returns
    -------
    The Axes that was drawn on.

    Raises
    ------
    ValueError
        If ``year`` is not in ``df.index``.
    """
    # Local import: the notebook's import cell only brings in matplotlib.pyplot,
    # so FuncFormatter would otherwise be undefined on a fresh kernel.
    from matplotlib.ticker import FuncFormatter

    # Validate before creating anything, so a bad year does not leave an
    # empty figure behind.
    if year not in df.index:
        raise ValueError(f"{year} not in df.index. Available years: {df.index.min()}–{df.index.max()}")

    if ax is None:
        _, ax = plt.subplots(figsize=(9, 7))

    row = df.loc[year].copy()

    female_cols = [f"female_{b}" for b in AGE_BINS_5Y]
    male_cols   = [f"male_{b}"   for b in AGE_BINS_5Y]

    # Ensure numeric + handle missing
    female = pd.to_numeric(row[female_cols], errors="coerce").fillna(0).to_numpy() / scale
    male   = pd.to_numeric(row[male_cols],   errors="coerce").fillna(0).to_numpy() / scale

    # Males negative for left side
    male = -male

    age_labels = [_age_label(b) for b in AGE_BINS_5Y]

    ax.barh(age_labels, male,   label="Males",   edgecolor="black", linewidth=0.6)
    ax.barh(age_labels, female, label="Females", edgecolor="black", linewidth=0.6)

    # Symmetric x range with 15% headroom. Skipped when every value is zero
    # (e.g. all data missing), because identical limits are not a valid range.
    max_val = max(np.max(female), np.max(-male))
    if max_val > 0:
        ax.set_xlim(-(max_val * 1.15), max_val * 1.15)

    ax.axvline(0, color="black", linewidth=0.8)

    # invert_yaxis() toggles the direction, so calling it unconditionally on
    # panels that share a y axis would flip the axis back on every second
    # call. Only invert when it is not inverted yet.
    if not ax.yaxis_inverted():
        ax.invert_yaxis()

    ax.set_title(f"Population Pyramid — {country_name} ({year})")
    ax.set_xlabel(f"Population ({'millions' if scale==1e6 else 'units'})")
    ax.set_ylabel("Age group")

    # Show x tick labels as positive on both sides
    ax.xaxis.set_major_formatter(FuncFormatter(lambda x, pos: f"{abs(x):.1f}"))

    ax.legend()
    plt.tight_layout()
    return ax

<h4> Population Pyramids Displayed for India, Bangladesh, and Pakistan in 1974 </h4>

In [99]:
# Fetch the full 1960-2024 range once per country; these frames are reused by
# the combined multi-country chart in the following cell.
india_df = population_df(years=(1960, 2024), area="India")
bang_df  = population_df(years=(1960, 2024), area="Bangladesh")
pak_df   = population_df(years=(1960, 2024), area="Pakistan")

# No ax is passed, so each call draws the 1974 pyramid on its own new figure.
plot_population_pyramid_5y(india_df, "India", 1974)
plot_population_pyramid_5y(bang_df, "Bangladesh", 1974)
plot_population_pyramid_5y(pak_df, "Pakistan", 1974)
plt.show()

<h4> Combined Multi-Country Chart - India, Pakistan and Bangladesh in 1980 </h4>

In [100]:
# One row of three panels sharing the y (age-group) axis, one panel per country, all for 1980.
fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=True)
plot_population_pyramid_5y(india_df, "India", 1980, ax=axes[0])
plot_population_pyramid_5y(bang_df,  "Bangladesh", 1980, ax=axes[1])
plot_population_pyramid_5y(pak_df,   "Pakistan", 1980, ax=axes[2])
plt.show()