In [2]:
#%pip install wbdata
import wbdata
import numpy as np
import pandas as pd

# 1. Population DataFrames [A]

In [3]:

def population_df(years, age_range, area) -> pd.DataFrame:
    """
    Fetch single-year-of-age population counts from the World Bank for one area.

    years (tuple): (start_year, end_year), inclusive; use (year, year) for a single year
    age_range (tuple): (low_age, high_age), inclusive
    area (str): country or aggregate name, e.g. "India" or "World"

    Returns a DataFrame with one `male_XX` and one `female_XX` column per age in
    age_range (XX is the zero-padded age), restricted to the requested years.

    Raises IndexError if no World Bank country/aggregate matches `area`.
    """
    # The query is matched as a pattern against names, so a search such as
    # "World" can also hit e.g. "Arab World". Prefer an exact (case-insensitive)
    # name match and only fall back to the first hit when there is none.
    matches = list(wbdata.get_countries(query=area))
    if not matches:
        raise IndexError(f"No World Bank country or region matches {area!r}")
    wanted = str(area).strip().casefold()
    exact = [m for m in matches if str(m.get("name", "")).strip().casefold() == wanted]
    country_id = (exact or matches)[0]["id"]

    # One male and one female indicator per age, kept interleaved
    # (male_00, female_00, male_01, ...) so the column order is stable.
    low, high = age_range
    indicators = {
        f"SP.POP.AG{age:02d}.{code}.IN": f"{label}_{age:02d}"
        for age in range(low, high + 1)
        for code, label in (("MA", "male"), ("FE", "female"))
    }

    df = wbdata.get_dataframe(indicators, country=country_id, parse_dates=True)

    # parse_dates=True is requested so the year can be read off the index.
    start, end = years
    in_requested_years = (df.index.year >= start) & (df.index.year <= end)
    return df[in_requested_years]

### Population DataFrame for India, 2000–2010: males and females aged 1–3

In [4]:
# Example query: India, years 2000 through 2010, ages 1 through 3.
india_df = population_df(years=(2000, 2010), age_range=(1, 3), area="India")
india_df

Unnamed: 0_level_0,male_01,female_01,male_02,female_02,male_03,female_03
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-01,13495545.0,12298938.0,13415841.5,12215921.0,13456025.5,12244401.0
2009-01-01,13472670.5,12282255.0,13484389.5,12278052.5,13601170.5,12368019.5
2008-01-01,13548239.5,12352226.5,13634321.0,12407041.5,13782866.0,12523308.0
2007-01-01,13705075.0,12488959.0,13820419.0,12567362.0,13918835.0,12647810.0
2006-01-01,13894405.5,12653374.0,13957326.5,12693488.0,14008123.0,12742689.0
2005-01-01,14036289.5,12785347.5,14049018.5,12791509.5,14090686.0,12813192.0
2004-01-01,14133068.5,12889388.5,14134137.5,12865285.5,14053390.5,12752942.5
2003-01-01,14222743.0,12968494.5,14098552.5,12807316.5,13882771.5,12589598.0
2002-01-01,14193578.5,12917390.5,13931333.0,12647920.0,13743147.5,12466298.5
2001-01-01,14029296.5,12761370.5,13792863.5,12526431.5,13663218.5,12399016.5


# 2. Population Statistics [A]

In [5]:
def population(years, sex, age_range, area):
    """
    years (tuple): (start_year, end_year), use (year, year) if just want singular year
    sex (str): "Male", "Female", or "All"
    age_range (tuple): (low_age, high_age)
    area (str): e.g. "India", "World"

    This function answers the question:
    In [year] how many [people/males/females] aged [low] to [high] were living in [the world/region/country]?

    The output is a Series indexed by year and named 'male_total',
    'female_total' or 'population_total' depending on `sex`.
    If `sex` is not one of the three accepted values, a message is printed and
    None is returned.
    """
    column_by_sex = {
        'Female': 'female_total',
        'Male': 'male_total',
        'All': 'population_total',
    }
    # Validate before fetching so a typo does not cost a data download.
    if not isinstance(sex, str) or sex not in column_by_sex:
        print("Invalid Input. Please input 'Female', 'Male', or 'All'.")
        return None

    df = population_df(years, age_range, area)

    # Select by prefix, not substring: 'female_XX' contains 'male_', so a
    # substring filter on 'male_' would add every female column to the male
    # total (and therefore count females twice in the overall total).
    male_cols = [col for col in df.columns if col.startswith('male_')]
    female_cols = [col for col in df.columns if col.startswith('female_')]

    # only the aggregated df
    agg = pd.DataFrame({
        'male_total': df[male_cols].sum(axis=1),
        'female_total': df[female_cols].sum(axis=1),
    })
    agg['population_total'] = agg['female_total'] + agg['male_total']

    # Report plain calendar years instead of timestamps.
    agg.index = agg.index.year
    agg.index.name = "year"

    return agg[column_by_sex[sex]]
        
        



In [6]:
# Example query: females aged 0 through 3 living in India, 2000 through 2010.
india_pop = population(years=(2000, 2010), sex='Female', age_range=(0, 3), area='India')
india_pop

year
2010    49203332.5
2009    49422844.5
2008    49780946.0
2007    50294608.0
2006    50836337.5
2005    51324745.5
2004    51598504.0
2003    51584271.0
2002    51356932.0
2001    50978052.0
2000    50590984.5
Name: female_total, dtype: float64