In [None]:
# load the following national variables from the World Bank database using -wbgapi-
#    BAR.SCHL.2024 - average years of education completed among people age 20-24
#    BAR.SCHL.2024.FE - average years of education completed among females age 20-24
#    SP.POP.TOTL - total population
#    for years 1960, 1980, 2000, 2010
#    hint: use -wbgapi- options -skipAggs=True- and -columns='series'-
# calculate female education as a percent of male education, age 20-24
#    note that since females and males are ~50% of the population,
#       BAR.SCHL.2024 = 1/2*BAR.SCHL.2024.FE + 1/2*BAR.SCHL.2024.MA
#    since male education ≡ BAR.SCHL.2024.MA isn't in the database, 
#       BAR.SCHL.2024.MA = 2*BAR.SCHL.2024 - BAR.SCHL.2024.FE
# create a -pivot_table- of average female % of male education for 
#    different income level countries in rows, and
#    4 different time periods in columns
# save as an HTML file

In [23]:
# load the pandas and numpy packages to work with data
import pandas as pd
import numpy as np
# load the "wbgapi" package to download WDI data series
import wbgapi as wb

In [30]:
# Warm-up query adapted from the -wbgapi- examples at https://pypi.org/project/wbgapi/
# Pull the WDI series for national CO2 emissions per person and total population
# for 2020 into a dataframe.
co2_example_series = ['EN.ATM.CO2E.PC', 'SP.POP.TOTL']
df = wb.data.DataFrame(
    co2_example_series,
    time=2020,
    skipAggs=True,  # skip regional aggregates (i.e. only country observations)
    db=2,
)
# columns='series' would put multiple years in rows rather than in separate columns
df

Unnamed: 0_level_0,EN.ATM.CO2E.PC,SP.POP.TOTL
economy,Unnamed: 1_level_1,Unnamed: 2_level_1
ABW,,106585.0
AFG,0.223479,38972230.0
AGO,0.592743,33428486.0
ALB,1.544550,2837849.0
AND,5.777148,77700.0
...,...,...
XKX,,1790133.0
YEM,0.308515,32284046.0
ZAF,6.687563,58801927.0
ZMB,0.401903,18927715.0


In [152]:
# Adapted from the -wbgapi- examples at https://pypi.org/project/wbgapi/
# Download average years of schooling at age 20-24 (total and female) together
# with total population, for the four benchmark years, into a dataframe.
education_series = ['BAR.SCHL.2024', 'BAR.SCHL.2024.FE', 'SP.POP.TOTL']
benchmark_years = [1960, 1980, 2000, 2010]
df = wb.data.DataFrame(
    education_series,
    time=benchmark_years,
    skipAggs=True,  # skip regional aggregates (i.e. only country observations)
    db=12,
)
# columns='series' is not passed here, so each year is its own column (YR1960, ...)
# and the series codes form the second index level; passing columns='series'
# would put multiple years in rows rather than in separate columns.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,YR1960,YR1980,YR2000,YR2010
economy,series,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ABW,BAR.SCHL.2024,,,,
ABW,BAR.SCHL.2024.FE,,,,
ABW,SP.POP.TOTL,,60096.00,90853.00,101669.00
AFG,BAR.SCHL.2024,0.55,2.22,3.66,4.58
AFG,BAR.SCHL.2024.FE,0.09,0.73,1.80,2.33
...,...,...,...,...,...
ZMB,BAR.SCHL.2024.FE,2.70,5.28,7.11,7.05
ZMB,SP.POP.TOTL,,5851825.00,10415944.00,13605984.00
ZWE,BAR.SCHL.2024,3.29,6.34,9.03,8.71
ZWE,BAR.SCHL.2024.FE,2.48,5.11,8.85,8.85


In [154]:
# Inspect the row index to see how the economy and series levels are nested
display(df.index)

MultiIndex([('ABW',    'BAR.SCHL.2024'),
            ('ABW', 'BAR.SCHL.2024.FE'),
            ('ABW',      'SP.POP.TOTL'),
            ('AFG',    'BAR.SCHL.2024'),
            ('AFG', 'BAR.SCHL.2024.FE'),
            ('AFG',      'SP.POP.TOTL'),
            ('AGO',    'BAR.SCHL.2024'),
            ('AGO', 'BAR.SCHL.2024.FE'),
            ('AGO',      'SP.POP.TOTL'),
            ('AIA',    'BAR.SCHL.2024'),
            ...
            ('YEM',      'SP.POP.TOTL'),
            ('ZAF',    'BAR.SCHL.2024'),
            ('ZAF', 'BAR.SCHL.2024.FE'),
            ('ZAF',      'SP.POP.TOTL'),
            ('ZMB',    'BAR.SCHL.2024'),
            ('ZMB', 'BAR.SCHL.2024.FE'),
            ('ZMB',      'SP.POP.TOTL'),
            ('ZWE',    'BAR.SCHL.2024'),
            ('ZWE', 'BAR.SCHL.2024.FE'),
            ('ZWE',      'SP.POP.TOTL')],
           names=['economy', 'series'], length=678)

This shows us that the dataframe has a MultiIndex (a nested index).
```
        level=0    level=1 
                   (aka level=-1) last nested index
names=['economy', 'series'], length=678)
```

In [149]:

# Reshape to one row per (economy, year) with one column per series.
# Step 1: move the series codes out of the row index and into the columns.
series_in_columns = df.unstack(level="series")
# Step 2: move the year labels (outermost column level) down into the row index.
years_in_rows = series_in_columns.stack(level=0, future_stack=True)
# Step 3: flatten the index; the unnamed year level comes out as 'level_1'.
df2 = years_in_rows.reset_index().rename_axis(None, axis="columns")
# Step 4: swap the 'YRnnnn' label column for a 4-digit 'year' column (kept as text).
year_labels = df2.pop('level_1')
df2['year'] = year_labels.str.extract(r'YR(\d{4})')
df2


Unnamed: 0,economy,BAR.SCHL.2024,BAR.SCHL.2024.FE,SP.POP.TOTL,year
0,ABW,,,,1960
1,ABW,,,60096.0,1980
2,ABW,,,90853.0,2000
3,ABW,,,101669.0,2010
4,AFG,0.55,0.09,,1960
...,...,...,...,...,...
899,ZMB,7.35,7.05,13605984.0,2010
900,ZWE,3.29,2.48,,1960
901,ZWE,6.34,5.11,7408624.0,1980
902,ZWE,9.03,8.85,11881477.0,2000


In [5]:
import wbdata

In [28]:
# Fetch the same two schooling series through the -wbdata- package and preview
# the first rows returned for each benchmark year.
# `df` is intentionally left untouched in this cell so that the wbgapi dataframe
# read by the index-inspection and reshaping cells is not overwritten.
schooling_indicators = {
    "BAR.SCHL.2024": "edu_avg_20_24",
    "BAR.SCHL.2024.FE": "fe_edu_avg_20_24",
}
for year in [1960, 1980, 2000, 2010]:
    # wbdata is given the year as a string
    display(wbdata.get_dataframe(schooling_indicators, date=str(year)).head())

Unnamed: 0_level_0,edu_avg_20_24,fe_edu_avg_20_24
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Anguilla,,
Cook Islands,,
Global Partnership for Education,,
Lending category not classified,,
Mayotte,,


Unnamed: 0_level_0,edu_avg_20_24,fe_edu_avg_20_24
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Anguilla,,
Cook Islands,,
Global Partnership for Education,,
Lending category not classified,,
Mayotte,,


Unnamed: 0_level_0,edu_avg_20_24,fe_edu_avg_20_24
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Anguilla,,
Cook Islands,,
Global Partnership for Education,,
Lending category not classified,,
Mayotte,,


Unnamed: 0_level_0,edu_avg_20_24,fe_edu_avg_20_24
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Anguilla,,
Cook Islands,,
Global Partnership for Education,,
Lending category not classified,,
Mayotte,,
