# Aggregations and other groupwise operations - Exercises

# Preparations

In [1]:
import pandas as pd

# Allow up to 500 columns in DataFrame displays before pandas elides the middle ones.
pd.options.display.max_columns = 500

# Exercise 1

1. Load the first sheet of the Excel file "wdi_reduced.xlsx" into a pandas DataFrame (see [here](https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html) for help with `pandas.read_excel()`)
2. Run the `describe()` method for the whole DataFrame.
3. Calculate the mean of all numeric columns over the whole DataFrame.
4. Reproduce the result from `describe()` using the `agg()` method and the appropriate built-in aggregation functions (you may skip the percentiles; including those is a **BONUS!**).
5. BONUS: Produce an aggregated DataFrame that counts the values for all columns (including the non-numerical columns) and also includes the mean and median for the numeric columns.
6. BONUS: Define your own function that counts the string values starting with the (capital) letter 'E'. Apply it using the `agg()` method for all string columns.

## 1. Load the first sheet of the Excel file "wdi_reduced.xlsx" into a pandas DataFrame (see [here](https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html) for help with `pandas.read_excel()`)

In [2]:
# Read the sheet named "wdi" from the workbook into a DataFrame, then preview the first rows.
df = pd.read_excel(io="../../data/raw/wdi_reduced.xlsx", sheet_name="wdi")
df.head(n=5)

Unnamed: 0,countrycode,countryname,region,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL
0,ABW,Aruba,Latin America & Caribbean,2007,2623726000.0,-3.654626,101220.0
1,ABW,Aruba,Latin America & Caribbean,2011,2584464000.0,,102053.0
2,ABW,Aruba,Latin America & Caribbean,1992,,,68235.0
3,ABW,Aruba,Latin America & Caribbean,1989,,,61032.0
4,ABW,Aruba,Latin America & Caribbean,1975,,,60657.0


In [3]:
# Print the column names, non-null counts and dtypes of the loaded DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12586 entries, 0 to 12585
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   countrycode        12586 non-null  object 
 1   countryname        12586 non-null  object 
 2   region             12586 non-null  object 
 3   year               12586 non-null  int64  
 4   NY_GDP_MKTP_CD     9217 non-null   float64
 5   NY_GDP_MKTP_KD_ZG  8854 non-null   float64
 6   SP_POP_TOTL        12263 non-null  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 688.4+ KB


## 2. Run the `describe()` method for the whole DataFrame.

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL
count,12586.0,9217.0,8854.0,12263.0
mean,1988.5,158368100000.0,3.882721,23792960.0
std,16.741334,836635800000.0,6.388015,99857930.0
min,1960.0,8824448.0,-64.047107,4279.0
25%,1974.0,1243469000.0,1.416333,449123.5
50%,1988.5,6624068000.0,3.889729,4046901.0
75%,2003.0,44010160000.0,6.416794,12923220.0
max,2017.0,18569100000000.0,149.972963,1378665000.0


## 3. Calculate the mean of all numeric columns over the whole DataFrame.

In [5]:
# Keep only the numeric columns, then average each of them over all rows.
# "number" selects every numeric dtype instead of hard-coding the int/float families.
df.select_dtypes("number").mean()

year                 1.988500e+03
NY_GDP_MKTP_CD       1.583681e+11
NY_GDP_MKTP_KD_ZG    3.882721e+00
SP_POP_TOTL          2.379296e+07
dtype: float64

In [6]:
# Alternative without selecting columns first: numeric_only=True restricts the mean to numeric columns.
df.mean(numeric_only=True)

year                 1.988500e+03
NY_GDP_MKTP_CD       1.583681e+11
NY_GDP_MKTP_KD_ZG    3.882721e+00
SP_POP_TOTL          2.379296e+07
dtype: float64

## 4. Reproduce the result from `describe()` using the `agg()` method and the appropriate built-in aggregation functions (you may skip the percentiles; including those is a **BONUS!**).

In [7]:
# For reference: the describe() output that the agg() calls in the following cells reproduce.
df.describe()

Unnamed: 0,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL
count,12586.0,9217.0,8854.0,12263.0
mean,1988.5,158368100000.0,3.882721,23792960.0
std,16.741334,836635800000.0,6.388015,99857930.0
min,1960.0,8824448.0,-64.047107,4279.0
25%,1974.0,1243469000.0,1.416333,449123.5
50%,1988.5,6624068000.0,3.889729,4046901.0
75%,2003.0,44010160000.0,6.416794,12923220.0
max,2017.0,18569100000000.0,149.972963,1378665000.0


In [8]:
# Now with agg(), not including percentiles.
# "number" selects every numeric column rather than only the int/float dtypes named explicitly.
df.select_dtypes("number").agg(["count", "mean", "std", "min", "max"])

Unnamed: 0,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL
count,12586.0,9217.0,8854.0,12263.0
mean,1988.5,158368100000.0,3.882721,23792960.0
std,16.741334,836635800000.0,6.388015,99857930.0
min,1960.0,8824448.0,-64.047107,4279.0
max,2017.0,18569100000000.0,149.972963,1378665000.0


In [9]:
# including percentiles by using specific functions
def p25(x):
    """Return the 0.25 quantile of ``x``.

    The function name is what agg() uses as the row label, so it must stay ``p25``.
    """
    lower_quartile = x.quantile(q=0.25)
    return lower_quartile


def p75(x):
    """Return the 0.75 quantile of ``x``.

    The function name is what agg() uses as the row label, so it must stay ``p75``.
    """
    upper_quartile = x.quantile(q=0.75)
    return upper_quartile


# Mix built-in aggregation names with the custom quantile functions; each custom
# function shows up in the result under its own name (p25, p75).
# "number" selects every numeric column rather than only the int/float dtypes.
df.select_dtypes("number").agg(
    ["count", "mean", "std", "min", p25, "median", p75, "max"]
)

Unnamed: 0,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL
count,12586.0,9217.0,8854.0,12263.0
mean,1988.5,158368100000.0,3.882721,23792960.0
std,16.741334,836635800000.0,6.388015,99857930.0
min,1960.0,8824448.0,-64.047107,4279.0
p25,1974.0,1243469000.0,1.416333,449123.5
median,1988.5,6624068000.0,3.889729,4046901.0
p75,2003.0,44010160000.0,6.416794,12923220.0
max,2017.0,18569100000000.0,149.972963,1378665000.0


In [10]:
# Finally, with the same row names as describe(): relabel the custom/median rows
# to the percentile labels describe() uses.
# "number" selects every numeric column rather than only the int/float dtypes.
df.select_dtypes("number").agg(
    ["count", "mean", "std", "min", p25, "median", p75, "max"]
).rename(index={"p25": "25%", "median": "50%", "p75": "75%"})

Unnamed: 0,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL
count,12586.0,9217.0,8854.0,12263.0
mean,1988.5,158368100000.0,3.882721,23792960.0
std,16.741334,836635800000.0,6.388015,99857930.0
min,1960.0,8824448.0,-64.047107,4279.0
25%,1974.0,1243469000.0,1.416333,449123.5
50%,1988.5,6624068000.0,3.889729,4046901.0
75%,2003.0,44010160000.0,6.416794,12923220.0
max,2017.0,18569100000000.0,149.972963,1378665000.0


## 5. BONUS: Produce an aggregated DataFrame that counts the values for all columns (including the non-numerical columns) and also includes the mean and median for the numeric columns.

In [11]:
# Fully explicit mapping of column -> aggregation(s):
# text columns are only counted, numeric columns also get mean and median.
df.aggregate(
    func={
        # text columns
        "countrycode": "count",
        "countryname": "count",
        "region": "count",
        # numeric columns
        "year": ["count", "mean", "median"],
        "NY_GDP_MKTP_CD": ["count", "mean", "median"],
        "NY_GDP_MKTP_KD_ZG": ["count", "mean", "median"],
        "SP_POP_TOTL": ["count", "mean", "median"],
    }
)

Unnamed: 0,countrycode,countryname,region,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL
count,12586.0,12586.0,12586.0,12586.0,9217.0,8854.0,12263.0
mean,,,,1988.5,158368100000.0,3.882721,23792960.0
median,,,,1988.5,6624068000.0,3.889729,4046901.0


In [12]:
# a little less code: build both halves of the mapping from lists of column names
df.agg(
    {
        # text columns: count only
        **dict.fromkeys(["countrycode", "countryname", "region"], "count"),
        # numeric columns: count, mean and median
        **dict.fromkeys(
            ["year", "NY_GDP_MKTP_CD", "NY_GDP_MKTP_KD_ZG", "SP_POP_TOTL"],
            ["count", "mean", "median"],
        ),
    }
)

Unnamed: 0,countrycode,countryname,region,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL
count,12586.0,12586.0,12586.0,12586.0,9217.0,8854.0,12263.0
mean,,,,1988.5,158368100000.0,3.882721,23792960.0
median,,,,1988.5,6624068000.0,3.889729,4046901.0


In [13]:
# even less: derive both column groups from the dtypes.
# Splitting on "number" versus everything else covers each column exactly once:
# non-numeric columns are only counted, numeric columns also get mean and median.
df.agg(
    {
        **{v: "count" for v in df.select_dtypes(exclude="number").columns},
        **{v: ["count", "mean", "median"] for v in df.select_dtypes("number").columns},
    }
)

Unnamed: 0,countrycode,countryname,region,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL
count,12586.0,12586.0,12586.0,12586.0,9217.0,8854.0,12263.0
mean,,,,1988.5,158368100000.0,3.882721,23792960.0
median,,,,1988.5,6624068000.0,3.889729,4046901.0


## 6. BONUS: Define your own function that counts the string values starting with the (capital) letter 'E'. Apply it using the `agg()` method for all string columns.

In [14]:
def starts_with_e(x):
    """Count the values in the string Series ``x`` that start with a capital 'E'."""
    begins_with_e = x.str.startswith("E")
    return begins_with_e.sum()


# The text columns are stored with dtype object in this frame (see df.info()),
# so selecting "object" picks exactly the string columns.
df.select_dtypes(include="object").agg(func=starts_with_e)

countrycode     348
countryname     406
region         5510
dtype: int64

# Exercise 2

1. Load the first sheet of the Excel file "wdi_reduced.xlsx" into a pandas DataFrame (see [here](https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html) for help with `pandas.read_excel()`)
2. `describe()` the DataFrame, grouped by *region*.
3. Using the `agg()` method, summarize the data by *region*. We are interested in the number of values for all columns, and the mean and standard deviation of the numeric columns.
4. Calculate a new column containing the ratio of the population (*SP_POP_TOTL*) to the maximum population in the respective *year*.
5. BONUS: Using the `agg()` method, summarize the data by *region*. We are interested in the 5% quantile and the 95% quantile. **Hint**: you could define your own functions to calculate the specific quantiles.
6. BONUS: Calculate the number of NaN values in the columns *NY_GDP_MKTP_CD*, *NY_GDP_MKTP_KD_ZG*, and *SP_POP_TOTL*, grouped by *region*.

## 1. Load the first sheet of the Excel file "wdi_reduced.xlsx" into a pandas DataFrame (see [here](https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html) for help with `pandas.read_excel()`)

In [15]:
# Reload the sheet named "wdi" so this exercise starts from the unmodified data, then preview it.
df = pd.read_excel(io="../../data/raw/wdi_reduced.xlsx", sheet_name="wdi")
df.head(n=5)

Unnamed: 0,countrycode,countryname,region,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL
0,ABW,Aruba,Latin America & Caribbean,2007,2623726000.0,-3.654626,101220.0
1,ABW,Aruba,Latin America & Caribbean,2011,2584464000.0,,102053.0
2,ABW,Aruba,Latin America & Caribbean,1992,,,68235.0
3,ABW,Aruba,Latin America & Caribbean,1989,,,61032.0
4,ABW,Aruba,Latin America & Caribbean,1975,,,60657.0


## 2. `describe()` the DataFrame, grouped by *region*.

In [16]:
# describe() per region: one row per region, with the summary statistics of each
# numeric column side by side under a two-level column header.
df.groupby("region").describe()

Unnamed: 0_level_0,year,year,year,year,year,year,year,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_CD,NY_GDP_MKTP_CD,NY_GDP_MKTP_CD,NY_GDP_MKTP_CD,NY_GDP_MKTP_CD,NY_GDP_MKTP_CD,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,NY_GDP_MKTP_KD_ZG,NY_GDP_MKTP_KD_ZG,NY_GDP_MKTP_KD_ZG,NY_GDP_MKTP_KD_ZG,NY_GDP_MKTP_KD_ZG,NY_GDP_MKTP_KD_ZG,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL,SP_POP_TOTL,SP_POP_TOTL,SP_POP_TOTL,SP_POP_TOTL,SP_POP_TOTL,SP_POP_TOTL,SP_POP_TOTL
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2
East Asia & Pacific,2146.0,1988.5,16.744571,1960.0,1974.0,1988.5,2003.0,2017.0,1448.0,238391400000.0,960716400000.0,8824448.0,484424100.0,4383461000.0,72439780000.0,11199150000000.0,1390.0,4.530997,5.659244,-27.27,1.654895,4.639003,7.483447,45.302754,2109.0,46172180.0,178894600.0,4433.0,101413.0,859950.0,19153000.0,1378665000.0
Europe & Central Asia,3364.0,1988.5,16.743158,1960.0,1974.0,1988.5,2003.0,2017.0,2087.0,229906700000.0,524422400000.0,69520030.0,6332447000.0,34160360000.0,185259600000.0,3879277000000.0,2066.0,3.0872,5.471268,-44.9,1.33806,3.284224,5.539815,88.957665,3276.0,14089750.0,24665780.0,13411.0,1552165.75,4622745.5,10357660.0,148689000.0
Latin America & Caribbean,2436.0,1988.5,16.744106,1960.0,1974.0,1988.5,2003.0,2017.0,1858.0,53705640000.0,199902200000.0,12366560.0,828036000.0,4624677000.0,22074200000.0,2616202000000.0,1768.0,3.418523,4.612357,-26.478789,1.133493,3.798802,6.050425,26.139296,2356.0,10364090.0,26456120.0,4279.0,103990.75,2159348.5,7890723.0,207652900.0
Middle East & North Africa,1218.0,1988.5,16.747545,1960.0,1974.0,1988.5,2003.0,2017.0,926.0,55929020000.0,98455180000.0,63287590.0,5169734000.0,18090440000.0,56722680000.0,756350300000.0,856.0,5.054348,9.140281,-64.047107,2.114933,4.479855,7.420791,81.887797,1164.0,12140540.0,17258380.0,47384.0,1331917.25,4837217.0,16471090.0,95688680.0
North America,174.0,1988.5,16.788983,1960.0,1974.0,1988.5,2003.0,2017.0,168.0,2507081000000.0,4506773000000.0,84466650.0,3873042000.0,553300000000.0,1828874000000.0,18569100000000.0,165.0,3.04318,2.940144,-5.285412,1.770819,3.141219,4.493475,14.363636,171.0,92359270.0,115004500.0,44400.0,63118.5,26895000.0,212881500.0,323127500.0
South Asia,464.0,1988.5,16.758738,1960.0,1974.0,1988.5,2003.0,2017.0,397.0,93421400000.0,291343300000.0,42463580.0,1801345000.0,8090089000.0,51270570000.0,2263523000000.0,344.0,5.130971,3.759699,-13.973729,3.483716,5.034457,6.687857,28.696265,456.0,139487600.0,290197900.0,89887.0,6946704.5,19155500.0,101100200.0,1324171000.0
Sub-Saharan Africa,2784.0,1988.5,16.743676,1960.0,1974.0,1988.5,2003.0,2017.0,2333.0,10638590000.0,38682700000.0,9122751.0,693573700.0,2160641000.0,7034220000.0,568498900000.0,2265.0,4.001642,7.73745,-51.030865,1.149616,3.962519,6.64237,149.972963,2731.0,11090730.0,18748830.0,41700.0,1494202.0,5125821.0,12112300.0,185989600.0


## 3. Using the `agg()` method, summarize the data by *region*. We are interested in the number of values for all columns, and the mean and standard deviation of the numeric columns.

In [17]:
# Per region: count the values of the text columns, and compute mean and
# standard deviation of the numeric columns.
df.groupby("region").agg(
    {
        **dict.fromkeys(["countrycode", "countryname", "region"], "count"),
        **dict.fromkeys(
            ["year", "NY_GDP_MKTP_CD", "NY_GDP_MKTP_KD_ZG", "SP_POP_TOTL"],
            ["mean", "std"],
        ),
    }
)

Unnamed: 0_level_0,countrycode,countryname,region,year,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL,SP_POP_TOTL
Unnamed: 0_level_1,count,count,count,mean,std,mean,std,mean,std,mean,std
region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
East Asia & Pacific,2146,2146,2146,1988.5,16.744571,238391400000.0,960716400000.0,4.530997,5.659244,46172180.0,178894600.0
Europe & Central Asia,3364,3364,3364,1988.5,16.743158,229906700000.0,524422400000.0,3.0872,5.471268,14089750.0,24665780.0
Latin America & Caribbean,2436,2436,2436,1988.5,16.744106,53705640000.0,199902200000.0,3.418523,4.612357,10364090.0,26456120.0
Middle East & North Africa,1218,1218,1218,1988.5,16.747545,55929020000.0,98455180000.0,5.054348,9.140281,12140540.0,17258380.0
North America,174,174,174,1988.5,16.788983,2507081000000.0,4506773000000.0,3.04318,2.940144,92359270.0,115004500.0
South Asia,464,464,464,1988.5,16.758738,93421400000.0,291343300000.0,5.130971,3.759699,139487600.0,290197900.0
Sub-Saharan Africa,2784,2784,2784,1988.5,16.743676,10638590000.0,38682700000.0,4.001642,7.73745,11090730.0,18748830.0


## 4. Calculate a new column containing the ratio of the population (*SP_POP_TOTL*) to the maximum population in the respective *year*.

In [18]:
# transform("max") returns the yearly maximum aligned to every row of df,
# so the division yields each row's population relative to that year's largest value.
df["pop_ratio"] = df["SP_POP_TOTL"].div(df.groupby("year")["SP_POP_TOTL"].transform("max"))
# Spot check: the five most populous entries of 2010; the top one has ratio 1.0.
df.loc[df["year"].eq(2010)].sort_values(by="SP_POP_TOTL", ascending=False).head(n=5)

Unnamed: 0,countrycode,countryname,region,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL,pop_ratio
2088,CHN,China,East Asia & Pacific,2010,6100620000000.0,10.63614,1337705000.0,1.0
5200,IND,India,South Asia,2010,1656617000000.0,10.259963,1230981000.0,0.920218
11831,USA,United States,North America,2010,14964370000000.0,2.531921,309348200.0,0.231253
5068,IDN,Indonesia,East Asia & Pacific,2010,755094200000.0,6.223854,242524100.0,0.181299
1527,BRA,Brazil,Latin America & Caribbean,2010,2208872000000.0,7.528224,196796300.0,0.147115


## 5. BONUS: Using the `agg()` method, summarize the data by *region*. We are interested in the 5% quantile and the 95% quantile. **Hint**: you could define your own functions to calculate the specific quantiles.

In [19]:
# including percentiles by using specific functions
def p5(x):
    """Return the 0.05 quantile of ``x``.

    The function name is what agg() uses as the column label, so it must stay ``p5``.
    """
    fifth_percentile = x.quantile(q=0.05)
    return fifth_percentile


def p95(x):
    """Return the 0.95 quantile of ``x``.

    The function name is what agg() uses as the column label, so it must stay ``p95``.
    """
    ninety_fifth_percentile = x.quantile(q=0.95)
    return ninety_fifth_percentile


# 5% and 95% quantile of every numeric column, per region.
# "number" selects every numeric column rather than only the int/float dtypes;
# this includes pop_ratio, which an earlier cell added to df.
df.groupby("region").agg(
    {v: [p5, p95] for v in df.select_dtypes("number").columns}
)

Unnamed: 0_level_0,year,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL,SP_POP_TOTL,pop_ratio,pop_ratio
Unnamed: 0_level_1,p5,p95,p5,p95,p5,p95,p5,p95,p5,p95
region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
East Asia & Pacific,1962.0,2015.0,56111160.0,1023007000000.0,-3.985153,12.711933,10317.2,127768200.0,9e-06,0.140443
Europe & Central Asia,1962.0,2015.0,1017975000.0,1318921000000.0,-5.183398,9.799515,30331.75,60522680.0,2.7e-05,0.059931
Latin America & Caribbean,1962.0,2015.0,111898000.0,257528800000.0,-4.296577,10.25731,17069.75,44436690.0,1.5e-05,0.034751
Middle East & North Africa,1962.0,2015.0,723693300.0,234425000000.0,-6.213669,17.862145,224928.8,52847450.0,0.000262,0.048347
North America,1962.65,2014.35,158531900.0,14457020000000.0,-2.434062,7.086264,53000.0,302662600.0,4.9e-05,0.256875
South Asia,1962.15,2014.85,248930300.0,392188900000.0,-0.410089,10.681569,174757.0,928594400.0,0.000171,0.78574
Sub-Saharan Africa,1962.0,2015.0,117195700.0,35822370000.0,-5.385537,13.601857,246877.5,43615330.0,0.000292,0.035629


## 6. BONUS: Calculate the number of NaN values in the columns *NY_GDP_MKTP_CD*, *NY_GDP_MKTP_KD_ZG*, and *SP_POP_TOTL*, grouped by *region*.

In [20]:
# count the missing values with a custom aggregation function
def countna(x):
    """Return how many values in ``x`` are missing (NaN)."""
    is_missing = x.isna()
    return is_missing.sum()


# Restrict the grouped frame to the three indicator columns and apply the
# NaN counter to each of them within every region.
df.groupby("region")[["NY_GDP_MKTP_CD", "NY_GDP_MKTP_KD_ZG", "SP_POP_TOTL"]].agg(countna)

Unnamed: 0_level_0,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
East Asia & Pacific,698,756,37
Europe & Central Asia,1277,1298,88
Latin America & Caribbean,578,668,80
Middle East & North Africa,292,362,54
North America,6,9,3
South Asia,67,120,8
Sub-Saharan Africa,451,519,53
