In [2]:
import pandas as pd

In [3]:
# Toy dataset, one tuple per country, in the same order as the column list below.
df = pd.DataFrame(
    [
        ("United States", "North America", 334712903, 3797000, 24.08),
        ("Canada", "North America", 38374721, 3855000, 2.04),
        ("United Kingdom", "Europe", 68566696, 94058, 3.26),
        ("France", "Europe", 65549585, 210016, 3.19),
        ("Germany", "Europe", 84296021, 138065, 4.56),
        ("China", "Asia", 1449947840, 3705000, 15.12),
        ("Japan", "Asia", 125736530, 145937, 6.02),
        ("Korea", "Asia", 51353518, 38691, 1.89),
    ],
    columns=["country", "continent", "population", "area", "share of world GDP"],
)
df

Unnamed: 0,country,continent,population,area,share of world GDP
0,United States,North America,334712903,3797000,24.08
1,Canada,North America,38374721,3855000,2.04
2,United Kingdom,Europe,68566696,94058,3.26
3,France,Europe,65549585,210016,3.19
4,Germany,Europe,84296021,138065,4.56
5,China,Asia,1449947840,3705000,15.12
6,Japan,Asia,125736530,145937,6.02
7,Korea,Asia,51353518,38691,1.89


In [4]:
# Grouping on its own only yields a DataFrameGroupBy handle; no aggregate is computed yet.
df.groupby(by="continent")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x114355ba8>

In [5]:
# Per continent, count the non-null entries of every remaining column.
df.groupby(by="continent").count()

Unnamed: 0_level_0,country,population,area,share of world GDP
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Asia,3,3,3,3
Europe,3,3,3,3
North America,2,2,2,2


In [6]:
# Dict-style aggregation: keys are source columns, values are the function(s) to
# apply. The result carries two-level (column, function) column labels.
df_agg = df.groupby(by="continent").agg(
    {
        "country": "count",
        "population": ["sum", "min", "max"],
    }
)
df_agg

Unnamed: 0_level_0,country,population,population,population
Unnamed: 0_level_1,count,sum,min,max
continent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Asia,3,1627037888,51353518,1449947840
Europe,3,218412302,65549585,84296021
North America,2,373087624,38374721,334712903


In [7]:
# The dict-style aggregation produced a MultiIndex of (source column, function)
# pairs. Leaving it as the cell's last expression lets the notebook display it.
df_agg.columns

MultiIndex([(   'country', 'count'),
            ('population',   'sum'),
            ('population',   'min'),
            ('population',   'max')],
           )


In [8]:
# A single column under the two-level labels is addressed with a
# (source column, function) tuple; the bare expression is displayed by the notebook.
df_agg[("population", "max")]

continent
Asia             1449947840
Europe             84296021
North America     334712903
Name: (population, max), dtype: int64


In [9]:
# Named aggregation: each keyword is the output column name, and its NamedAgg
# says which source column to reduce and how. This yields flat column labels.
df_agg_single = df.groupby(by="continent").agg(
    country_count=pd.NamedAgg(column="country", aggfunc="count"),
    population_sum=pd.NamedAgg(column="population", aggfunc="sum"),
    population_min=pd.NamedAgg(column="population", aggfunc="min"),
    population_max=pd.NamedAgg(column="population", aggfunc="max"),
)
df_agg_single

Unnamed: 0_level_0,country_count,population_sum,population_min,population_max
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Asia,3,1627037888,51353518,1449947840
Europe,3,218412302,65549585,84296021
North America,2,373087624,38374721,334712903


In [12]:
# Named aggregation gives a flat Index of the chosen output names.
# Leaving it as the cell's last expression lets the notebook display it.
df_agg_single.columns

Index(['country_count', 'population_sum', 'population_min', 'population_max'], dtype='object')


In [13]:
# With flat labels a plain string selects the column; the bare expression is
# displayed by the notebook.
df_agg_single["population_max"]

continent
Asia             1449947840
Europe             84296021
North America     334712903
Name: population_max, dtype: int64


In [10]:
def process_continent(continent, min_gdp_share=15):
    """Summarise the population and population density of one group of rows.

    Written to be passed to ``groupby(...).apply``; extra keyword arguments
    given to ``apply`` (such as ``min_gdp_share``) are forwarded here.

    Parameters
    ----------
    continent : pandas.DataFrame
        Rows belonging to a single group. Must provide the
        "share of world GDP", "population" and "area" columns.
    min_gdp_share : float, default 15
        The group is summarised only when its summed "share of world GDP"
        is strictly greater than this value.

    Returns
    -------
    pandas.Series
        float64 Series indexed by "population" and "population density"
        (summed population divided by summed area). Both entries are NaN
        when the group does not exceed ``min_gdp_share``.
    """
    result = {}
    if continent["share of world GDP"].sum() > min_gdp_share:
        total_population = continent["population"].sum()
        result["population"] = total_population
        result["population density"] = total_population / continent["area"].sum()
    # The fixed index keeps the output shape identical for every group; keys
    # missing from `result` are filled with NaN.
    return pd.Series(result, index=["population", "population density"], dtype="float64")

# Select the columns process_continent reads before applying it. This keeps the
# grouping column out of the frames handed to the function, which newer pandas
# releases warn about when apply() runs on the full frame.
df_density = (
    df.groupby("continent")[["population", "area", "share of world GDP"]]
    .apply(process_continent)
)
df_density

Unnamed: 0_level_0,population,population density
continent,Unnamed: 1_level_1,Unnamed: 2_level_1
Asia,1627038000.0,418.30167
Europe,,
North America,373087600.0,48.756877


In [11]:
# Keep only the continents whose row has no missing value.
df_density.dropna(axis=0, how="any")

Unnamed: 0_level_0,population,population density
continent,Unnamed: 1_level_1,Unnamed: 2_level_1
Asia,1627038000.0,418.30167
North America,373087600.0,48.756877
