# Grouping


In [None]:
import pandas as pd

In [None]:
# Load the drinks dataset from the local CSV and preview the first rows.
# NOTE(review): pandas' default NA markers include the literal string "NA". If this
# file encodes North America as "NA" (the "AF" key used further down suggests
# two-letter continent codes), those rows are read in as missing values.
# TODO confirm against the raw CSV; `keep_default_na=False` would preserve them.
drinks = pd.read_csv("./data/drinks.csv")
drinks.head()

In [None]:
# Print each column's dtype and non-null count. A non-null count lower than the
# total number of entries means that column has missing values. (This dataset has some.)
drinks.info()

In [None]:
# Build a row mask that is True wherever at least one column is missing,
# then use it to show only those incomplete rows.
filt = drinks.isnull().any(axis="columns")
drinks.loc[filt]

In [None]:
# Replace missing values in the "continent" column with the placeholder "UNKNOWN",
# then re-run .info() to confirm the column no longer contains nulls.
# NOTE(review): if the missing continents are really the code "NA" (North America)
# swallowed by read_csv's default NA parsing, "UNKNOWN" mislabels them. TODO confirm.
drinks["continent"] = drinks["continent"].fillna("UNKNOWN")
drinks.info()

In [None]:
# Split the rows of `drinks` into groups keyed by their "continent" value.
# The result is a lazy GroupBy object; nothing is aggregated yet.
drinks_grouped = drinks.groupby(by="continent")

In [None]:
# Number of groups, i.e. how many distinct "continent" values the rows were split into
drinks_grouped.ngroups

In [None]:
# Number of rows in each group.
display(drinks_grouped.size())

# A related view of group sizes, broken down per column.
# .count() counts only the non-null values of each column within each group,
# whereas .size() counts every row in the group whether or not values are present.
display(drinks_grouped.count())

In [None]:
# Count the distinct "beer_servings" values within each continent.
# Only that one column is selected here; calling .nunique() on the whole
# groupby object would give the distinct count for every column.
display(drinks_grouped[["beer_servings"]].nunique())

In [None]:
# First entry of each column within each group.
# NOTE: .first() skips nulls column by column, so if a group's leading rows contain
# missing values the result can combine values taken from different rows.
display(drinks_grouped.first())

# Last entry of each column within each group (same null-skipping behaviour as .first())
display(drinks_grouped.last())

# Second row of each group (.nth() is zero-based, so 1 means the second row)
display(drinks_grouped.nth(1))

In [None]:
# Mean "beer_servings" per continent.
# The double brackets select a list of columns, so the result is a DataFrame.
drinks_grouped[["beer_servings"]].mean()

In [None]:
# Pull out the rows belonging to one group by its key, e.g. Africa ("AF"),
# and preview the first few of them
drinks_grouped.get_group("AF").head()

In [None]:
# Look up a group without hard-coding its key.
# .groups is a dict keyed by group name, so iterating it yields the keys.
group_list = list(drinks_grouped.groups)

# Preview the rows of whichever group key comes first in that list
drinks_grouped.get_group(group_list[0]).head()

In [None]:
# Walk through the groups one at a time: iterating a GroupBy yields
# (group key, sub-DataFrame) pairs. Print the key and show the group's first row.
for continent, rows in drinks_grouped:
    print(continent)
    display(rows.head(n=1))

In [None]:
# Per-continent mean of two columns at once: "beer_servings" and "wine_servings"
drinks_grouped[["beer_servings", "wine_servings"]].mean()

In [None]:
# Descriptive statistics (via .describe()) of "wine_servings", computed separately for each continent
drinks_grouped[["wine_servings"]].describe()

In [None]:
# Compute several aggregates in one pass for the beer and wine columns.
# Passing a list of aggregation names applies every one of them to each selected column.
drinks_grouped[["beer_servings", "wine_servings"]].aggregate(
    ["mean", "min", "max", "median"]
)

In [None]:
# Apply a different set of aggregations to each column, per continent.
# The dict maps a column name to the list of aggregation names to run on it.
drinks_grouped.aggregate(
    {
        # range of beer consumption
        "beer_servings": ["min", "max"],
        # central tendency of wine and spirit consumption
        "wine_servings": ["mean", "median"],
        "spirit_servings": ["mean", "median"],
        "total_litres_of_pure_alcohol": ["mean", "min"],
        # number of non-null country entries in the group
        "country": ["count"],
    }
)
