---
title: Creating custom aggregate functions in Dask
---

# 1. Setup

In [2]:
import dask.dataframe as dd
import pandas as pd

In [73]:
# Toy data set: one 'Fruit' column, indexed by person name.
# The index deliberately repeats ('Hans' and 'Gerhard' appear twice) so that
# grouping by the index yields groups with more than one row.
df = pd.DataFrame(
    data={'Fruit': ['Apple', 'Orange', 'Pear', 'Pear', 'Banana']},
    index=['Franz', 'Hans', 'Hans', 'Gerhard', 'Gerhard'],
)

In [74]:
# Show the full pandas frame (only five rows) as the cell's output.
df

Unnamed: 0,Fruit
Franz,Apple
Hans,Orange
Hans,Pear
Gerhard,Pear
Gerhard,Banana


# 2. Nunique

In [75]:
# pandas baseline: number of distinct fruits per person (rows are grouped by
# their index label). to_frame() turns the resulting Series into a one-column
# DataFrame for display.
df.groupby(df.index)['Fruit'].nunique().to_frame()

Unnamed: 0,Fruit
Franz,1
Gerhard,2
Hans,2


In [76]:
# Wrap the pandas frame in a Dask DataFrame. Three partitions are requested;
# the repr recorded in the next cell reports npartitions=2, so the number
# asked for here is evidently not guaranteed.
ddf = dd.from_pandas(
    df,
    npartitions=3,
)

In [77]:
# Show the Dask DataFrame's repr (structure only: partition count, divisions, dtypes).
ddf

Unnamed: 0_level_0,Fruit
npartitions=2,Unnamed: 1_level_1
Franz,object
Hans,...
Hans,...


In [78]:
# Custom "number of distinct values" aggregation for Dask group-bys.
#
# Summing each partition's `nunique()` is only correct when no group has the
# same value in more than one partition; otherwise that value is counted once
# per partition. To stay correct for any partitioning, the distinct values
# themselves are carried through the reduction and counted only at the end:
#
#   chunk    - per partition: each group's distinct values, as a list
#   agg      - across partitions: concatenate the lists that share a group key
#   finalize - de-duplicate the concatenated list and return its length
nunique = dd.Aggregation(
    name='nunique',
    chunk=lambda s: s.apply(lambda x: list(set(x))),
    # `s0` is a grouped Series of lists; regroup the underlying Series on all
    # of its index levels so that summing concatenates the per-partition lists.
    agg=lambda s0: s0.obj.groupby(
        level=list(range(s0.obj.index.nlevels))
    ).sum(),
    finalize=lambda s1: s1.apply(lambda final: len(set(final))),
)

In [79]:
# Dask counterpart of the pandas cell: group by the index, apply the custom
# `nunique` Aggregation, keep the 'Fruit' column, then compute the result and
# wrap it in a DataFrame for display.
(
    ddf
    .groupby(ddf.index)
    .agg(nunique)['Fruit']
    .compute()
    .to_frame()
)

Unnamed: 0,Fruit
Franz,1
Gerhard,2
Hans,2


# 3. Concatenation

In [125]:
# pandas baseline: for each person (index level 0), join their fruits into a
# single comma-separated string.
df.groupby(level=0)['Fruit'].apply(", ".join)

Franz             Apple
Gerhard    Pear, Banana
Hans       Orange, Pear
Name: Fruit, dtype: object

In [128]:
# Dask counterpart of the pandas join: group by the index and concatenate each
# person's fruits into one comma-separated string.
# NOTE(review): with meta=str the computed Series has no name, whereas the
# pandas result above is named 'Fruit'; a (name, dtype) meta such as
# ('Fruit', 'object') would presumably align the two - confirm before changing.
(
    ddf
    .groupby(ddf.index)['Fruit']
    .apply(lambda fruits: ", ".join(fruits), meta=str)
    .compute()
)

Gerhard    Pear, Banana
Franz             Apple
Hans       Orange, Pear
dtype: object