In [2]:
import pandas as pd
import numpy as np

from pdparallel import parallel_apply

# Make some data where the values are normally distributed around the group ID

In [45]:
num_row = 1000
num_group = 5
sigma = 0.1
seed = 0

# Seeded generator so that a fresh "Restart Kernel -> Run All" rebuilds
# exactly the same synthetic data set.
rng = np.random.RandomState(seed)

# One group ID per row, drawn from the integers 1..num_group.
group_id = rng.choice(np.arange(1, num_group + 1), size=num_row)
# Gaussian noise centred on the group ID. Note the extra "/10.": the
# effective standard deviation is sigma / 10 (0.01 here), not sigma itself.
values = group_id + sigma*rng.randn(num_row)/10.
df = pd.DataFrame(dict(group_id=group_id, values=values))

In [46]:
# Preview the first five rows of the generated frame.
df.head(n=5)

Unnamed: 0,group_id,values
0,3,2.985156
1,2,2.003308
2,4,3.987695
3,3,2.990564
4,5,4.992979


In [47]:
# Mean of "values" per group_id; the values were generated as group ID plus
# small noise, so each mean should sit close to its group ID.
df.groupby("group_id").agg("mean")

Unnamed: 0_level_0,values
group_id,Unnamed: 1_level_1
1,1.001211
2,1.999817
3,3.000186
4,3.9999
5,5.000182


# Apply a function that doubles the values in each group. Doubling is element-wise, so a groupby is not strictly needed here; it is used only to demonstrate `parallel_apply`.

In [50]:
def double_values(df):
    """Return a copy of ``df`` whose "values" column is multiplied by 2.

    The frame passed in is left untouched. Group-wise user functions should
    not mutate the object they receive, and working on a copy also keeps the
    function idempotent when a cell is re-run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric "values" column (here: one ``group_id``
        group handed over by the groupby machinery).

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows and columns as ``df`` and a doubled
        "values" column.
    """
    doubled_df = df.copy()
    doubled_df["values"] = doubled_df["values"] * 2
    return doubled_df

# Hand every group_id group to double_values through parallel_apply.
groups_by_id = df.groupby("group_id", as_index=False)
doubled = parallel_apply(grouped=groups_by_id, func=double_values)

# Sanity check: the per-group means should now be about twice the group ID.
doubled.groupby("group_id").mean()

Unnamed: 0_level_0,values
group_id,Unnamed: 1_level_1
1,2.002422
2,3.999634
3,6.000371
4,7.999801
5,10.000365


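# Apply a function that takes an additional parameter. The extra argument is supplied through `func_args` and unpacked by a small wrapper function. Again, a groupby is not strictly needed for this operation; it is used only to demonstrate `parallel_apply`.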

In [51]:
def multiply_values_by_x(df, x):
    """Return a copy of ``df`` whose "values" column is multiplied by ``x``.

    The frame passed in is left untouched. Group-wise user functions should
    not mutate the object they receive, and working on a copy also keeps the
    function idempotent when a cell is re-run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric "values" column.
    x : scalar
        Factor applied to every entry of the "values" column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows and columns as ``df`` and a scaled
        "values" column.
    """
    scaled_df = df.copy()
    scaled_df["values"] = scaled_df["values"] * x
    return scaled_df

def multiply_values_by_x_wrapper(args):
    """Adapter that calls ``multiply_values_by_x`` with an unpacked argument bundle.

    Parameters
    ----------
    args : iterable
        Positional arguments for ``multiply_values_by_x``, i.e. ``(df, x)``.
        NOTE(review): presumably ``parallel_apply`` builds this bundle from
        each group plus ``func_args`` -- confirm against pdparallel.

    Returns
    -------
    Whatever ``multiply_values_by_x`` returns for those arguments.
    """
    positional = tuple(args)
    return multiply_values_by_x(*positional)

# Hand every group_id group to the wrapper through parallel_apply, passing
# the factor 3 as the extra function argument.
groups_by_id = df.groupby("group_id", as_index=False)
tripled = parallel_apply(
    grouped=groups_by_id,
    func=multiply_values_by_x_wrapper,
    func_args=(3,),
)

# Sanity check: the per-group means should now be about three times the group ID.
tripled.groupby("group_id").mean()

Unnamed: 0_level_0,values
group_id,Unnamed: 1_level_1
1,3.003633
2,5.999451
3,9.000557
4,11.999701
5,15.000547
