In [1]:
import pandas as pd
import numpy as np

from pdparallel import parallel_apply

# Make some data where the values are normally distributed around the group ID

In [2]:
num_row = 1000
num_group = 5
sigma = 0.1
seed = 42

# Seed the global NumPy generator so a fresh Restart & Run All reproduces
# exactly the same frame (and therefore the same group means below).
np.random.seed(seed)

# Each row is assigned to one of the groups 1..num_group uniformly at random.
group_id = np.random.choice(np.arange(1, num_group + 1), size=num_row)

# Values are the group ID plus Gaussian noise.
# NOTE(review): the extra `/10.` makes the effective noise std sigma/10
# (0.01 with the defaults above) -- confirm this is intended.
values = group_id + sigma*np.random.randn(num_row)/10.
df = pd.DataFrame(dict(group_id=group_id, values=values))

In [3]:
# Preview the first five rows to sanity-check the generated columns.
df.head(n=5)

Unnamed: 0,group_id,values
0,2,1.995721
1,3,2.987607
2,4,4.011265
3,2,2.001046
4,1,0.999583


In [4]:
# Per-group mean of `values`; each mean should land close to its group ID.
group_means = df.groupby("group_id").mean()
group_means

Unnamed: 0_level_0,values
group_id,Unnamed: 1_level_1
1,1.000553
2,2.000562
3,2.99863
4,3.999754
5,5.000016


# Apply a function that doubles the values in each group. A groupby isn't strictly necessary for this operation; it is used here only to demonstrate `parallel_apply`.

In [5]:
def double_values(df):
    """Return a copy of ``df`` with its ``values`` column doubled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``values`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``; only the
        ``values`` column differs.

    Raises
    ------
    KeyError
        If ``df`` has no ``values`` column.
    """
    # Build a new frame instead of using `*=` on the input: the caller's
    # frame (or a group slice of it) must not be modified as a side effect.
    return df.assign(values=df["values"] * 2)

# Split the frame by group and hand each piece to `double_values` via parallel_apply.
groups_by_id = df.groupby("group_id", as_index=False)
doubled = parallel_apply(grouped=groups_by_id, func=double_values)

# Verify the doubling: each group mean should now be roughly twice its group ID.
doubled.groupby("group_id").mean()

Unnamed: 0_level_0,values
group_id,Unnamed: 1_level_1
1,2.001106
2,4.001124
3,5.99726
4,7.999508
5,10.000032


# Now apply a function that takes an additional parameter. To do this, we need a wrapper function that unpacks the argument tuple and forwards it to the real function. Again, a groupby isn't strictly necessary here; it only serves the demonstration.

In [6]:
def multiply_values_by_x(df, x):
    """Return a copy of ``df`` with its ``values`` column multiplied by ``x``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``values`` column.
    x : scalar
        Factor applied to every entry of ``values``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``; only the
        ``values`` column differs.

    Raises
    ------
    KeyError
        If ``df`` has no ``values`` column.
    """
    # Build a new frame instead of using `*=` on the input: the caller's
    # frame (or a group slice of it) must not be modified as a side effect.
    return df.assign(values=df["values"] * x)

def multiply_values_by_x_wrapper(args):
    """Unpack ``args`` and forward them positionally to ``multiply_values_by_x``.

    ``args`` is an iterable holding the positional arguments of
    ``multiply_values_by_x``, i.e. ``(df, x)``.
    NOTE(review): presumably ``parallel_apply`` delivers each group together
    with ``func_args`` as one such tuple -- confirm against pdparallel.
    """
    result = multiply_values_by_x(*args)
    return result

# Same split-by-group pattern, routed through the wrapper so the extra
# argument (the factor 3) is passed along via `func_args`.
groups_by_id = df.groupby("group_id", as_index=False)
tripled = parallel_apply(
    grouped=groups_by_id,
    func=multiply_values_by_x_wrapper,
    func_args=(3,),
)

# Verify the tripling: each group mean should now be roughly three times its group ID.
tripled.groupby("group_id").mean()

Unnamed: 0_level_0,values
group_id,Unnamed: 1_level_1
1,3.001659
2,6.001686
3,8.99589
4,11.999262
5,15.000048
