In [1]:
import pandas as pd
import numpy as np

from pdparallel import parallel_apply

# Make some data where the values are normally distributed around the group ID

In [2]:
num_row = 1000
num_group = 5
sigma = 0.01  # standard deviation of the noise added around each group ID
seed = 0      # fixed so that a fresh kernel regenerates exactly the same data

# Seed NumPy's global RNG; both draws below use it, so their order matters.
np.random.seed(seed)

# One group ID per row, drawn from 1..num_group (inclusive).
group_id = np.random.choice(np.arange(1, num_group + 1), size=num_row)
# Each value is its row's group ID plus Gaussian noise with std `sigma`.
values = group_id + sigma * np.random.randn(num_row)
df = pd.DataFrame(dict(group_id=group_id, values=values))

In [3]:
# Preview the first few rows of the generated frame.
df.head()

Unnamed: 0,group_id,values
0,5,4.998142
1,2,2.011411
2,1,1.004397
3,1,1.011945
4,5,4.982106


In [4]:
# Baseline: mean of "values" within each group_id, before any transformation.
df.groupby("group_id").mean()

Unnamed: 0_level_0,values
group_id,Unnamed: 1_level_1
1,0.999237
2,2.000991
3,3.000515
4,4.001476
5,4.999756


# Apply a function that doubles the values in each group. A groupby isn't strictly necessary for this operation; it is used here only to demonstrate `parallel_apply`.

In [5]:
def double_values(df):
    """Return a new frame equal to ``df`` with its "values" column doubled.

    The input frame is not modified, so the result is the same whether the
    caller passes a copy or the original data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric "values" column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``; only "values"
        differs (multiplied by 2).
    """
    # assign() builds a new frame instead of writing into the caller's data.
    return df.assign(values=df["values"] * 2)

# Hand the grouped frame and double_values to parallel_apply.
doubled = parallel_apply(
    grouped=df.groupby("group_id", as_index=False),
    func=double_values,
)

# Sanity check: every group's mean should be twice its original value.
doubled.groupby("group_id").mean()

Unnamed: 0_level_0,values
group_id,Unnamed: 1_level_1
1,1.998473
2,4.001981
3,6.00103
4,8.002952
5,9.999512


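# Now apply a function that takes an additional parameter. To do this, we pass `parallel_apply` a wrapper function that receives a single packed argument and unpacks it into the real function's parameters. As before, a groupby isn't strictly necessary for this operation; it is used here only to demonstrate `parallel_apply`.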

In [6]:
def multiply_values_by_x(df, x):
    """Return a new frame equal to ``df`` with its "values" column scaled by ``x``.

    The input frame is not modified, so the result is the same whether the
    caller passes a copy or the original data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric "values" column.
    x : number
        Factor that every entry of "values" is multiplied by.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``; only "values"
        differs (multiplied by ``x``).
    """
    # assign() builds a new frame instead of writing into the caller's data.
    return df.assign(values=df["values"] * x)

def multiply_values_by_x_wrapper(args):
    """Single-argument adapter around ``multiply_values_by_x``.

    ``args`` is an iterable that is star-unpacked into the positional
    parameters of ``multiply_values_by_x``; the result of that call is
    returned unchanged.
    """
    result = multiply_values_by_x(*args)
    return result

# Same call as for doubling, but through the wrapper and with the
# multiplier (3) supplied via func_args.
tripled = parallel_apply(
    grouped=df.groupby("group_id", as_index=False),
    func=multiply_values_by_x_wrapper,
    func_args=(3,),
)

# Sanity check: every group's mean should be three times its original value.
tripled.groupby("group_id").mean()

Unnamed: 0_level_0,values
group_id,Unnamed: 1_level_1
1,2.99771
2,6.002972
3,9.001546
4,12.004427
5,14.999268
