In [1]:
import numpy as np

In [17]:
import pandas as pd
from psutil import cpu_count
from dask import dataframe as dd
from dask.multiprocessing import get
import timeit
import warnings


def pd_apply(df, myfunc, *args, **kwargs):
    def wrapped():
        if type(df) == pd.DataFrame:
            pd.concat([df[c].apply(myfunc, args=args, **kwargs) for c in df.columns], axis=1)
        else:
            df.apply(myfunc, args=args, **kwargs)
    return wrapped


def dask_apply(df, npartitions, myfunc, *args, **kwargs):
    samp = df.iloc[:npartitions]
    
    if type(df) == pd.DataFrame:
        tmp = kwargs.pop('meta')
        meta = {c: tmp[c].dtype for c in tmp.columns}
        try:
            tmp_df = dd.from_pandas(samp, npartitions=npartitions).apply(myfunc, *args, **kwargs, axis=1, meta=meta).compute(get=get)
            assert tmp_df.shape == samp.shape
            return dd.from_pandas(df, npartitions=npartitions).apply(myfunc, *args, **kwargs, axis=1, meta=meta).compute(get=get)
        except AssertionError:
            warnings.warn('Dask applymap not working correctly. Concatenating swiftapplies instead.')
            return pd.concat([swiftapply(df[c], myfunc, *args, **kwargs) for c in df.columns], axis=1)
    else:
        meta = kwargs.pop('meta')
        try:
            tmp_df = dd.from_pandas(samp, npartitions=npartitions).map_partitions(myfunc, *args, **kwargs, meta=meta).compute(get=get)
            assert tmp_df.shape == samp.shape
            return dd.from_pandas(df, npartitions=npartitions).map_partitions(myfunc, *args, **kwargs, meta=meta).compute(get=get)
        except :
            return dd.from_pandas(df, npartitions=npartitions).map(lambda x: myfunc(x, *args, **kwargs), meta=meta).compute(get=get)
        

def swiftapply(df, myfunc, *args, **kwargs):
    """
    Efficiently apply any function to a pandas dataframe or series
    in the fastest available manner
    :param df: The dataframe or series to apply the function to
    :param myfunc: The function you wish to apply
    :param args: The positional arguments of the function
    :param kwargs: The key word arguments of the function
        You can also specify npartitions and dask_threshold
        npartitions will affect the speed of dask multiprocessing
        dask_threshold is the maximum allowed time (in seconds) for a normal pandas apply
            before switching to a dask operation
    :return: The new dataframe/series with the function applied as quickly as possible
    """
    if 'npartitions' in kwargs.keys():
        npartitions = kwargs.pop('npartitions')
    else:
        npartitions = cpu_count() * 2
    if 'dask_threshold' in kwargs.keys():
        dask_threshold = kwargs.pop('dask_threshold')
    else:
        dask_threshold = 1
    
    if myfunc is not str:
        samp = df.iloc[:1000]
        
        try:  # try to vectorize
            if type(df) == pd.DataFrame:
                tmp_df = pd.concat([pd.Series(myfunc(samp[c], *args, **kwargs), name=c) for c in samp.columns], axis=1)
                assert tmp_df.shape == samp.shape
                return pd.concat([pd.Series(myfunc(df[c], *args, **kwargs), name=c) for c in df.columns], axis=1)
            else:
                tmp_df = myfunc(samp, *args, **kwargs)
                assert tmp_df.shape == samp.shape
                return myfunc(df, *args, **kwargs)
            
        except:  # if can't vectorize, estimate time to pandas apply
            wrapped = pd_apply(samp, myfunc, *args, **kwargs)
            n_repeats = 3
            timed = timeit.timeit(wrapped, number=n_repeats)
            samp_proc_est = timed/n_repeats
            est_apply_duration = samp_proc_est / len(samp) * df.shape[0]

            # Get meta information for dask, and check if output is str 
            if type(df) == pd.DataFrame:
                kwargs['meta'] = pd.concat([df.loc[:2, c].apply(myfunc, args=args, **kwargs) for c in df.columns], axis=1)
                str_object = object in kwargs['meta'].dtypes.values
            else:
                kwargs['meta'] = df.iloc[:2].apply(myfunc, args=args, **kwargs)
                str_object = object == kwargs['meta'].dtypes
                
            # if pandas apply takes too long and output is not str, use dask
            if (est_apply_duration > dask_threshold) and (not str_object):
                return dask_apply(df, npartitions, myfunc, *args, **kwargs)
            else:  # use pandas
                kwargs.pop('meta')
                if type(df) == pd.DataFrame:
                    return pd.concat([df[c].apply(myfunc, args=args, **kwargs) for c in df.columns], axis=1)
                else:
                    return df.apply(myfunc, args=args, **kwargs)
    else:
        return df.astype(str)

In [18]:
words = pd.DataFrame({'lower_a': np.repeat(' a ',10000000), 'lower_b': np.repeat('_b_',10000000),
                     'x1': np.random.normal(size=10000000), 'x2': np.random.exponential(size=10000000)})

In [19]:
def get_len(x):
    return len(x)

def get_strlen(x):
    return x.str.len()

def to_upper(x):
    return x.upper()

def nonvectorized_func(x,  y):
    if x**2+ y**2 > 5:
        return True
    else:
        return False
    
    
def vectorized_func(x, y):
    return np.where(x**2 + y**2 > 5, True, False)

In [23]:
%time swiftapply(words['lower_b'], get_len)

CPU times: user 40.7 s, sys: 2.05 s, total: 42.8 s
Wall time: 43 s


0          3
1          3
2          3
3          3
4          3
5          3
6          3
7          3
8          3
9          3
10         3
11         3
12         3
13         3
14         3
15         3
16         3
17         3
18         3
19         3
20         3
21         3
22         3
23         3
24         3
25         3
26         3
27         3
28         3
29         3
          ..
9999970    3
9999971    3
9999972    3
9999973    3
9999974    3
9999975    3
9999976    3
9999977    3
9999978    3
9999979    3
9999980    3
9999981    3
9999982    3
9999983    3
9999984    3
9999985    3
9999986    3
9999987    3
9999988    3
9999989    3
9999990    3
9999991    3
9999992    3
9999993    3
9999994    3
9999995    3
9999996    3
9999997    3
9999998    3
9999999    3
Name: lower_b, Length: 10000000, dtype: int64

In [20]:
%time swiftapply(words[['lower_a', 'lower_b']], get_len)

Exception ignored in: <Finalize object, dead>
Traceback (most recent call last):
  File "/home/jmcarpenter/anaconda2/envs/fastai/lib/python3.6/multiprocessing/util.py", line 186, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/home/jmcarpenter/anaconda2/envs/fastai/lib/python3.6/multiprocessing/pool.py", line 571, in _terminate_pool
    cls._help_stuff_finish(inqueue, task_handler, len(pool))
  File "/home/jmcarpenter/anaconda2/envs/fastai/lib/python3.6/multiprocessing/pool.py", line 556, in _help_stuff_finish
    inqueue._rlock.acquire()
KeyboardInterrupt: 


AttributeError: 'DataFrame' object has no attribute 'name'