# How to make pandas go <font color='green'>fast</font>

In [51]:
pd.set_option('max_rows',12)
import bottleneck as bn
import numexpr as ne
import numpy as np
from numba import jit
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


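# Computation

Each function below evaluates `sum(2*A + B + 1)` over the same DataFrame with a different backend; the results are compared and then each function is timed.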

In [None]:
# Benchmark input: ten million rows of standard-normal draws in two float
# columns named A and B.
df = pd.DataFrame(
    data=np.random.randn(10000000, 2),
    columns=['A', 'B'],
)

In [55]:
def f_vectorize_numpy(df):
    """Return the sum over all rows of ``A*2 + B + 1`` using NumPy arrays.

    Both columns are unwrapped with ``.values`` first, so the arithmetic and
    the final ``.sum()`` operate on plain ndarrays rather than on Series.
    """
    col_a = df['A'].values
    col_b = df['B'].values
    per_row = col_a * 2 + col_b + 1
    return per_row.sum()
def f_vectorize_pandas(df):
    """Return the sum over all rows of ``A*2 + B + 1`` using pandas Series.

    The columns stay wrapped as Series throughout, so both the element-wise
    arithmetic and the reduction go through pandas.
    """
    doubled_a = df['A'] * 2
    per_row = doubled_a + df['B'] + 1
    return per_row.sum()
def f_numexpr(df):
    """Return the sum over all rows of ``A*2 + B + 1`` evaluated by numexpr.

    The whole expression, reduction included, is passed to ``ne.evaluate``
    as a string; the column arrays are bound to the names ``a`` and ``b``
    through ``local_dict``. ``.item()`` unwraps the result to a scalar.
    """
    operands = {'a': df['A'].values, 'b': df['B'].values}
    total = ne.evaluate('sum(a*2 + b + 1)', local_dict=operands)
    return total.item()
def f_bottleneck(df):
    """Return the sum over all rows of ``A*2 + B + 1`` reduced by bottleneck.

    The element-wise arithmetic is done on the NumPy arrays behind the two
    columns; only the final reduction is delegated to ``bn.nansum``.
    """
    col_a = df['A'].values
    col_b = df['B'].values
    return bn.nansum(col_a * 2 + col_b + 1)

In [66]:
%%cython
cdef _cython(double[:] a, double[:] b):
    """Return the sum of ``a[i]*2 + b[i] + 1.0`` over two 1-D double buffers.

    Both arguments are typed memoryviews of doubles and must have the same
    length; an AssertionError is raised otherwise. The accumulated total is
    returned as a Python object.
    """
    cdef:
        double result = 0.0
        # Py_ssize_t is the index type memoryviews use natively; a C ``int``
        # cannot address buffers longer than 2**31 - 1 elements.
        Py_ssize_t i
        # Length is read once instead of calling len() in the loop header.
        Py_ssize_t n = a.shape[0]

    assert n == b.shape[0]
    for i in range(n):
        result += a[i]*2 + b[i] + 1.0
    return result
        
def f_cython(df):
    """Return the sum over all rows of ``A*2 + B + 1`` via the Cython loop.

    Python-visible wrapper: it extracts the NumPy arrays behind columns
    ``A`` and ``B`` and hands them to the cdef helper ``_cython``.
    """
    col_a = df['A'].values
    col_b = df['B'].values
    return _cython(col_a, col_b)

In [69]:
# nopython=True makes compilation to native code explicit: if the loop cannot
# be typed, Numba raises instead of silently falling back to slow object mode.
@jit(nopython=True)
def _numba(a, b):
    """Return the sum of ``a[i]*2 + b[i] + 1.0`` over two equal-length arrays.

    Raises AssertionError when the two arrays differ in length.
    """
    n = len(a)
    assert n == len(b)
    result = 0.0
    for i in range(n):
        result += a[i]*2 + b[i] + 1.0
    return result

def f_numba(df):
    """Return the sum over all rows of ``A*2 + B + 1`` via the Numba loop.

    Extracts the NumPy arrays behind columns ``A`` and ``B`` and passes them
    to the jitted helper ``_numba``.
    """
    col_a = df['A'].values
    col_b = df['B'].values
    return _numba(col_a, col_b)

In [68]:
# Sanity check: run every implementation on the same frame and display the
# variance of the six results; a value near zero means they all agree.
np.var([
    impl(df)
    for impl in (f_vectorize_numpy, f_vectorize_pandas, f_numexpr,
                 f_bottleneck, f_cython, f_numba)
])

1.8225525255655128e-13

In [70]:
# Time the NumPy version (arithmetic and sum on the arrays behind the columns).
%timeit f_vectorize_numpy(df)

10 loops, best of 3: 107 ms per loop


In [71]:
# Time the pandas version (arithmetic and sum performed on Series).
%timeit f_vectorize_pandas(df)

10 loops, best of 3: 85.3 ms per loop


In [72]:
# Time the numexpr version (whole expression passed to ne.evaluate as a string).
%timeit f_numexpr(df)

10 loops, best of 3: 37.7 ms per loop


In [73]:
# Time the bottleneck version (NumPy arithmetic reduced with bn.nansum).
%timeit f_bottleneck(df)

10 loops, best of 3: 110 ms per loop


In [74]:
# Time the Cython version (explicit typed loop over the two column buffers).
%timeit f_cython(df)

10 loops, best of 3: 25.3 ms per loop


In [76]:
# Time the Numba version (Python loop compiled with @jit).
%timeit f_numba(df)

100 loops, best of 3: 15 ms per loop
