# How to make pandas go <font color='green'>fast</font>

In [1]:
import pandas as pd
import numexpr as ne
import numpy as np
from numba import jit
np.random.seed(1234)

%load_ext Cython

## Computation

In [2]:
# Benchmark fixture shared by every cell below: ten million rows of
# standard-normal samples in two columns named 'A' and 'B'.
df = pd.DataFrame(
    np.random.randn(10000000, 2),
    columns=['A', 'B'],
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 2 columns):
A    float64
B    float64
dtypes: float64(2)
memory usage: 152.6 MB


In [3]:
def f_vectorize_numpy(df):
    """Return sum(2*A + B + 1) computed on the raw NumPy arrays of ``df``.

    The columns 'A' and 'B' are unwrapped with ``.values`` first, so the
    arithmetic and the reduction run on plain ndarrays rather than Series.
    """
    col_a = df['A'].values
    col_b = df['B'].values
    combined = col_a*2 + col_b + 1
    return combined.sum()
def f_vectorize_pandas(df):
    """Return sum(2*A + B + 1) using pandas Series arithmetic on ``df``.

    Unlike the NumPy variant, the columns stay wrapped as Series, so the
    element-wise operations and the final ``sum`` go through pandas.
    """
    series_a = df['A']
    series_b = df['B']
    combined = series_a*2 + series_b + 1
    return combined.sum()
def f_numexpr(df):
    """Return sum(2*A + B + 1) evaluated by numexpr as a single expression.

    The raw arrays of columns 'A' and 'B' are bound to the names ``a`` and
    ``b`` used inside the expression string; ``.item()`` converts the
    result of the reduction into a plain Python scalar.
    """
    operands = {
        'a': df['A'].values,
        'b': df['B'].values,
    }
    reduced = ne.evaluate('sum(a*2 + b + 1)', local_dict=operands)
    return reduced.item()

In [4]:
%%cython
cdef _cython(double[:] a, double[:] b):
    # Accumulate sum(2*a[i] + b[i] + 1) over two equal-length double buffers
    # in a typed loop and return the total.
    cdef:
        double result = 0.0
        # Py_ssize_t is the native index type for memoryviews; a plain C
        # `int` would overflow for buffers longer than 2**31 - 1 elements.
        Py_ssize_t i
        Py_ssize_t n = a.shape[0]

    # Both inputs must have the same length, otherwise b[i] would be read
    # out of range.
    assert n == b.shape[0]
    for i in range(n):
        result += a[i]*2 + b[i] + 1.0
    return result
        
def f_cython(df):
    """Return sum(2*A + B + 1) for ``df`` via the typed ``_cython`` loop.

    The raw arrays of columns 'A' and 'B' are handed to ``_cython``, which
    receives them as ``double[:]`` memoryviews.
    """
    col_a = df['A'].values
    col_b = df['B'].values
    return _cython(col_a, col_b)

In [5]:
# nopython=True forces a fully compiled kernel: a bare @jit may silently fall
# back to slow object mode on older Numba releases if typing fails, which
# would invalidate the benchmark without any visible error.
@jit(nopython=True)
def _numba(a, b):
    """Return sum(2*a[i] + b[i] + 1) over two equal-length arrays.

    Written as an explicit element-wise loop so that Numba compiles it to
    machine code; ``a`` and ``b`` are indexed position by position.

    Raises:
        AssertionError: if ``a`` and ``b`` differ in length.
    """
    n = len(a)
    assert n == len(b)
    result = 0.0
    for i in range(n):
        result += a[i]*2 + b[i] + 1.0
    return result

def f_numba(df):
    """Return sum(2*A + B + 1) for ``df`` via the jitted ``_numba`` loop.

    Only the raw arrays of columns 'A' and 'B' are passed to the jitted
    function, not the Series themselves.
    """
    col_a = df['A'].values
    col_b = df['B'].values
    return _numba(col_a, col_b)

In [6]:
# Agreement check: run every implementation on the same frame and take the
# variance of the five totals, which is ~0 when they all agree.
np.var([
    impl(df)
    for impl in (f_vectorize_numpy, f_vectorize_pandas, f_numexpr, f_cython, f_numba)
])

2.5673907444456745e-15

In [7]:
# Time the NumPy-array implementation on the shared frame `df`.
%timeit f_vectorize_numpy(df)

10 loops, best of 3: 115 ms per loop


In [8]:
# Time the pandas Series implementation on the shared frame `df`.
%timeit f_vectorize_pandas(df)

10 loops, best of 3: 91 ms per loop


In [9]:
# Time the numexpr implementation on the shared frame `df`.
%timeit f_numexpr(df)

10 loops, best of 3: 34.9 ms per loop


In [10]:
# Time the Cython typed-loop implementation on the shared frame `df`.
%timeit f_cython(df)

10 loops, best of 3: 21.7 ms per loop


In [11]:
# Time the Numba-jitted loop implementation on the shared frame `df`.
%timeit f_numba(df)

100 loops, best of 3: 15.5 ms per loop
