# Highly Recommended Options

In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the randomly generated test frames are reproducible.
np.random.seed(1234)
# Keep DataFrame reprs short in the notebook output.
pd.set_option('display.max_rows', 10)

Test Data

In [2]:
# Default shape of every benchmark frame.
nrows, ncols = 2000000, 10


def make_df(_, rows=None, cols=None):
    """Return a DataFrame filled with standard-normal random values.

    Parameters
    ----------
    _ : object
        Ignored; kept so the function can be called with a loop index.
    rows, cols : int, optional
        Shape of the frame. Each falls back to the module-level ``nrows`` /
        ``ncols`` when left as ``None``, so existing one-argument calls
        behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        A ``rows`` x ``cols`` frame drawn from NumPy's global RNG.
    """
    # Resolve the defaults at call time, like the original global lookup did.
    rows = nrows if rows is None else rows
    cols = ncols if cols is None else cols
    return pd.DataFrame(np.random.randn(rows, cols))
# Build the four random frames used by every benchmark below; make_df does not
# use the index it is handed.
df1, df2, df3, df4 = map(make_df, range(4))

## numexpr

In [3]:
import numexpr
# Display the installed numexpr version so the timings below can be tied to a
# specific release.
numexpr.__version__

'2.5.2'

In [4]:
def f_python():
    """Evaluate ``(df1 > df2) | (df3 > df4)`` with plain pandas operators.

    Reads the module-level frames ``df1`` .. ``df4`` and returns the
    element-wise boolean result.
    """
    first_pair = df1 > df2
    second_pair = df3 > df4
    return first_pair | second_pair
def f_numexpr():
    """Evaluate the same comparison as ``f_python`` through ``pd.eval``.

    The expression is passed as a string; ``pd.eval`` resolves ``df1`` ..
    ``df4`` from the surrounding scope.
    NOTE(review): presumably this runs on the numexpr engine when numexpr is
    installed -- confirm against the pandas version in use.
    """
    expression = '(df1 > df2) | (df3 > df4)'
    return pd.eval(expression)

In [5]:
# Sanity check before timing: both implementations must return equal frames.
f_python().equals(f_numexpr())

True

In [6]:
# Baseline: time the plain pandas operator version.
%timeit f_python()

1 loop, best of 3: 467 ms per loop


In [7]:
# Time the pd.eval version of the same expression for comparison.
%timeit f_numexpr()

10 loops, best of 3: 60.7 ms per loop


## bottleneck

In [8]:
from pandas.core import nanops

In [9]:
# Switch the nanops bottleneck flag on, then time the default-axis sum.
# NOTE(review): _USE_BOTTLENECK is a private module attribute; presumably it
# routes reductions through the bottleneck library -- confirm for the
# installed pandas version.
nanops._USE_BOTTLENECK = True
%timeit df1.sum()

10 loops, best of 3: 104 ms per loop


In [10]:
# Same reduction with the bottleneck flag switched off, for comparison.
nanops._USE_BOTTLENECK = False
%timeit df1.sum()

1 loop, best of 3: 216 ms per loop


In [11]:
%timeit pd.Series(np.nansum(df1))

10 loops, best of 3: 141 ms per loop


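YMMV (your mileage may vary): how much bottleneck helps depends on the operation — compare the `sum(1)` timings below with the `sum()` timings above.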

In [12]:
# Switch the bottleneck flag back on and repeat the comparison for the
# axis=1 sum.
nanops._USE_BOTTLENECK = True
%timeit df1.sum(1)

10 loops, best of 3: 27.3 ms per loop


In [13]:
# axis=1 sum again with the bottleneck flag switched off.
nanops._USE_BOTTLENECK = False
%timeit df1.sum(1)

1 loop, best of 3: 229 ms per loop


In [14]:
# NumPy reference for the axis=1 case: nansum along axis 1, wrapped in a Series.
%timeit pd.Series(np.nansum(df1, axis=1))

10 loops, best of 3: 163 ms per loop
