# 1 Billion Rows

In [14]:
import pandas as pd
import numpy as np
import numexpr as ne

In [2]:
from multiprocessing import cpu_count, Pool
cores = cpu_count()

In [3]:
df_plane_numeric = pd.DataFrame(np.random.uniform(-10, 10, (1000*1000*1000, 1)), columns=['ArrDelay'])
df_plane_numeric.head()

Unnamed: 0,ArrDelay
0,-0.908226
1,-0.229998
2,5.086153
3,0.182247
4,6.155174


## Positive or Negative?

In [4]:
filter_func = lambda x: 1 if x > 0 else 0

In [6]:
%%time
# Map lambda func (original)
df_plane_numeric['ArrDelayBinary1'] = df_plane_numeric['ArrDelay'].map(filter_func)

CPU times: user 5min 59s, sys: 28.8 s, total: 6min 28s
Wall time: 6min 27s


In [5]:
def filter_funcf(dta):
    return 1*(dta>1)

In [7]:
%%time
# Apply conditional (without if-else)
df_plane_numeric['ArrDelayBinary2'] = df_plane_numeric['ArrDelay'].apply(filter_funcf)

CPU times: user 6min 36s, sys: 33 s, total: 7min 9s
Wall time: 7min 9s


In [8]:
%%time
# Condition and cast as int
df_plane_numeric['ArrDelayBinary3'] = 1*(df_plane_numeric['ArrDelay'] > 0)

CPU times: user 4.46 s, sys: 4.78 s, total: 9.24 s
Wall time: 5.16 s


In [9]:
%%time
# Condition and explicitly cast as int
df_plane_numeric['ArrDelayBinary4'] = (df_plane_numeric['ArrDelay'] > 0).astype(int)

CPU times: user 3.08 s, sys: 3.67 s, total: 6.76 s
Wall time: 6.76 s


In [10]:
%%time
# Keep as Bool (why not????)
df_plane_numeric['ArrDelayBinary5'] = df_plane_numeric['ArrDelay'] > 0

CPU times: user 1.01 s, sys: 424 ms, total: 1.43 s
Wall time: 1.43 s


In [11]:
def parallelize(data, func):
    data_split = np.array_split(data, cores)
    pool = Pool(cores)
    out = pool.map(func, data_split)
    data = pd.concat(out)
    pool.close()
    pool.join()
    return data

In [12]:
%%time
# Run in parallel
df_plane_numeric['ArrDelayBinaryMP'] = parallelize(df_plane_numeric['ArrDelay'], filter_funcf)

CPU times: user 20.3 s, sys: 27.7 s, total: 48 s
Wall time: 47.4 s


In [13]:
df_plane_numeric.tail()

Unnamed: 0,ArrDelay,ArrDelayBinary1,ArrDelayBinary2,ArrDelayBinary3,ArrDelayBinary4,ArrDelayBinary5,ArrDelayBinaryMP
999999995,-6.773852,0,0,0,0,False,0
999999996,7.019398,1,1,1,1,True,1
999999997,0.200838,1,0,1,1,True,0
999999998,9.132459,1,1,1,1,True,1
999999999,-8.178324,0,0,0,0,False,0


### Try NumExpr

In [28]:
%%time
a = df_plane_numeric.ArrDelay.values
df_plane_numeric['numexpr'] = ne.evaluate("a > 0")

CPU times: user 3.62 s, sys: 410 ms, total: 4.03 s
Wall time: 989 ms


In [21]:
df_plane_numeric.tail()

Unnamed: 0,ArrDelay,ArrDelayBinary1,ArrDelayBinary2,ArrDelayBinary3,ArrDelayBinary4,ArrDelayBinary5,ArrDelayBinaryMP,numexpr
999999995,-6.773852,0,0,0,0,False,0,False
999999996,7.019398,1,1,1,1,True,1,True
999999997,0.200838,1,0,1,1,True,0,True
999999998,9.132459,1,1,1,1,True,1,True
999999999,-8.178324,0,0,0,0,False,0,False
