In [2]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell is executed.
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import json

# Compare runtimes of string related operations (numpy vs pandas vs koalas)

In [5]:
import databricks.koalas as ks

In [34]:
# np.array(pd.Series(['1', '3'])).astype('U')

In [35]:
# Four sample values (three strings and one missing value) repeated 100,000
# times, i.e. 400,000 rows used by the string / isna / isin benchmarks.
pd_arr = pd.Series(
    [
        'Aadskbjhdsfbjhgvbds',
        'BBKSDJNFJDSFKJKSDBN',
        'ACasdkjhasjdasfhgbdfhnadmsnfbahjcsdm,ahiawnfdjkshajfnawcfkenmsx',
        np.nan,
    ] * 10**5
)

# NumPy unicode ('U') version of the same values. Note that this cast turns
# np.nan into the string 'nan', so np_arr holds no missing values.
np_arr = pd_arr.values.astype('U')

# Koalas (Spark-backed) version of the same Series.
ks_arr = ks.from_pandas(pd_arr)

In [36]:
%%timeit
np.char.find(np_arr, 'BB')

285 ms ± 32.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [37]:
%%timeit
# Time pandas' .str.contains for the substring 'BB' on pd_arr.
pd_arr.str.contains('BB')

134 ms ± 13.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [38]:
%%timeit
ks_arr.str.contains('BB')



7.84 ms ± 1.18 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


**On smaller dataset (100k): Pandas is ~2 times quicker than numpy, and koalas is ~17 times quicker than pandas**

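**On larger dataset (1m+): Pandas is ~2 times quicker than numpy, and koalas is ~150 times quicker than pandas**

*Caveat: Koalas evaluates lazily. The Koalas timings recorded in this notebook were taken without triggering a Spark action, so they mostly reflect building the query plan rather than processing the rows; this is also why the gap appears to grow with dataset size.*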

# Compare runtimes of isna operator (numpy vs pandas vs koalas)

In [39]:
%%timeit
pd.isna(np_arr)

16.5 µs ± 1.04 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [40]:
%%timeit
# Time the pandas Series.isna method on pd_arr.
pd_arr.isna()

10.5 ms ± 445 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [41]:
%%timeit
ks_arr.isna()

3.74 ms ± 439 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


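**On smaller dataset (100k): koalas ~ 3 times faster than pandas, numpy ~220 times faster than koalas**

*Caveat: the recorded numpy timing was taken on `np_arr`, which was created with `.astype('U')`. That cast turns `np.nan` into the string `'nan'`, so the array contains no missing values and the numpy figure does not measure the same work as the pandas one.*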

**On larger dataset (1m+): pandas and numpy perform similarly, and koalas is ~400 times faster than both**

# Compare runtimes of isin operator (numpy vs pandas vs koalas)

In [42]:
%%timeit
# Time NumPy's isin membership test of np_arr against three candidate strings.
np.isin(np_arr, ['Aadskbjhdsfbjhgvbds', 'A', 'B'])

30.6 ms ± 1.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [43]:
%%timeit
# Time pandas' Series.isin on pd_arr with the same three candidate strings.
pd_arr.isin(['Aadskbjhdsfbjhgvbds', 'A', 'B'])

8.72 ms ± 424 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [44]:
%%timeit
ks_arr.isin(['Aadskbjhdsfbjhgvbds', 'A', 'B'])

4.58 ms ± 1.02 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


**On smaller dataset (100k): pandas is ~3 times quicker than numpy, and koalas is ~2 times quicker than pandas**

**On larger datasets (1m+): pandas is ~2 times quicker than numpy, and koalas is ~200 times quicker than pandas**

# Compare runtimes of standard operators

In [58]:
# 4,000,000 integers: the pattern 1, 2, 3, 4 repeated one million times,
# exposed through pandas, NumPy and Koalas for the comparison benchmarks.
pd_num_arr = pd.Series(
    [1, 2, 3, 4] * 10**6
)
np_num_arr = pd_num_arr.values            # NumPy array behind the Series
ks_num_arr = ks.from_pandas(pd_num_arr)   # Koalas version of the same data

In [59]:
%%timeit
# Time the element-wise "> 2" comparison on the NumPy array.
np_num_arr > 2

3.35 ms ± 86.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [60]:
%%timeit
# Time the element-wise "> 2" comparison on the pandas Series.
pd_num_arr > 2

3.53 ms ± 53.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [61]:
%%timeit
ks_num_arr > 2

1.8 ms ± 106 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


**On smaller dataset (100k): pandas is ~4 times quicker than koalas, and numpy is ~2 times quicker than pandas**

**On larger datasets (1m+): numpy and pandas perform similarly, and koalas is ~2 times quicker**

# Compare runtimes on fillna (for IsEmpty condition)

In [6]:
# 10,000,000 missing values for the fillna benchmarks; np.full builds the
# array directly instead of going through a 10M-element Python list.
np_arr_test = np.full(10000000, np.nan)

# copy=True gives the Series its own buffer. Without it the Series would share
# memory with np_arr_test, and any in-place NumPy operation on that array
# would silently change the pandas benchmark input as well.
pd_arr_test = pd.Series(np_arr_test, copy=True)

# Koalas version of the same data.
ks_arr_test = ks.from_pandas(pd_arr_test)

In [7]:
%timeit np.nan_to_num(np_arr_test, 0)

58.4 ms ± 2.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
# Time pandas' Series.fillna(0) on pd_arr_test.
%timeit pd_arr_test.fillna(0)

37.9 ms ± 414 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
%timeit ks_arr_test.fillna(0)

6.34 ms ± 638 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
