In [20]:
%matplotlib inline

In [21]:
import multiprocessing
import os
import time

import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd
import vaex
from google_drive_downloader import GoogleDriveDownloader as gdd
from tqdm import tqdm

In [22]:
# Benchmark input: the integers 0 .. n-1 rendered as strings, held in a
# single vaex column named "s".
n = 1_000_000
s = np.arange(n).astype(str)
vx_df = vaex.from_arrays(s=s)

In [23]:
# Target path of the exported HDF5 file; it is re-opened with vaex.open()
# in the next cell.
file = './data/test10_6.hdf5'

# Make sure the output directory exists so the cell also works on a fresh
# checkout where './data' has not been created yet.
os.makedirs(os.path.dirname(file), exist_ok=True)

# Write the frame to disk with a progress bar; shuffle=True is passed so the
# exported rows are not left in their original ascending order.
vx_df.export(file, progress=True, shuffle=True)

[########################################]:  100.00% elapsed time  :        0s =  0.0m =  0.0h
  

In [24]:
# Re-open the exported file; from here on vx_df refers to the file-backed
# frame rather than the in-memory one built earlier.
vx_df = vaex.open(file)

# Size the executor buffer as len(frame) / (2 * CPU count).
# NOTE(review): presumably this gives each core roughly two chunks of work;
# confirm against the vaex executor documentation.
n_chunks = multiprocessing.cpu_count() * 2
vx_df.executor.buffer_size = len(vx_df) // n_chunks

# Build the pandas frame from the vaex data, and the dask frame from the
# pandas frame (4 partitions).
pd_df = vx_df.to_pandas_df()
dd_df = dd.from_pandas(pd_df, npartitions=4)

In [25]:
def timeit(expr, n, scope):
    """Evaluate ``expr`` ``n`` times and return the duration of each run.

    Parameters
    ----------
    expr : str
        Python expression passed to ``eval``. It must come from a trusted
        source, because ``eval`` executes arbitrary code.
    n : int
        Number of repetitions. ``0`` yields an empty list.
    scope : dict
        Mapping used as the globals of the evaluated expression. ``eval``
        may add a ``__builtins__`` entry to it.

    Returns
    -------
    list of float
        Elapsed seconds of each evaluation, in execution order.
    """
    times = []
    for _ in range(n):
        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring short intervals; time.time() follows the wall clock,
        # which can be adjusted while a measurement is running.
        start = time.perf_counter()
        eval(expr, scope)
        times.append(time.perf_counter() - start)

    return times

In [26]:
# Benchmark label -> expression source string. Every expression is a string
# accessor call on column "s" of a frame bound to the name "df"; the strings
# are evaluated later with eval(), so the same text is reused for each
# backend. Insertion order is the order in which the benchmarks run.
methods = {
    label: f"df.s.str.{call}"
    for label, call in {
        'capitalize': 'capitalize()',
        'cat': 'cat(df.s)',
        'contains': 'contains("9", regex=False)',
        'contains(regex)': 'contains("9", regex=True)',
        'count': 'count("9")',
        'endswith': 'endswith("9")',
        'find': 'find("4")',
        'get': 'get(1)',
        'split+:join': 'split(".").str.join("-")',
        'len': 'len()',
        'ljust': 'ljust(10)',
        'lower': 'lower()',
        'lstrip': 'lstrip("9")',
        'match': 'match("1.*")',
        'pad': 'pad(10)',
        'repeat': 'repeat(2)',
        'replace(:default)': 'replace("123", "321")',
        'replace(:no regex)': 'replace("123", "321", regex=False)',
        'replace(:regex)': 'replace("1?[45]4", "1004", regex=True)',
        'rfind': 'rfind("4")',
        'rjust': 'rjust(10)',
        'rstrip': 'rstrip("9")',
        'slice': 'slice(1, 3)',
        'split': 'split(".")',
        'startswith': 'startswith("9")',
        'strip': 'strip("0")',
        'title': 'title()',
        'upper': 'upper()',
        'zfill': 'zfill(10)',
    }.items()
}

In [None]:
# Number of timed repetitions per (method, backend) pair.
n = 3

# method label -> {'pandas': [...], 'dask': [...], 'vaex': [...]}, where each
# list holds the n per-run durations returned by timeit().
performances = {}
for name, expr in tqdm(methods.items()):
    # Per backend: the expression source to evaluate and the frame bound to
    # "df" while it runs.
    #   pandas - the expression is used as written.
    #   dask   - ".compute()" is appended so the computation is executed.
    #   vaex   - the expression is wrapped in df.evaluate(...) so that it is
    #            evaluated.
    backends = {
        'pandas': (expr, pd_df),
        'dask': (expr + ".compute()", dd_df),
        'vaex': (f"df.evaluate({expr})", vx_df),
    }

    # Dict order fixes the run order: pandas first, then dask, then vaex.
    performance = {
        backend: timeit(source, n, scope={'df': frame})
        for backend, (source, frame) in backends.items()
    }
    print(performance)
    performances[name] = performance


  0%|          | 0/29 [00:00<?, ?it/s][A
  3%|▎         | 1/29 [00:01<00:50,  1.81s/it][A

{'pandas': [0.23424696922302246, 0.23365116119384766, 0.23329997062683105], 'dask': [0.25200533866882324, 0.2503385543823242, 0.2992243766784668], 'vaex': [0.10143065452575684, 0.10077357292175293, 0.1002349853515625]}



  7%|▋         | 2/29 [00:03<00:44,  1.65s/it][A

{'pandas': [0.16308069229125977, 0.15632867813110352, 0.15568065643310547], 'dask': [0.17465543746948242, 0.1755516529083252, 0.17493677139282227], 'vaex': [0.09353375434875488, 0.09304666519165039, 0.09308719635009766]}



 10%|█         | 3/29 [00:03<00:36,  1.39s/it][A

{'pandas': [0.12481307983398438, 0.12304902076721191, 0.12195301055908203], 'dask': [0.12784743309020996, 0.1281728744506836, 0.12812185287475586], 'vaex': [0.015399932861328125, 0.015243768692016602, 0.01526951789855957]}



 14%|█▍        | 4/29 [00:06<00:41,  1.68s/it][A

{'pandas': [0.30130958557128906, 0.29733777046203613, 0.2966008186340332], 'dask': [0.309039831161499, 0.3795909881591797, 0.3103296756744385], 'vaex': [0.14954113960266113, 0.14960980415344238, 0.14961743354797363]}



 17%|█▋        | 5/29 [00:09<00:48,  2.02s/it][A

{'pandas': [0.453413724899292, 0.4499943256378174, 0.45183587074279785], 'dask': [0.45023059844970703, 0.4523427486419678, 0.4852902889251709], 'vaex': [0.019762516021728516, 0.01951146125793457, 0.019431352615356445]}



 21%|██        | 6/29 [00:10<00:40,  1.78s/it][A

{'pandas': [0.19770503044128418, 0.1964578628540039, 0.20101308822631836], 'dask': [0.2064836025238037, 0.20662665367126465, 0.20450496673583984], 'vaex': [0.006644487380981445, 0.0066394805908203125, 0.006482601165771484]}



 24%|██▍       | 7/29 [00:12<00:44,  2.01s/it][A

{'pandas': [0.40308713912963867, 0.40128278732299805, 0.4010121822357178], 'dask': [0.4417264461517334, 0.4383223056793213, 0.4003169536590576], 'vaex': [0.01884174346923828, 0.01844310760498047, 0.018543720245361328]}



 28%|██▊       | 8/29 [00:15<00:44,  2.10s/it][A

{'pandas': [0.35976457595825195, 0.3595418930053711, 0.359022855758667], 'dask': [0.3760373592376709, 0.3876187801361084, 0.3732173442840576], 'vaex': [0.03494596481323242, 0.029731035232543945, 0.029377222061157227]}



 31%|███       | 9/29 [00:21<01:10,  3.52s/it][A

{'pandas': [1.1024730205535889, 1.0242700576782227, 1.2470834255218506], 'dask': [1.0770456790924072, 1.1341497898101807, 0.8723037242889404], 'vaex': [0.12907004356384277, 0.11945104598999023, 0.12254524230957031]}



 34%|███▍      | 10/29 [00:23<00:53,  2.82s/it][A

{'pandas': [0.19522523880004883, 0.1987607479095459, 0.1988358497619629], 'dask': [0.18904972076416016, 0.19322896003723145, 0.19105076789855957], 'vaex': [0.009613275527954102, 0.009302377700805664, 0.009254932403564453]}



 38%|███▊      | 11/29 [00:25<00:45,  2.54s/it][A

{'pandas': [0.25461912155151367, 0.25316762924194336, 0.2523951530456543], 'dask': [0.2744786739349365, 0.2761960029602051, 0.281177282333374], 'vaex': [0.09861111640930176, 0.09110283851623535, 0.09062337875366211]}



 41%|████▏     | 12/29 [00:26<00:37,  2.20s/it][A

{'pandas': [0.18072199821472168, 0.1793351173400879, 0.17775654792785645], 'dask': [0.19736623764038086, 0.19574189186096191, 0.19768095016479492], 'vaex': [0.09921932220458984, 0.09649229049682617, 0.0953223705291748]}



 45%|████▍     | 13/29 [00:27<00:31,  1.95s/it][A

{'pandas': [0.17020559310913086, 0.16909193992614746, 0.16887259483337402], 'dask': [0.18880033493041992, 0.18882989883422852, 0.18727660179138184], 'vaex': [0.09595036506652832, 0.08976888656616211, 0.09055495262145996]}


In [18]:
# Collect the timings into a table: one column per benchmarked method, one
# row per backend; every cell holds that pair's list of per-run durations.
dfp = pd.DataFrame(data=performances)

In [19]:
# Show the timing table (rows: backends, columns: methods; cells are lists of
# per-run durations in seconds).
dfp

Unnamed: 0,capitalize,cat,contains,contains(regex),count,endswith,find,get,split+:join,len,...,rfind,rjust,rstrip,slice,split,startswith,strip,title,upper,zfill
dask,"[0.25731801986694336, 0.24950575828552246, 0.2...","[0.18752527236938477, 0.1871662139892578, 0.18...","[0.1306459903717041, 0.13061952590942383, 0.13...","[0.30754780769348145, 0.3069298267364502, 0.30...","[0.4513518810272217, 0.45116090774536133, 0.44...","[0.2051699161529541, 0.20480585098266602, 0.20...","[0.40048718452453613, 0.39728403091430664, 0.3...","[0.3671596050262451, 0.36919569969177246, 0.36...","[0.9432597160339355, 0.8808643817901611, 0.917...","[0.19528436660766602, 0.19721508026123047, 0.1...",...,"[0.40775275230407715, 0.4095950126647949, 0.40...","[0.27143311500549316, 0.2722897529602051, 0.27...","[0.18586969375610352, 0.1855921745300293, 0.18...","[0.20847845077514648, 0.2076883316040039, 0.20...","[0.9155898094177246, 0.7728266716003418, 0.958...","[0.2019946575164795, 0.20117735862731934, 0.20...","[0.1874079704284668, 0.1856400966644287, 0.188...","[0.2509031295776367, 0.2484724521636963, 0.248...","[0.19664263725280762, 0.19481611251831055, 0.1...","[0.2691981792449951, 0.2681879997253418, 0.267..."
pandas,"[0.22765851020812988, 0.22595643997192383, 0.2...","[0.15263962745666504, 0.15285181999206543, 0.1...","[0.12078714370727539, 0.12008404731750488, 0.1...","[0.29783105850219727, 0.2952554225921631, 0.29...","[0.42891454696655273, 0.42777395248413086, 0.4...","[0.1957077980041504, 0.19394445419311523, 0.19...","[0.3796985149383545, 0.3771781921386719, 0.377...","[0.3553457260131836, 0.3522377014160156, 0.352...","[0.8739111423492432, 0.8966310024261475, 0.965...","[0.1888265609741211, 0.17841601371765137, 0.17...",...,"[0.3923373222351074, 0.3899421691894531, 0.389...","[0.24907326698303223, 0.2460780143737793, 0.24...","[0.1636500358581543, 0.16338300704956055, 0.16...","[0.18394112586975098, 0.1832897663116455, 0.18...","[0.8705847263336182, 0.8192665576934814, 0.808...","[0.1947321891784668, 0.19338536262512207, 0.19...","[0.1643390655517578, 0.1641850471496582, 0.162...","[0.22969436645507812, 0.2278749942779541, 0.22...","[0.17649459838867188, 0.17708754539489746, 0.1...","[0.2446901798248291, 0.24320030212402344, 0.24..."
vaex,"[0.01603078842163086, 0.016124486923217773, 0....","[0.014542102813720703, 0.014859199523925781, 0...","[0.015097856521606445, 0.016002416610717773, 0...","[0.016892671585083008, 0.016717910766601562, 0...","[0.01605987548828125, 0.017255544662475586, 0....","[0.019321203231811523, 0.018146276473999023, 0...","[0.016350269317626953, 0.015612602233886719, 0...","[0.015938758850097656, 0.014409780502319336, 0...","[0.03163862228393555, 0.021863460540771484, 0....","[0.016887903213500977, 0.012406349182128906, 0...",...,"[0.015015363693237305, 0.015949487686157227, 0...","[0.018538951873779297, 0.017605304718017578, 0...","[0.016759395599365234, 0.017879486083984375, 0...","[0.01547551155090332, 0.015105962753295898, 0....","[0.019242048263549805, 0.016275882720947266, 0...","[0.01785445213317871, 0.016554832458496094, 0....","[0.01784515380859375, 0.01579904556274414, 0.0...","[0.016451358795166016, 0.01613163948059082, 0....","[0.015186786651611328, 0.016582489013671875, 0...","[0.01840829849243164, 0.01764059066772461, 0.0..."
