A short sample notebook to compare the performance of pandas (1.3.4 was used) vs ArcticDB (4.5.0) in reading from /writing to local disk/query operations 

Sources:
https://docs.arcticdb.io/4.5.0/
https://medium.com/arcticdb/why-you-should-use-arcticdb-instead-of-csv-to-save-your-pandas-dataframes-ac4d06e55724

In [1]:
import os
import datetime as dt
import time

import numpy as np
import pandas as pd

import arcticdb as adb

In [2]:
# this will set up the storage using the local file system
uri = "lmdb://C:dev/tmp/arcticdb_intro"
ac = adb.Arctic(uri)
ITERATIONS_N = 3  # how many times to perform each operation in sampling execution time

In [3]:
if ac.has_library('intro'):
    ac.delete_library('intro')

In [4]:
ac.create_library('intro')  # static schema
ac.list_libraries()

['intro']

In [5]:
library = ac['intro']

In [6]:
n_row = int(5e6)
cols = ['COL_%d' % i for i in range(50)]
df_big = pd.DataFrame(np.random.randint(0, 50, size=(n_row, 50)), columns=cols)
df_big.index = pd.date_range(dt.datetime(1970, 1, 1, 5), periods=n_row, freq="S")

In [7]:
df_big.tail(4)

Unnamed: 0,COL_0,COL_1,COL_2,COL_3,COL_4,COL_5,COL_6,COL_7,COL_8,COL_9,...,COL_40,COL_41,COL_42,COL_43,COL_44,COL_45,COL_46,COL_47,COL_48,COL_49
1970-02-28 01:53:16,13,22,24,43,36,30,46,6,24,39,...,48,9,4,40,41,5,16,26,23,28
1970-02-28 01:53:17,30,43,15,7,17,21,41,41,3,28,...,37,9,25,29,16,9,31,25,44,39
1970-02-28 01:53:18,47,9,47,36,32,47,5,32,12,23,...,28,39,18,18,29,19,0,6,49,39
1970-02-28 01:53:19,42,10,38,38,27,24,18,10,26,16,...,21,8,23,38,4,3,5,43,43,0


In [8]:
df_big.shape

(5000000, 50)

### Testing output time to C: drive

In [9]:
samples = []
for _itr in range(ITERATIONS_N):
    s = time.time()
    df_big.to_csv("df_big.csv")
    e = time.time()
    samples.append((e-s))
print(f'pd.to_csv( )  time  {"{:.6f}".format(np.mean(samples))} seconds (avg over {ITERATIONS_N} iterations)')

pd.to_csv( )  time  33.142617 seconds (over 3 iterations)


In [10]:
if library.has_symbol('df_big'):
    library.delete('df_big')
    
samples = []
for _itr in range(ITERATIONS_N):
    s = time.time()
    library.write('df_big', df_big)
    e = time.time()
    samples.append((e-s))
print(f'Arctic DB write time {"{:.6f}".format(np.mean(samples))} seconds (avg over {ITERATIONS_N} iterations)')

Arctic DB write time 1.102033 seconds (over 3 iterations)


### Pandas .loc querying time

In [11]:
samples = []
for _itr in range(ITERATIONS_N):
    s = time.time()
    df_big.loc[(df_big['COL_0'] > 20)  &  (df_big['COL_10'] < 40) & (df_big['COL_15'] > 19) & (df_big['COL_30'] < 5), ['COL_11', 'COL_38']]
    e = time.time()
    samples.append((e-s))
print(f'pandas.loc filtering {"{:.6f}".format(np.mean(samples))} seconds (avg over {ITERATIONS_N} iterations)')

pandas.loc filtering 0.095933 seconds (over 3 iterations)


### Testing ArcticDB querying time

In [12]:
samples = []
for _itr in range(ITERATIONS_N):
    s = time.time()
    q = adb.QueryBuilder()
    q = q[(q["COL_0"] > 0) & (q["COL_10"] < 40) & (q["COL_15"] > 19) & (q["COL_30"] <5)]
    e = time.time()
    samples.append(e-s)
query_build_avg_t = np.mean(samples)

samples = []
for _itr in range(ITERATIONS_N):
    s = time.time()
    _temp = library.read('df_big', columns= ['COL_11', 'COL_38'], query_builder=q).data
    e = time.time()
    samples.append(e-s)
query_avg_t = np.mean(samples)
print(f'ArcticDB filtering {"{:.6f}".format(query_avg_t)} seconds (avg over {ITERATIONS_N} iterations) '
      f'(including building query : {"{:.6f}".format(query_avg_t + query_build_avg_t)})')

ArcticDB filtering 0.071078 seconds (over 3 iterations) (including building query : 0.071742)


### Testing time to read from disk

In [13]:
samples = []
for _itr in range(ITERATIONS_N):
    s = time.time()
    df_big_in = pd.read_csv("df_big.csv", index_col=0, parse_dates=True)
    e = time.time()
    samples.append(e-s)
print(f'pd.read_csv( )  time {"{:.6f}".format(np.mean(samples))} seconds (avg over {ITERATIONS_N} iterations)')

pd.read_csv( )  time 16.067021 seconds (over 3 iterations)


In [14]:
samples = []
for _itr in range(ITERATIONS_N):
    s = time.time()
    _temp = library.read("df_big")
    e = time.time()
    samples.append(e-s)
print(f'library.read()  time {"{:.6f}".format(np.mean(samples))} seconds (avg over {ITERATIONS_N} iterations)')

library.read()  time 0.324823 seconds (over 3 iterations)


#### Some cleaning up

In [15]:
if os.path.exists("df_big.csv"):
    os.remove("df_big.csv")
if ac.has_library('intro'):
    ac.delete_library('intro')