A short sample notebook comparing the performance of pandas (1.3.4) and ArcticDB (4.5.0) for three operations: writing to local disk, reading from local disk, and querying (filtering).

Sources:
https://docs.arcticdb.io/4.5.0/
https://medium.com/arcticdb/why-you-should-use-arcticdb-instead-of-csv-to-save-your-pandas-dataframes-ac4d06e55724

In [1]:
import os
import datetime as dt
import time

import numpy as np
import pandas as pd

import arcticdb as adb

In [2]:
# Set up ArcticDB storage on the local file system through the LMDB backend.
# NOTE(review): this is a hard-coded Windows location, and "C:dev" (no slash after
# the drive letter) looks like it may have been meant as "C:/dev" — confirm before
# running this on another machine.
uri = "lmdb://C:dev/tmp/arcticdb_intro"
ac = adb.Arctic(uri)

In [3]:
# Drop any 'intro' library left behind by an earlier run so the benchmark starts clean.
intro_already_exists = ac.has_library('intro')
if intro_already_exists:
    ac.delete_library('intro')

In [4]:
# Create a fresh 'intro' library with the default options (static schema),
# then list the libraries so the cell output confirms it exists.
ac.create_library('intro')
ac.list_libraries()

['intro']

In [5]:
# Get a handle to the 'intro' library; the write/read benchmarks below go through it.
library = ac['intro']

In [6]:
# Build the benchmark frame: n_row rows x n_col integer columns on a 1-second DatetimeIndex.
n_row = int(5e6)
n_col = 50  # single source of truth for the column count (names and array width)
cols = [f'COL_{i}' for i in range(n_col)]

# Seed the legacy global RNG so that "Restart Kernel -> Run All" regenerates the same
# data, and therefore the same number of rows matching the filters timed later on.
np.random.seed(42)

# Uniform random integers in [0, 50); the upper bound is exclusive.
df_big = pd.DataFrame(np.random.randint(0, 50, size=(n_row, n_col)), columns=cols)

# One row per second starting at 1970-01-01 05:00:00. Lower-case "s" is used because
# the upper-case "S" alias is deprecated in newer pandas releases.
df_big.index = pd.date_range(dt.datetime(1970, 1, 1, 5), periods=n_row, freq="s")

In [7]:
# Peek at the last four rows to sanity-check the generated values and the index.
df_big.tail(4)

Unnamed: 0,COL_0,COL_1,COL_2,COL_3,COL_4,COL_5,COL_6,COL_7,COL_8,COL_9,...,COL_40,COL_41,COL_42,COL_43,COL_44,COL_45,COL_46,COL_47,COL_48,COL_49
1970-02-28 01:53:16,26,27,15,38,47,25,18,43,24,41,...,22,24,18,28,20,36,2,43,41,18
1970-02-28 01:53:17,13,4,33,39,9,36,33,37,30,41,...,0,49,11,9,16,30,2,10,4,4
1970-02-28 01:53:18,31,46,6,39,13,16,9,25,5,46,...,31,34,4,41,22,2,31,41,49,37
1970-02-28 01:53:19,19,22,21,16,32,25,44,48,12,40,...,43,13,24,0,41,15,43,47,33,10


In [15]:
# Show the frame's dimensions as (rows, columns).
df_big.shape

(5000000, 50)

### Testing write time to local disk (C: drive)

In [8]:
# Time how long pandas takes to write the whole frame to a CSV file in the current
# working directory.
# perf_counter() is a monotonic, high-resolution clock meant for measuring intervals;
# time.time() is wall-clock time, which can be coarse and can jump if the system
# clock is adjusted.
s = time.perf_counter()
df_big.to_csv("df_big.csv")
e = time.perf_counter()
print(f'pd.to_csv( )  time  {e - s:.6f} seconds')

pd.to_csv( )  time  32.449820 seconds


In [9]:
# Remove any previously stored 'df_big' symbol first (outside the timed region) so
# that re-running this cell always measures a write of a brand-new symbol.
if library.has_symbol('df_big'):
    library.delete('df_big')

# Time the ArcticDB write. perf_counter() is used instead of time.time() because it
# is monotonic and high-resolution, which is what an interval measurement needs.
s = time.perf_counter()
library.write('df_big', df_big)
e = time.perf_counter()
print(f'Arctic DB write time {e - s:.6f} seconds')

Arctic DB write time 1.124194 seconds


### Pandas .loc querying time

In [10]:
# Time an in-memory pandas filter: four column predicates AND-ed together, keeping
# only COL_11 and COL_38. The result is discarded; only the elapsed time matters.
# perf_counter() replaces time.time() because this measurement is only a fraction of
# a second, where a coarse wall clock can distort the result noticeably.
s = time.perf_counter()
df_big.loc[
    (df_big['COL_0'] > 20)
    & (df_big['COL_10'] < 40)
    & (df_big['COL_15'] > 19)
    & (df_big['COL_30'] < 5),
    ['COL_11', 'COL_38'],
]
e = time.perf_counter()
print(f'pandas.loc filtering {e - s:.6f} seconds')

pandas.loc filtering 0.101297 seconds


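### Testing ArcticDB querying time

Note: the pandas filter above runs on a DataFrame that is already in memory, whereas this query reads the data from ArcticDB storage and filters it in the same call.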

In [11]:
# Step 1: build the query and time the construction separately.
# The predicates must match the pandas .loc benchmark exactly, otherwise the two
# timings are not comparable. The threshold on COL_0 is therefore 20, the same value
# the pandas cell uses (it was previously 0 here, which selected far more rows).
s = time.perf_counter()
q = adb.QueryBuilder()
q = q[(q["COL_0"] > 20) & (q["COL_10"] < 40) & (q["COL_15"] > 19) & (q["COL_30"] < 5)]
e = time.perf_counter()
building_query_time = e - s

# Step 2: time the filtered read itself, restricted to the same two output columns
# as the pandas benchmark. perf_counter() is used because the measurement is well
# under a second and needs a monotonic, high-resolution clock.
s = time.perf_counter()
_temp = library.read('df_big', columns=['COL_11', 'COL_38'], query_builder=q).data
e = time.perf_counter()
adb_filter_time = e - s

# Report the read time alone, and the read time plus query construction.
print(f'ArcticDB filtering {adb_filter_time:.6f} seconds (including building query : {building_query_time + adb_filter_time:.6f})')

ArcticDB filtering 0.068600 seconds (including building query : 0.069597)


### Testing time to read from disk

In [12]:
# Time reading the CSV back into a DataFrame. The first column is restored as the
# index and parsed as datetimes so the result matches the frame that was written.
# perf_counter() is the monotonic, high-resolution clock intended for timing intervals.
s = time.perf_counter()
df_big_in = pd.read_csv("df_big.csv", index_col=0, parse_dates=True)
e = time.perf_counter()
print(f'pd.read_csv( )  time {e - s:.6f} seconds')

pd.read_csv( )  time 15.256523 seconds


In [13]:
# Time reading the full 'df_big' symbol back from ArcticDB.
# perf_counter() replaces time.time() because this measurement is only a fraction of
# a second, where a coarse wall clock can distort the result noticeably.
s = time.perf_counter()
_temp = library.read("df_big")
e = time.perf_counter()
print(f'library.read()  time {e - s:.6f} seconds')

library.read()  time 0.139361 seconds


#### Some cleaning up

In [14]:
# Tidy up: remove the CSV produced by the write benchmark, then drop the 'intro' library.
csv_path = "df_big.csv"
if os.path.exists(csv_path):
    os.remove(csv_path)

intro_still_present = ac.has_library('intro')
if intro_still_present:
    ac.delete_library('intro')