The first few cells just start a local MongoDB instance
===

In [None]:
# Sanity check: confirm mongod is installed and on the PATH, and show its version.
!mongod --version

In [None]:
# kill mongo if it's running (makes this notebook easily re-runnable)
# however beware if you are running mongo for other reasons!
!ps x | grep mongod | grep -v grep | awk '{print $1}' | xargs kill

In [None]:
# Show the launch script before running it, so the reader can see what the next cell does.
!cat ./mongo.sh

In [None]:
# Start the local mongo instance via the launch script (its contents live outside this notebook).
!./mongo.sh

In [None]:
# Show the last 10 log lines to check that the server came up.
# NOTE(review): assumes mongo.sh makes mongod log to data/mongo.log — confirm.
!tail -10 data/mongo.log

In [None]:
import time

import arctic
import numpy as np
import pandas as pd

Create an example dataframe
===

In [None]:
# Shape of the synthetic panel: ~10 years of daily rows x 3000 securities.
days = 365*10
n_securities = 3000
# Seeded generator so that every "Restart & Run All" builds the same frame,
# which keeps the timings and storage sizes below comparable between runs.
rng = np.random.RandomState(42)
df_wide = pd.DataFrame(
    data=rng.rand(days, n_securities),
    index=pd.date_range('2000', periods=days),
    columns=['security_{}'.format(i) for i in range(1, n_securities+1)],
)
df_wide.head()

Using VersionStore to read/write wide data
===

In [None]:
# Connect to the MongoDB server on localhost and list the libraries already present,
# then initialize the VersionStore library used below and list its symbols.
db = arctic.Arctic("localhost")
print("Libraries: {}".format(db.list_libraries()))
db.initialize_library('libvs1', lib_type='VersionStore')
libvs1 = db['libvs1']
print("Symbols in {}: {}".format('libvs1', libvs1.list_symbols()))

In [None]:
def get_size(lib):
    ''' helper to get size of an arctic library in mongo

    Sums the ``storageSize`` reported by MongoDB's ``collstats`` command over
    the library's top-level collection and its dotted sub-collections
    (``<name>.<suffix>``).

    lib: an arctic library object exposing ``_collection`` and ``_arctic_lib``
    returns: a string such as ``'3.0 megabytes'`` (1 megabyte = 1e6 bytes)
    '''
    database = lib._arctic_lib._library_coll.database
    base_name = lib._collection.name
    sub_prefix = base_name + '.'

    # pymongo deprecated collection_names() in favour of list_collection_names().
    # Look the method up on the class: a pymongo Database turns unknown
    # *instance* attributes into Collection objects, so a plain getattr()
    # on the instance cannot be used to detect whether the method exists.
    if hasattr(type(database), 'list_collection_names'):
        all_names = database.list_collection_names()
    else:
        all_names = database.collection_names()

    byts = 0.
    for coll_name in all_names:
        # Exact/prefix match rather than substring, so that 'libvs1' does not
        # also pick up collections of other libraries such as 'libvs10'.
        if coll_name == base_name or coll_name.startswith(sub_prefix):
            byts += database.command('collstats', coll_name)['storageSize']
    return '{} megabytes'.format(byts / 1e6)

In [None]:
size_before = get_size(libvs1)
%time libvs1.write('wide_item1', df_wide)
size_after = get_size(libvs1)

print('')
print('Size before: {}'.format(size_before))
print('Size after: {}'.format(size_after))
print('Symbols in {}: {}'.format('libvs1', libvs1.list_symbols()))

Note: that is far more space-efficient than I'd expect. TODO: dig a bit deeper — possibly the size calculation is off (for example, `storageSize` may not yet reflect data that has not been flushed to disk).

In [None]:
# note: VersionStore wraps results in a class; .data gives access to the
# object we want (a DataFrame in this case)
%time rb_wide = libvs1.read('wide_item1').data

In [None]:
# Preview the frame read back from VersionStore.
rb_wide.head()

In [None]:
# Reduce over both axes explicitly. np.all() on a DataFrame only collapses to a
# single bool on pandas >= 0.23; older versions return one bool per column.
(rb_wide == df_wide).values.all()

In [None]:
# Summary of the original wide frame, for comparison with the read-back copy.
df_wide.info()

In [None]:
# Same summary for the frame read back from VersionStore.
rb_wide.info()

In [None]:
# note: rb_wide's index has lost the metadata about its frequency,
# but in this instance it can be inferred. It would still
# be good to change arctic to keep this.
rb_wide.index.inferred_freq

Using ChunkStore to read/write tall data
===

In [None]:
# reshape wide to tall
%time df_tall = df_wide.stack().reset_index().rename(columns={'level_0': 'date', 'level_1': 'security_id', 0: 'vals'})

df_tall.head()

In [None]:
# Initialize a ChunkStore library and grab a handle to it.
# Note that the variable is called libcs while the library itself is named 'libcs1'.
db.initialize_library('libcs1', lib_type='ChunkStoreV1')
libcs = db['libcs1']
print('Symbols in {}: {}'.format('libcs1', libcs.list_symbols()))

In [None]:
size_before = get_size(libcs)
%time libcs.write('tall_item1', df_tall, chunk_size='A')
size_after = get_size(libcs)

print('')
print('Size before: {}'.format(size_before))
print('Size after: {}'.format(size_after))
print('Symbols in {}: {}'.format('libcs1', libcs.list_symbols()))

In [None]:
# Time a full read of the tall item. The ChunkStore result is used directly as
# the frame here (no .data unwrapping as in the VersionStore read).
%time rb_tall = libcs.read('tall_item1')

rb_tall.head()

In [None]:
# Reduce over both axes explicitly. np.all() on a DataFrame only collapses to a
# single bool on pandas >= 0.23; older versions return one bool per column.
(df_tall == rb_tall).values.all()

In [None]:
# Summary of the original tall frame, for comparison with the read-back copy.
df_tall.info()

In [None]:
# Same summary for the frame read back from ChunkStore.
rb_tall.info()

Writing tall data to VersionStore (perf ok, tall is less space efficient than wide format for VersionStore)
===

In [None]:
# Library size before writing the tall frame (compare with the value after the write).
get_size(libvs1)

In [None]:
# Time writing the same tall frame to VersionStore, for comparison with the ChunkStore write.
%time libvs1.write('tall_item1', df_tall)

In [None]:
# Keep the VersionStore wrapper here; the DataFrame itself is taken from .data
# in the comparison cell.
%time rb_vs_tall = libvs1.read('tall_item1')

In [None]:
# Reduce over both axes explicitly. np.all() on a DataFrame only collapses to a
# single bool on pandas >= 0.23; older versions return one bool per column.
(df_tall == rb_vs_tall.data).values.all()

In [None]:
# Library size after writing the tall frame.
get_size(libvs1)

Writing wide data to ChunkStore (perf bad, not great on space either)
===

In [None]:
# Library size before writing the wide frame (compare with the value after the write).
get_size(libcs)

In [None]:
# chunkstore's date chunker is picky about having an index or column called 'date'.
# NOTE: this renames the index of df_wide in place, so earlier cells will see
# the named index if they are re-run after this one.
df_wide.index.name = 'date'

In [None]:
# Time writing the wide frame to ChunkStore.
# NOTE(review): chunk_size='A' is presumably one chunk per year (pandas 'A' alias) — confirm.
%time libcs.write('wide_item1', df_wide, chunk_size='A')

In [None]:
# Time reading the wide frame back from ChunkStore.
%time rb_cs_wide = libcs.read('wide_item1')

In [None]:
# Reduce over both axes explicitly. np.all() on a DataFrame only collapses to a
# single bool on pandas >= 0.23; older versions return one bool per column.
(df_wide == rb_cs_wide).values.all()

In [None]:
# Library size after writing the wide frame.
get_size(libcs)