# Word Embeddings in RocksDB

In [22]:
import io
import os
import pickle
import threading
import time

import numpy
import plotly
import rocksdb
from tqdm import tqdm_notebook as tqdm

# Dummy Embeddings

For testing purposes we will use randomly generated NumPy arrays as dummy embeddings.

In [23]:
def embeddings(n=1000, dim=512):
    """
    Lazily generate *n* dummy embeddings.

    Each yielded item is a ``(key, vector)`` tuple: ``key`` is the running
    index (0 .. n-1) rendered as a string, ``vector`` is a 1-D numpy array
    of length *dim* produced by ``numpy.random.rand``.
    """
    for position in range(n):
        yield (str(position), numpy.random.rand(dim))

# Conversion Functions

Since we can't just save a NumPy array into the database, we will convert it into a BLOB.

In [24]:
def adapt_array(array):
    """
    Serialize a numpy array into a bytes BLOB.

    The array is written with ``numpy.save`` into an in-memory ``BytesIO``
    buffer, and the complete buffer content is returned as ``bytes``.
    """
    buffer = io.BytesIO()
    numpy.save(buffer, array)
    # getvalue() returns the whole buffer regardless of the stream position.
    return buffer.getvalue()


def convert_array(blob):
    """
    Deserialize a bytes BLOB back into a numpy array.

    The bytes are wrapped in an in-memory ``BytesIO`` stream (which starts
    at position 0) and handed to ``numpy.load``.
    """
    return numpy.load(io.BytesIO(blob))

In [25]:
# Open the RocksDB database used by the benchmark, creating it on first run
# (without create_if_missing=True, opening a non-existent database fails).
# NOTE(review): a RocksIOError mentioning embeddings.db/LOCK means the lock
# file could not be acquired - presumably another process/kernel still has the
# database open, or the filesystem does not support file locking; confirm.
db = rocksdb.DB("embeddings.db", rocksdb.Options(create_if_missing=True))

RocksIOError: b'IO error: While lock file: embeddings.db/LOCK: No locks available'

In [26]:
# Wall-clock seconds per entry of `counts`, filled in by the loop below.
write_times = []
read_times = []
db_sizes = []  # not populated in this cell
counts = [500, 1000, 2000, 3000, 4000, 5000, 50000, 100000, 1000000, 10000000]

for c in counts:

    # Write phase: generate, serialize and store c embeddings.
    # perf_counter() is a monotonic high-resolution clock meant for measuring
    # intervals; time.time() can jump when the system clock is adjusted.
    write_start = time.perf_counter()
    for key, emb in tqdm(embeddings(c), total=c):
        db.put(key.encode(), adapt_array(emb))
    write_times.append(time.perf_counter() - write_start)

    # Read phase: fetch and deserialize the same c keys. The keys written
    # above are str(0) .. str(c - 1), so they are rebuilt directly here
    # instead of calling embeddings(c) again, which would add the cost of
    # generating c random vectors to the measured read time.
    read_start = time.perf_counter()
    for idx in range(c):
        emb = convert_array(db.get(str(idx).encode()))
        assert isinstance(emb, numpy.ndarray)
    read_times.append(time.perf_counter() - read_start)

print('DONE')

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10000000), HTML(value='')))


DONE


In [27]:
# save times for later plotting

# Make sure the output directory exists; otherwise open() raises
# FileNotFoundError and the benchmark results are lost.
os.makedirs('./collected_times', exist_ok=True)

with open('./collected_times/rocksdb-write-times.pickle', 'wb') as f:
    pickle.dump(write_times, f)

with open('./collected_times/rocksdb-read-times.pickle', 'wb') as f:
    pickle.dump(read_times, f)

In [28]:
# Write Times
# Plot the measured write duration (y) against the number of embeddings (x).
plotly.offline.init_notebook_mode(connected=True)
trace = plotly.graph_objs.Scatter(
    y = write_times,
    x = counts,
    mode = 'lines+markers'
)
# Axis titles follow the trace: x holds the counts, y holds the timings.
layout = plotly.graph_objs.Layout(title="RocksDB Write Times",
                yaxis=dict(title='Time in Seconds'),
                xaxis=dict(title='Embedding Count'))
data = [trace]
fig = plotly.graph_objs.Figure(data=data, layout=layout)
plotly.offline.iplot(fig, filename='jupyter-basic-scatter')

In [29]:
# Read Times
# Plot the measured read duration (y) against the number of embeddings (x).
plotly.offline.init_notebook_mode(connected=True)
trace = plotly.graph_objs.Scatter(
    y = read_times,
    x = counts,
    mode = 'lines+markers'
)
# The timings plotted here come from RocksDB, so the title names RocksDB.
layout = plotly.graph_objs.Layout(title="RocksDB Read Times",
                yaxis=dict(title='Time in Seconds'),
                xaxis=dict(title='Embedding Count'))
data = [trace]
fig = plotly.graph_objs.Figure(data=data, layout=layout)
plotly.offline.iplot(fig, filename='jupyter-scatter-read')