# Word Embeddings in LevelDB

This example uses the [plyvel](https://plyvel.readthedocs.io/en/latest/) LevelDB binding for Python 3 to store and retrieve varying numbers of word embeddings.

See the [installation instructions](https://computingforgeeks.com/how-to-install-leveldb-on-ubuntu-18-04-ubuntu-16-04/) for how to install LevelDB on Linux.

In [2]:
import io
import os
import pickle
import shutil
import subprocess
import time

import numpy
import plotly
import plyvel
from tqdm import tqdm_notebook as tqdm

# Dummy Embeddings

For testing purposes we will use randomly generated NumPy arrays as dummy embeddings.

In [3]:
def embeddings(n=1000, dim=512):
    """
    Generate n (key, vector) pairs to be used as dummy embeddings.

    Each key is the running index of the pair as a string ("0", "1", ...),
    each vector is a fresh numpy.random.rand array with *dim* entries.
    """
    position = 0
    while position < n:
        key, vector = str(position), numpy.random.rand(dim)
        yield key, vector
        position += 1

# Conversion Functions

Since LevelDB only stores raw bytes, we can't save a NumPy array into the database directly; we will first convert it into a BLOB (a byte string).

In [4]:
def adapt_array(array):
    """
    Serialize a numpy array into a BLOB.

    numpy.save writes the binary representation of the array into an
    in-memory BytesIO buffer; the complete buffer content is returned as bytes.
    """
    buffer = io.BytesIO()
    numpy.save(buffer, array)

    return buffer.getvalue()

def convert_array(blob):
    """
    Deserialize a BLOB back into a numpy array.

    The bytes are wrapped in an in-memory BytesIO stream (positioned at its
    start) which numpy.load then reads the array from.
    """
    stream = io.BytesIO(blob)

    return numpy.load(stream)

In [5]:
# Open the LevelDB database stored in ./leveldb.embedding.db; the directory is
# created on first use because of create_if_missing=True.
connection = plyvel.DB('./leveldb.embedding.db', create_if_missing=True)

In [6]:
%%time
# Write the default 1000 dummy embeddings: each vector is serialized to a BLOB
# and stored under its encoded string key.
for key, emb in embeddings():
    arr = adapt_array(emb)
    connection.put(key.encode(), arr)

CPU times: user 159 ms, sys: 4.81 ms, total: 164 ms
Wall time: 163 ms


In [7]:
%%time
# Read every stored BLOB back and check that it deserializes to a numpy array.
# embeddings() is used here only to regenerate the keys "0".."999"; the random
# vectors it yields are discarded, so their generation is part of the timing.
for key, _ in embeddings():
    arr = connection.get(key.encode())
    emb = convert_array(arr)
    assert(type(emb) is numpy.ndarray)

CPU times: user 196 ms, sys: 0 ns, total: 196 ms
Wall time: 195 ms


In [8]:
# Close the database handle; the benchmark cell below deletes the database
# directory and opens a fresh connection for every run.
connection.close()

# Sample some data

To test the I/O we will write and read some data from the database. This may take a while.

In [9]:
# Benchmark: for every embedding count, rebuild the database from scratch and
# record the write time, the read time and the on-disk size.
write_times = []
read_times = []
db_sizes = []
counts = [500, 1000, 2000, 3000, 4000, 5000, 50000, 100000, 1000000, 10000000]

for c in counts:
    # Start every run from an empty database directory.
    shutil.rmtree('./leveldb.embedding.db', ignore_errors=True)
    connection = plyvel.DB('./leveldb.embedding.db', create_if_missing=True)

    # try/finally makes sure the handle is closed even if a run fails or is
    # interrupted; otherwise the open handle would be leaked and could keep
    # the database locked when the cell is executed again.
    try:
        # perf_counter() is the clock intended for measuring intervals; unlike
        # time.time() it is not affected by system clock adjustments.
        start_time_write = time.perf_counter()
        for key, emb in tqdm(embeddings(c), total=c):
            arr = adapt_array(emb)
            connection.put(key.encode(), arr)
        write_times.append(time.perf_counter() - start_time_write)

        # NOTE: embeddings(c) is used only to regenerate the keys; the vectors
        # it yields are discarded, but generating them is part of the measured
        # read time.
        start_time_read = time.perf_counter()
        for key, _ in embeddings(c):
            arr = connection.get(key.encode())
            emb = convert_array(arr)
            assert isinstance(emb, numpy.ndarray)
        read_times.append(time.perf_counter() - start_time_read)
    finally:
        connection.close()

    # The size is taken after the database has been closed. -k forces du to
    # report in 1024-byte units on every platform, matching the "Size in KB"
    # axis label of the size plot. The value is kept as a string.
    db_sizes.append(subprocess.check_output(['du', '-sk', './leveldb.embedding.db']).split()[0].decode('utf-8'))

print('DONE')

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10000000), HTML(value='')))


DONE


In [10]:
# save times for later plotting

# Create the output directory first so that the results of a long benchmark
# run are not lost to a FileNotFoundError when the folder does not exist yet.
os.makedirs('./collected_times', exist_ok=True)

with open('./collected_times/leveldb-write-times.pickle', 'wb') as f:
    pickle.dump(write_times, f)

with open('./collected_times/leveldb-read-times.pickle', 'wb') as f:
    pickle.dump(read_times, f)

# Results

In [11]:
# Line chart: measured write time (seconds) per embedding count.
plotly.offline.init_notebook_mode(connected=True)
trace = plotly.graph_objs.Scatter(y=write_times, x=counts, mode='lines+markers')
layout = plotly.graph_objs.Layout(
    title="LevelDB Write Times",
    yaxis={'title': 'Time in Seconds'},
    xaxis={'title': 'Embedding Count'},
)
data = [trace]
fig = plotly.graph_objs.Figure(data=data, layout=layout)
plotly.offline.iplot(fig, filename='jupyter-scatter-write')

In [12]:
# Line chart: measured read time (seconds) per embedding count.
plotly.offline.init_notebook_mode(connected=True)
trace = plotly.graph_objs.Scatter(y=read_times, x=counts, mode='lines+markers')
layout = plotly.graph_objs.Layout(
    title="LevelDB Read Times",
    yaxis={'title': 'Time in Seconds'},
    xaxis={'title': 'Embedding Count'},
)
data = [trace]
fig = plotly.graph_objs.Figure(data=data, layout=layout)
plotly.offline.iplot(fig, filename='jupyter-scatter-read')

In [13]:
# DB Size
# Bar chart: on-disk database size per embedding count.
plotly.offline.init_notebook_mode(connected=True)
trace = plotly.graph_objs.Bar(x=counts, y=db_sizes)
layout = plotly.graph_objs.Layout(
    title="Database Size",
    yaxis={'title': 'Size in KB'},
    xaxis={'title': 'Embedding Count'},
)
data = [trace]
fig = plotly.graph_objs.Figure(data=data, layout=layout)
plotly.offline.iplot(fig, filename='jupyter-bar-size')

In [14]:
# Show the raw sizes as reported by du (strings), one per entry in counts.
print(db_sizes)

['2092', '4188', '8360', '12528', '16700', '20872', '208588', '417184', '4179436', '41808260']
