# Word Embeddings in CouchDB

This example uses the official CouchDB Connector within Python3 to store and retrieve various amounts of Word Embeddings.

We will use a local CouchDB instance running on `localhost:5984`.

In [1]:
import os
import pickle
import time

import couchdb
import numpy
import plotly
from tqdm import tqdm_notebook as tqdm

# Dummy Embeddings

For testing purposes we will use randomly generated NumPy arrays as dummy embeddings.

In [2]:
def embeddings(n=1000, dim=300, seed=None):
    """
    Yield n tuples of random numpy arrays of *dim* length indexed by *n*

    Each tuple is ``(key, vector)`` where *key* is the running index as a
    string (``'0'``, ``'1'``, ...) and *vector* is a one-dimensional array of
    *dim* uniformly distributed random floats.

    :param int n: number of tuples to generate; ``0`` or less yields nothing
    :param int dim: length of every generated vector
    :param seed: optional seed; when given, the generated vectors are
        reproducible across calls. When ``None`` (the default) the global
        NumPy random state is used, exactly as before.
    :return: generator of ``(str, numpy.ndarray)`` tuples
    """
    # A private RandomState keeps seeded runs reproducible without touching
    # the global NumPy random state that other cells may rely on.
    rng = numpy.random if seed is None else numpy.random.RandomState(seed)
    for idx in range(n):
        yield (str(idx), rng.rand(dim))

# Conversion Functions

Since we can't save a NumPy array into the database directly, we will convert it into a Python list.
CouchDB stores documents as JSON and does not support BSON.

In [4]:
def adapt_array(array):
    """
    Convert a NumPy array into a plain (nested) python list.

    Any array-like input (NumPy array, list, tuple) is accepted; it is
    passed through ``numpy.asarray`` first, so NumPy arrays behave exactly
    as with a direct ``array.tolist()`` call.

    :param array: NumPy array or array-like to convert
    :return: the values as a python list of built-in python scalars
    :rtype: list
    """
    return numpy.asarray(array).tolist()


def convert_array(blob):
    """
    Rebuild a NumPy array from its python list representation.

    :param list blob: (nested) python list holding the array values
    :return: the values of *blob* as a NumPy array
    :rtype: numpy.ndarray
    """
    restored = numpy.array(blob)
    return restored

In [6]:
# Connection settings. The URI (including credentials) can be overridden with
# the COUCHDB_URI environment variable so real credentials never have to be
# stored in the notebook; the fallback is the local development default.
uri = os.environ.get('COUCHDB_URI', 'http://admin:admin@localhost:5984/')
dbname = 'embeddings'
couchserver = couchdb.Server(uri)

# Reuse the database when the server already has it, otherwise create it.
if dbname in couchserver:
    db = couchserver[dbname]
else:
    db = couchserver.create(dbname)

In [7]:
%%time
for key, emb in embeddings():
    arr = adapt_array(emb)
    obj = {'key': key, 'emb': arr}
    db[key] = obj

CPU times: user 3.91 s, sys: 253 ms, total: 4.16 s
Wall time: 1min 51s


In [8]:
%%time
for key, _ in embeddings():
    obj = db[key]
    emb = convert_array(obj['emb'])
    assert(type(emb) is numpy.ndarray)

CPU times: user 2 s, sys: 111 ms, total: 2.11 s
Wall time: 8.25 s


# Sample some data

To test the I/O we will write and read some data from the database. This may take a while.

In [11]:
# Total wall time (in seconds) per entry of `counts`, in the same order.
write_times = []
read_times = []
counts = [500, 1000, 2000, 3000, 4000, 5000]

for c in counts:
    # Start every run from an empty database. The existence check keeps the
    # cell runnable even when the database has not been created yet.
    if dbname in couchserver:
        del couchserver[dbname]
    db = couchserver.create(dbname)

    # perf_counter() is a monotonic high-resolution clock, so the measured
    # intervals are not distorted by system clock adjustments.
    start_time_write = time.perf_counter()
    for key, emb in tqdm(embeddings(c), total=c):
        arr = adapt_array(emb)
        obj = {'key': key, 'emb': arr}
        db[key] = obj
    write_times.append(time.perf_counter() - start_time_write)

    # embeddings(c) is only used as the source of keys for the read pass.
    start_time_read = time.perf_counter()
    for key, _ in embeddings(c):
        obj = db[key]
        emb = convert_array(obj['emb'])
        assert isinstance(emb, numpy.ndarray)
    read_times.append(time.perf_counter() - start_time_read)

print('DONE')

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))


DONE


In [12]:
# save times for later plotting

# open() does not create missing parent directories, so make sure the
# output directory exists before writing into it.
os.makedirs('./collected_times', exist_ok=True)

with open('./collected_times/couchdb-write-times.pickle', 'wb') as f:
    pickle.dump(write_times, f)

with open('./collected_times/couchdb-read-times.pickle', 'wb') as f:
    pickle.dump(read_times, f)

# Results

In [13]:
# Plot the measured write time against the number of embeddings written.
plotly.offline.init_notebook_mode(connected=True)
fig = plotly.graph_objs.Figure(
    data=[
        plotly.graph_objs.Scatter(x=counts, y=write_times, mode='lines+markers'),
    ],
    layout=plotly.graph_objs.Layout(
        title="CouchDB Write Times",
        xaxis={'title': 'Embedding Count'},
        yaxis={'title': 'Time in Seconds'},
    ),
)
plotly.offline.iplot(fig, filename='jupyter-scatter-write')

In [14]:
# Plot the measured read time against the number of embeddings read.
plotly.offline.init_notebook_mode(connected=True)
fig = plotly.graph_objs.Figure(
    data=[
        plotly.graph_objs.Scatter(x=counts, y=read_times, mode='lines+markers'),
    ],
    layout=plotly.graph_objs.Layout(
        title="CouchDB Read Times",
        xaxis={'title': 'Embedding Count'},
        yaxis={'title': 'Time in Seconds'},
    ),
)
plotly.offline.iplot(fig, filename='jupyter-scatter-read')