# Experiments in using embeddinghub to index embeddings

tl;dr: nmslib does not currently support [incremental
updates](https://github.com/nmslib/nmslib/issues/73), which makes it difficult
to use for my scenarios, so I'm now investigating
[embeddinghub](https://github.com/featureform/embeddinghub) instead.

In [1]:
import embeddinghub as eh

from sentence_transformers import SentenceTransformer, util

In [2]:
# Sentence-transformer used for every encode() call in this notebook.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initial knowledge base: one short sentence per entry.
corpus = [
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'The girl is carrying a baby.',
    'A man is riding a horse.',
    'A woman is playing violin.',
    'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'A cheetah is running behind its prey.',
]

# Encode the whole corpus in one call, as a tensor, and move the result to the CPU.
embeddings = model.encode(
    corpus,
    convert_to_tensor=True,
).cpu()

### Add embeddings to a local embeddinghub instance

In [3]:
# Connect to the embeddinghub instance on port 7462.
hub = eh.connect(eh.Config(host="0.0.0.0", port=7462))

# Size the space from the encoded corpus instead of hard-coding 384, so that
# swapping the sentence-transformer model cannot leave the space with the
# wrong dimensionality. `embeddings` has one row per corpus sentence.
space = hub.create_space("kb", embeddings.shape[1])

# Each sentence is its own key; the stored value is its embedding converted
# to a plain Python list.
for doc, embedding in zip(corpus, embeddings):
    space.set(doc, embedding.tolist())

### Query embeddinghub

In [None]:
# Probe sentences to look up in the space.
queries = [
    'A man is eating pasta.',
    'Someone in a gorilla costume is playing a set of drums.',
    'A cheetah chases prey on across a field.',
]

for query in queries:
    # Encode the query as a tensor and move it to the CPU.
    query_embedding = model.encode(query, convert_to_tensor=True).cpu()
    # Build the protobuf Embedding message directly from the vector's values.
    e = eh.embedding_store_pb2.Embedding(values=query_embedding.tolist())
    # Request 5 neighbours of the query vector.
    results = space.nearest_neighbors(5, vector=e)
    print(f"QUERY: {query}")
    for result in results:
        print(result)

### Add new entries to corpus

In [None]:
# Extra sentences added after the space has already been populated, to
# exercise incremental updates.
new_corpus_entries = [
    "A man is eating spaghetti",
    "A monkey is playing timpani",
]
# Extends the existing corpus list in place.
corpus += new_corpus_entries

# Move the encoded tensor to the CPU before converting rows to lists, matching
# how every other encode() call in this notebook is handled.
incremental_embeddings = model.encode(
    new_corpus_entries, convert_to_tensor=True
).cpu()

# Only the new sentences are written; each one is keyed by its own text.
for doc, embedding in zip(new_corpus_entries, incremental_embeddings):
    space.set(doc, embedding.tolist())


In [None]:
# Run the same probe sentences again now that new entries have been set.
queries = [
    'A man is eating pasta.',
    'Someone in a gorilla costume is playing a set of drums.',
    'A cheetah chases prey on across a field.',
]

for query in queries:
    # Encode the query as a tensor and move it to the CPU.
    query_embedding = model.encode(query, convert_to_tensor=True).cpu()
    # Start from an empty protobuf Embedding message and fill in its values.
    e = eh.embedding_store_pb2.Embedding()
    e.values.extend(query_embedding.tolist())
    # Request 5 neighbours of the query vector.
    results = space.nearest_neighbors(5, vector=e)
    print(f"QUERY: {query}")
    for result in results:
        print(result)

In [4]:
# Open a connection to the embeddinghub instance on port 7462 and rebind `hub` to it.
hub = eh.connect(eh.Config(host="0.0.0.0", port=7462))

AttributeError: 'EmbeddingHub' object has no attribute 'save'

In [None]:
# Fetch the "kb" space by name instead of creating it, and rebind `space` to it.
# NOTE(review): presumably this checks that the space is still available after
# reconnecting — confirm intent.
space = hub.get_space("kb")
