In [None]:
from cassandra.cluster import Cluster
from cassandra.query import SimpleStatement

In [None]:
# Target table: it already exists, together with its vector index.
keyspace, table = 'demo', 'youtube_transcriptions'

# Connect to the locally-running C* (the build with the alpha vector indexing).
# Cluster() is created with no arguments, i.e. with the driver's defaults.
cluster = Cluster()
session = cluster.connect()

In [None]:
import openai
# Name of the OpenAI model, passed as `engine` when requesting embeddings.
embed_model = "text-embedding-ada-002"

def embedding_of_sentence(stc):
    """Compute the embedding of a single sentence through the OpenAI API.

    Args:
        stc: the sentence to embed; it is sent as a one-element input batch.

    Returns:
        The ``'embedding'`` entry of the first item in the response's
        ``'data'`` list.
    """
    response = openai.Embedding.create(input=[stc], engine=embed_model)
    first_item = response['data'][0]
    return first_item['embedding']

In [None]:
# Embed a sample question, then report the embedding's type and length.
input_sentence = 'What is the average number of epochs needed to train a fast-forward network?'
input_embedding = embedding_of_sentence(input_sentence)
embedding_type, embedding_size = type(input_embedding), len(input_embedding)
print(embedding_type, embedding_size)

In [None]:
# Statement used to look at the matches: the query vector and the row limit
# are supplied at execution time through the two %s placeholders.
search_cql = f"SELECT id, embedding, text FROM {keyspace}.{table} WHERE embedding ANN OF %s LIMIT %s"
searcher = SimpleStatement(search_cql)

In [None]:
# Run the ANN search for the first question and materialise the returned rows.
top_k = 40
q_vec = input_embedding
result_set = session.execute(searcher, (q_vec, top_k))
closests = list(result_set)

In [None]:
# Embed a second, unrelated question to use as an alternative query vector.
ie = embedding_of_sentence('How many eyes do spiders have?')
# Show only the length and a short preview instead of dumping the whole vector
# (assumes the embedding is a sliceable sequence such as a list).
len(ie), ie[:5]

In [None]:
# Search with the second question's embedding. q_vec must be the vector that
# is actually sent to the database: the norm / scalar-product checks further
# down compare the returned rows against q_vec, so it has to match the query.
top_k = 40
q_vec = ie
closests = list(session.execute(searcher, (q_vec, top_k)))

In [None]:
# let's check the scalar products for instance (norm, distances)
import numpy as np

def emb_scal(e1, e2):
    """Return the dot product of two embedding vectors.

    Both arguments are converted to float arrays on every call, which is
    simple but wasteful when the same vector is used repeatedly.
    """
    left = np.asarray(e1, dtype=float)
    right = np.asarray(e2, dtype=float)
    return left.dot(right)


def emb_scals(e1s, e2):
    """Return the dot product of every vector in ``e1s`` with ``e2``.

    ``e1s`` is a list of embedding vectors (themselves lists); they are
    converted to a single float array so that one product yields one value
    per vector.
    """
    matrix = np.asarray(e1s, dtype=float)
    vector = np.asarray(e2, dtype=float)
    return matrix.dot(vector)

In [None]:
# Squared norms: the query vector first, then every returned row.
print('Norms:')
query_norm_sq = emb_scal(q_vec, q_vec)
print('|query|^2 = %f' % query_norm_sq)
for i, r in enumerate(closests):
    row_norm_sq = emb_scal(r.embedding, r.embedding)
    print('|result[%i]|^2 = %f' % (i, row_norm_sq))

# Scalar product of the query vector with each returned row.
print('\nScalprods with query:')
for i, r in enumerate(closests):
    query_dot_row = emb_scal(q_vec, r.embedding)
    print('query . result[%i] = %f' % (i, query_dot_row))


In [None]:
# The ANN results come back in no particular order.
# It looks like the index fetches some candidates, calculates norms, sorts them
# and keeps the best ones if they are better than a threshold.

In [None]:
# Scalar products of the first three returned rows with the query vector,
# computed in one batched call.
first_three_embeddings = [row.embedding for row in closests[:3]]
list(emb_scals(first_three_embeddings, q_vec))