In [None]:
from huggingface_hub import hf_hub_download, snapshot_download
from tokenizers import Tokenizer
import ctranslate2
import numpy as np

In [None]:
# Fetch tokenizer.json from the Hugging Face Hub and build the tokenizer from it
tok_config = hf_hub_download(
    repo_id="jncraton/gte-small-ct2-int8",
    filename="tokenizer.json",
)
tokenizer = Tokenizer.from_file(tok_config)
# Turn padding off: each encoding then holds only the ids of its own text
tokenizer.no_padding()

In [None]:
# Tokenize sample text.
# `tokens` stays a notebook-level variable: the forward-pass cell further down
# reads `tokens.ids` from it.
tokens = tokenizer.encode("Hello, world")
# Bare last expression so the notebook displays the token ids.
tokens.ids

In [None]:
# Fetch the model snapshot from the Hugging Face Hub (one download worker)
# and load it as a CTranslate2 encoder running on CPU with int8 compute
path = snapshot_download(
    repo_id="jncraton/gte-small-ct2-int8",
    max_workers=1,
)
encoder = ctranslate2.Encoder(path, device="cpu", compute_type="int8")

In [None]:
# Run a forward pass to compute last_hidden_state for our tokens.
# `forward_batch` is handed a list of id sequences, so the single sample is
# wrapped in a one-element batch.
outputs = encoder.forward_batch([tokens.ids])
# Convert to a NumPy array to inspect its shape
# (presumably (batch, tokens, hidden) — TODO confirm against the CTranslate2 docs).
np.array(outputs.last_hidden_state).shape

In [None]:
def embed(docs):
    """Return one unit-length embedding per document.

    Each document is tokenized with the notebook-level ``tokenizer``, all
    documents are run through the notebook-level ``encoder`` in a single
    batch, and every document's hidden states are mean-pooled over its own
    tokens and then L2-normalized.

    Args:
        docs: Iterable of strings to embed.

    Returns:
        A list of 1-D NumPy arrays, one per document, in input order.
        An empty input yields an empty list without running the encoder.
    """
    tokens = [tokenizer.encode(doc).ids for doc in docs]

    if not tokens:
        # Nothing to encode; skip the forward pass entirely.
        return []

    def mean_pool(last_hidden_state):
        # Average over the token axis, then scale to unit length so that a
        # plain dot product between two embeddings is their cosine similarity.
        embedding = np.mean(last_hidden_state, axis=0)
        embedding = embedding / np.linalg.norm(embedding)
        return embedding

    outputs = encoder.forward_batch(tokens)
    hidden_states = np.array(outputs.last_hidden_state)

    # The tokenizer runs without padding, so documents in one batch generally
    # differ in length, while the batch output is a single rectangular array.
    # Rows past a document's own token count do not belong to that document,
    # so they are cut off before pooling instead of being averaged in.
    embeddings = [
        mean_pool(lhs[:len(ids)]) for ids, lhs in zip(tokens, hidden_states)
    ]

    return embeddings

# Smoke test: display the shape of the embedding vector produced for one document.
embed(["Hello, world"])[0].shape

In [None]:
def search(query, embeddings):
    """Rank document embeddings by similarity to a query string.

    The query is embedded with ``embed`` and scored against every entry of
    ``embeddings`` with a dot product. Because ``embed`` returns unit-length
    vectors, the score is the cosine similarity when ``embeddings`` also came
    from ``embed``.

    Args:
        query: Text to search for.
        embeddings: Sequence of document embedding vectors.

    Returns:
        A list of ``(index, score)`` pairs, where ``index`` is the position
        in ``embeddings``, sorted from highest to lowest score. An empty
        ``embeddings`` yields an empty list.
    """
    if len(embeddings) == 0:
        # np.dot would raise a shape-mismatch error on an empty corpus, and
        # there is nothing to rank, so skip embedding the query as well.
        return []

    query_embedding = embed([query])[0]

    scores = np.dot(embeddings, query_embedding)

    # sorted() is stable, so equal scores keep their original relative order.
    return sorted(enumerate(scores),
                  key=lambda pair: pair[1], reverse=True)


# Embed a tiny demo corpus
embeddings = embed([
    "Hello friend!",
    "Hello",
    "Get lost",
    "The capital of France is Paris",
])

# Rank the corpus against a query; the cell shows (index, score) pairs, best match first
search(query="A nice greeting", embeddings=embeddings)