In [1]:
from langame import LangameClient
import numpy as np
from autofaiss import build_index
import os
import shutil
import numpy as np
from autofaiss import build_index
from sentence_transformers import SentenceTransformer
import torch
from langame.quality import is_garbage
from torch.utils.data import DataLoader
from tqdm import tqdm

import os
# Build the Langame client from the YAML configuration one directory up.
config_path = "../config.yaml"
c = LangameClient(path_to_config_file=config_path)

In [None]:
# Fetch up to 2000 documents from the "memes" Firestore collection and keep
# only the ones `is_garbage` does not reject.
# NOTE(review): this reaches into a private attribute of LangameClient.
firestore = c._firestore_client
collection = firestore.collection("memes").limit(2000)
existing_conversation_starters = []

for e in collection.stream():
    # Convert the snapshot once; the same dict feeds the filter and the record.
    meme = e.to_dict()
    if is_garbage(meme):
        print(f"Skipping id: {e.id}, garbage")
        continue
    existing_conversation_starters.append({"id": e.id, **meme})
print(
    f"Got {len(existing_conversation_starters)} existing conversation starters"
)
print("Preparing embeddings for existing conversation starters")

sentence_embeddings_model_name = "sentence-transformers/LaBSE"
device = "cpu"

sentence_embeddings_model = SentenceTransformer(
    sentence_embeddings_model_name, device=device
)

# Report success only after the model has actually been constructed, so the
# message is never printed when loading fails.
print(f"Loaded sentence embedding model, device: {device}")

In [19]:
# Encode the "content" text of every conversation starter in fixed-size batches.
batch_size = 256
existing_conversation_starters_as_batch = [
    [e["content"] for e in existing_conversation_starters[i : i + batch_size]]
    for i in range(0, len(existing_conversation_starters), batch_size)
]

embeddings = []
# Passing `total` lets tqdm show a percentage and ETA instead of a bare
# counter, and the context manager closes the bar so re-running the cell does
# not leave a stale bar behind.
with tqdm(total=len(existing_conversation_starters_as_batch)) as bar:
    for batch in existing_conversation_starters_as_batch:
        emb = sentence_embeddings_model.encode(
            batch,
            show_progress_bar=False, device=device
        )

        # Append this batch's vectors, one entry per sentence.
        embeddings.extend(emb)
        bar.update(1)

# Stack the per-sentence vectors into a single array (one row per sentence).
embeddings = np.array(embeddings)
embeddings.shape

4it [04:00, 60.05s/it]
8it [00:23,  3.00s/it]

(1970, 768)

In [20]:
# Start from a clean slate: remove output left behind by a previous run.
for stale_dir in ("embeddings", "indexes"):
    if os.path.exists(stale_dir):
        shutil.rmtree(stale_dir)

print("Saving embeddings to disk and building index to disk")

# Write the embedding matrix into the directory that build_index is pointed at.
os.makedirs("embeddings", exist_ok=True)
np.save("embeddings/p1.npy", embeddings)

# Options handed to build_index: where the index file goes and the memory
# figures it is given.
build_options = {
    "index_path": "indexes/knn.index",
    "max_index_memory_usage": "6G",
    "current_memory_available": "7G",
}
# build_index returns a pair; only the first element (the index) is used here.
index, _ = build_index("embeddings", **build_options)

Saving embeddings to disk and building index to disk
Using 16 omp threads (processes), consider increasing --nb_cores if you have more
Launching the whole pipeline 01/31/2022, 14:22:17
There are 1970 embeddings of dim 768
	Compute estimated construction time of the index 01/31/2022, 14:22:17
		-> Train: 16.7 minutes
		-> Add: 0.0 seconds
		Total: 16.7 minutes
	>>> Finished "Compute estimated construction time of the index" in 0.0001 secs
	Checking that your have enough memory available to create the index 01/31/2022, 14:22:17
7.2MB of memory will be needed to build the index (more might be used if you have more)
	>>> Finished "Checking that your have enough memory available to create the index" in 0.0001 secs
	Selecting most promising index types given data characteristics 01/31/2022, 14:22:17
	>>> Finished "Selecting most promising index types given data characteristics" in 0.0000 secs
	Creating the index 01/31/2022, 14:22:17
		-> Instanciate the index HNSW15 01/31/2022, 14:22:17
		>>

100%|██████████| 1/1 [00:00<00:00, 467.59it/s]

		>>> Finished "-> Adding the vectors to the index" in 0.0489 secs
	>>> Finished "Creating the index" in 0.0506 secs
	Computing best hyperparameters 01/31/2022, 14:22:17





	>>> Finished "Computing best hyperparameters" in 13.3627 secs
The best hyperparameters are: efSearch=14840
	Compute fast metrics 01/31/2022, 14:22:30
1346
	>>> Finished "Compute fast metrics" in 10.0101 secs
	Saving the index on local disk 01/31/2022, 14:22:40
	>>> Finished "Saving the index on local disk" in 0.0036 secs
Recap:
{'99p_search_speed_ms': 30.27330986224115,
 'avg_search_speed_ms': 7.430445420677548,
 'compression ratio': 0.957607500296689,
 'index_key': 'HNSW15',
 'index_param': 'efSearch=14840',
 'nb vectors': 1970,
 'reconstruction error %': 0.0,
 'size in bytes': 6319750,
 'vectors dimension': 768}
>>> Finished "Launching the whole pipeline" in 23.4371 secs


In [24]:
# Embed the query text and look up its 20 nearest neighbours in the index.
query = sentence_embeddings_model.encode("intelligence", show_progress_bar=False)
# E holds the per-neighbour scores and I the matching row positions into
# `existing_conversation_starters`; both carry one row per query.
E, I = index.search(np.array([query]), 20)
# faiss reports a label of -1 when it finds fewer than k neighbours. Used as a
# Python index, -1 would silently return the last conversation starter, so
# those slots are dropped. The list is reversed so the first hit comes last.
list(reversed([
    (existing_conversation_starters[i]["content"], score)
    for i, score in zip(I[0], E[0])
    if i >= 0
]))

[('If you could have been told something 10 years ago, what would you want to know?',
  0.29374543),
 ('Does mysticism play a role in the study of science ?', 0.29662097),
 ('Is it a possibility that the state become governed by an artificial intelligence?',
  0.29806253),
 ('Is it a possibility that the state become governed by an artificial intelligence?',
  0.29806253),
 ('"Describe Artificial Intelligence to me..."', 0.29933268),
 ('Do you think technology will ever be able to "understand" (or model?) the human mind?',
  0.3021911),
 ('Will artificial intelligence utilize self-awareness, in the form of consciousness?',
  0.30300403),
 ('What is the mind?', 0.3038898),
 ('How does knowledge affect our perception of the world?', 0.30761823),
 ('What do you know now that you wish you knew when you were younger?',
  0.3080268),
 ('Are robots really "more" intelligent than us? According to your understanding.',
  0.30879834),
 ('How much does human intelligence actually matter?', 0.3090