In [1]:
from langame.langame_client import LangameClient
from langame.arrays import get_prompt
import openai
import fire
from transformers import T5Tokenizer, T5ForConditionalGeneration
from langame.strings import string_similarity
import torch
import json
import datetime
import faiss
import glob
from sentence_transformers import SentenceTransformer
import numpy as np
import logging
from autofaiss.external.quantize import Quantizer
import os


In [35]:

def connected_components(neighbors):
    seen = set()

    def component(node):
        r = []
        nodes = set([node])
        while nodes:
            node = nodes.pop()
            seen.add(node)
            nodes |= set(neighbors[node]) - seen
            r.append(node)
        return r

    u = []
    for node in list(neighbors):
        if node not in seen:
            u.append(component(node))
    return u


def get_uniques(index, embeddings, max_duplicates_per_item=10, threshold=0.9):
    D, I = index.search(embeddings, k=max_duplicates_per_item)
    from collections import defaultdict

    same_mapping = defaultdict(list)

    for i, (d, r_i) in enumerate(zip(D, I)):
        for dd, rr in zip(d, r_i):
            if dd > threshold:
                same_mapping[int(i)].append(int(rr))

    groups = connected_components(same_mapping)
    uniques = []
    for g in groups:
        uniques.append(g[0])

    return uniques

In [32]:
in_file="../data/ice_breaker_2021_11_28.txt"
out_file="../data/ice_breaker_2021_11_28_dedup.txt"
assert isinstance(in_file, str), "out_file must be a string"
assert isinstance(out_file, str), "out_file must be a string"
assert out_file.endswith(".txt"), "out_file must be a .txt file"
logger = logging.getLogger("dedup")
logging.basicConfig(level=logging.INFO)
logger.warning(
    "Assuming the input dataset rows are of format [topics,] ### [sentence]"
)

# Add the date to the out file before the extension with format YYYY_MM_DD
out_file = out_file.replace(
    ".txt", f"_{datetime.datetime.now().strftime('%Y_%m_%d')}.txt"
)

logger.info(f"Deduplicating dataset from {in_file}, writing to {out_file}")

# Skip the meme generated under this similarity threshold
SIMILARITY_THRESHOLD = 0.8

sentence_embeddings_model_name = "sentence-transformers/LaBSE"
device = "cuda:0"

logger.info(f"Device: {device}")

sentence_embeddings_model = SentenceTransformer(sentence_embeddings_model_name).to(
    device
)

try:
    c = LangameClient(path_to_config_file="../config.yaml")
except:
    pass
existing_memes = []
for e in c._firestore_client.collection("memes").stream():
    existing_memes.append((e.id, e.to_dict()))

logger.info(f"Building embeddings for existing memes")

existing_memes_embeddings = np.array([
    sentence_embeddings_model.encode(e[1]["content"], show_progress_bar=False)
    for e in existing_memes
])

new_memes = []
new_memes_embeddings = []


with open(in_file, "r") as f:
    logger.info(f"Done, now building embeddings for new memes")
    for i, line in enumerate(f):
        if i % 1000 == 0:
            logger.info(f"Built embeddings for {i} new memes")
        line = line.strip()
        # Check that the format is correct
        # i.e. [topics,] ### [sentence]
        splitted = line.split("###")
        if line and len(splitted) == 2:
            topics, sentence = splitted
            new_memes.append(line)
            new_memes_embeddings.append(
                sentence_embeddings_model.encode(
                    sentence.strip(), show_progress_bar=False
                )
            )
new_memes_embeddings = np.array(new_memes_embeddings)

INFO:dedup:Deduplicating dataset from ../data/ice_breaker_2021_11_28.txt, writing to ../data/ice_breaker_2021_11_28_dedup_2021_12_03.txt
INFO:dedup:Device: cuda:0
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/LaBSE
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda
INFO:googleapiclient.discovery_cache:file_cache is only supported with oauth2client<4.0.0
INFO:dedup:Building embeddings for existing memes
INFO:dedup:Done, now building embeddings for new memes
INFO:dedup:Built embeddings for 0 new memes
INFO:dedup:Built embeddings for 1000 new memes
INFO:dedup:Built embeddings for 2000 new memes
INFO:dedup:Built embeddings for 3000 new memes
INFO:dedup:Built embeddings for 4000 new memes
INFO:dedup:Built embeddings for 5000 new memes
INFO:dedup:Built embeddings for 6000 new memes
INFO:dedup:Built embeddings for 7000 new memes
INFO:dedup:Built embeddings for 8000 new memes
INFO:dedup:Built embeddings for 9000 n

In [33]:
# Create the faiss indexes
existing_memes_embeddings_dir = "embeddings/existing_memes_embeddings"
new_memes_embeddings_dir = "embeddings/new_memes_embeddings"
existing_memes_index_dir = "embeddings/existing_memes_indexes"
new_memes_index_dir = "embeddings/new_memes_indexes"
os.makedirs(existing_memes_embeddings_dir, exist_ok=True)
os.makedirs(new_memes_embeddings_dir, exist_ok=True)
np.save(f"{existing_memes_embeddings_dir}/p1.npy", existing_memes_embeddings)
np.save(f"{new_memes_embeddings_dir}/p1.npy", new_memes_embeddings)
quantizer = Quantizer()
logger.info(f"Done, now building indexes")

quantizer.quantize(
    embeddings_path=existing_memes_embeddings_dir,
    output_path=existing_memes_index_dir,
    max_index_memory_usage="6G",
    current_memory_available="8G",
)
quantizer.quantize(
    embeddings_path=new_memes_embeddings_dir,
    output_path=new_memes_index_dir,
    max_index_memory_usage="6G",
    current_memory_available="8G",
)
existing_memes_index = faiss.read_index(
    glob.glob(f"{existing_memes_index_dir}/*.index")[0]
)
new_memes_index = faiss.read_index(glob.glob(f"{new_memes_index_dir}/*.index")[0])

INFO:dedup:Done, now building indexes


Using 16 omp threads (processes), consider increasing --nb_cores if you have more
Launching the whole pipeline 12/03/2021, 17:04:51
There are 707 embeddings of dim 768
	Compute estimated construction time of the index 12/03/2021, 17:04:51
		-> Train: 16.7 minutes
		-> Add: 0.0 seconds
		Total: 16.7 minutes
	>>> Finished "Compute estimated construction time of the index" in 0.0000 secs
	Checking that your have enough memory available to create the index 12/03/2021, 17:04:51
2.5MB of memory will be needed to build the index (more might be used if you have more)
	>>> Finished "Checking that your have enough memory available to create the index" in 0.0000 secs
	Selecting most promising index types given data characteristics 12/03/2021, 17:04:51
	>>> Finished "Selecting most promising index types given data characteristics" in 0.0000 secs
	Creating the index 12/03/2021, 17:04:51
		-> Instanciate the index Flat 12/03/2021, 17:04:51
		>>> Finished "-> Instanciate the index Flat" in 0.0011 sec

100%|██████████| 1/1 [00:00<00:00, 846.31it/s]

		>>> Finished "-> Adding the vectors to the index" in 0.0027 secs
	>>> Finished "Creating the index" in 0.0044 secs
	Computing best hyperparameters 12/03/2021, 17:04:51
	>>> Finished "Computing best hyperparameters" in 0.0000 secs
The best hyperparameters are: 
	Saving the index on local disk 12/03/2021, 17:04:51
	>>> Finished "Saving the index on local disk" in 0.0011 secs
	Compute fast metrics 12/03/2021, 17:04:51





2121
	>>> Finished "Compute fast metrics" in 1.4466 secs
Recap:
{'99p_search_speed_ms': 10.506928399718166,
 'avg_search_speed_ms': 0.6796234412983204,
 'compression ratio': 0.9999792812814665,
 'nb vectors': 707,
 'reconstruction error %': 0.0,
 'size in bytes': 2171949,
 'vectors dimension': 768}
>>> Finished "Launching the whole pipeline" in 1.4555 secs
Using 16 omp threads (processes), consider increasing --nb_cores if you have more
Launching the whole pipeline 12/03/2021, 17:04:52
There are 14444 embeddings of dim 768
	Compute estimated construction time of the index 12/03/2021, 17:04:52
		-> Train: 16.7 minutes
		-> Add: 0.1 seconds
		Total: 16.7 minutes
	>>> Finished "Compute estimated construction time of the index" in 0.0001 secs
	Checking that your have enough memory available to create the index 12/03/2021, 17:04:52
55.0MB of memory will be needed to build the index (more might be used if you have more)
	>>> Finished "Checking that your have enough memory available to create

100%|██████████| 1/1 [00:00<00:00, 114.06it/s]


		>>> Finished "-> Adding the vectors to the index" in 0.8944 secs
	>>> Finished "Creating the index" in 0.8960 secs
	Computing best hyperparameters 12/03/2021, 17:04:53
	>>> Finished "Computing best hyperparameters" in 16.2953 secs
The best hyperparameters are: efSearch=1296
	Saving the index on local disk 12/03/2021, 17:05:10
	>>> Finished "Saving the index on local disk" in 0.0212 secs
	Compute fast metrics 12/03/2021, 17:05:10
931
	>>> Finished "Compute fast metrics" in 10.0451 secs
Recap:
{'99p_search_speed_ms': 25.904212699924784,
 'avg_search_speed_ms': 10.749063450054127,
 'compression ratio': 0.9186130828799025,
 'nb vectors': 14444,
 'reconstruction error %': 0.0,
 'size in bytes': 48303218,
 'vectors dimension': 768}
>>> Finished "Launching the whole pipeline" in 27.2628 secs


In [36]:
logger.info("Done, now filtering duplicates")

# Get the new uniques
new_uniques = get_uniques(
    new_memes_index,
    new_memes_embeddings,
    max_duplicates_per_item=10,
    threshold=SIMILARITY_THRESHOLD,
)
new_uniques_embeddings = new_memes_embeddings[new_uniques]
# Get the new NEW uniques
new_new_uniques = get_uniques(
    existing_memes_index,
    new_uniques_embeddings,
    max_duplicates_per_item=10,
    threshold=SIMILARITY_THRESHOLD,
)

INFO:dedup:Done, now filtering duplicates


In [37]:
new_new_uniques

[6,
 12,
 13,
 14,
 17,
 21,
 25,
 32,
 34,
 40,
 47,
 53,
 56,
 58,
 62,
 69,
 71,
 89,
 97,
 104,
 108,
 110,
 111,
 118,
 135,
 136,
 138,
 146,
 147,
 148,
 179,
 182,
 190,
 196,
 198,
 201,
 216,
 225,
 248,
 250,
 252,
 259,
 274,
 279,
 287,
 288,
 296,
 300,
 311,
 314,
 318,
 345,
 347,
 359,
 361,
 368,
 369,
 372,
 374,
 384,
 419,
 421,
 438,
 441,
 448,
 468,
 480,
 484,
 496,
 502,
 504,
 506,
 513,
 518,
 524,
 533,
 539,
 547,
 554,
 565,
 570,
 591,
 599,
 600,
 602,
 614,
 617,
 623,
 631,
 633,
 654,
 655,
 669,
 694,
 696,
 715,
 731,
 737,
 741,
 751,
 753,
 757,
 761,
 762,
 763,
 776,
 782,
 783,
 798,
 807,
 808,
 814,
 837,
 843,
 848,
 852,
 855,
 861,
 862,
 863,
 866,
 867,
 871,
 872,
 873,
 895,
 906,
 910,
 912,
 916,
 918,
 920,
 931,
 954,
 958,
 967,
 977,
 985,
 990,
 1038,
 1040,
 1055,
 1068,
 1069,
 1075,
 1082,
 1083,
 1137,
 1143,
 1147,
 1150,
 1162,
 1166,
 1174,
 1178,
 1180,
 1182,
 1195,
 1196,
 1204,
 1205,
 1210,
 1230,
 1231,
 1244,
 125

In [10]:
indexes = new_memes_index.search(sentence_embeddings_model.encode(["What is the purpose of life"], show_progress_bar=False), k=10)[1]
for e in indexes[0]:
    print(new_memes[e])

philosophy,ice breaker ### What is the meaning of life?
philosophy,ice breaker ### What is the meaning of life?
philosophy ### What is the meaning of life ?
philosophy,ice breaker ### What is the meaning of life?
life,philosophy,mind ### What is the meaning of life?
philosophy,ice breaker ### What is the meaning of life?
philosophy,ice breaker ### What is the meaning of life?
life,ice breaker ### What is your life goal?
philosophy,ice breaker ### What are the meaning of life?
ice breaker ### What is a life goal of yours?


In [39]:
# Write to out file
with open(out_file, "w") as f:
    logger.info(f"Done, now writing to {out_file}")
    for new_unique in new_new_uniques:
        f.write(new_memes[new_unique] + "\n")
logger.info(f"Deduplicated {len(new_memes)} into {len(new_new_uniques)} new memes, wrote to {out_file}")

INFO:dedup:Done, now writing to ../data/ice_breaker_2021_11_28_dedup_2021_12_03.txt
INFO:dedup:Deduplicated 14444 into 883 new memes, wrote to ../data/ice_breaker_2021_11_28_dedup_2021_12_03.txt
