In [4]:
! pip install tqdm

Collecting tqdm
  Downloading tqdm-4.66.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tqdm-4.66.1-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.3/78.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.66.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [5]:
import math
import os
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
import pickle
import random
import math
import time
import pprint
from constants import DIST_METRIC_INDEX

In [6]:
def init_memmap_embs(
    embs_memory_loc: str, dataset_size: int, emd_size: int = 512, dtype: str = "float32"
) -> np.memmap:
    """
    Initializes a memory-mapped NumPy array to read embeddings of examples.

    Args:
        embs_memory_loc (str): Path to the memory-mapped file.
        dataset_size (int): Size of the dataset.
        emd_size (int): Dimensionality of the embeddings.
        dtype (str): Data type of the embeddings.

    Returns:
        np.memmap: A memory-mapped NumPy array.
    """
    embs = np.memmap(
        embs_memory_loc, dtype=dtype, mode="r", shape=(dataset_size, emd_size)
    )
    return embs

In [11]:
class SemDeDup():

    def __init__(self, args):
        self.args = args
        random.seed(args["seed"])

    def contains_duplicates(self, arr):
        return len(np.unique(arr)) != len(arr)

    def semdedup(self, cluster, cluster_reps, device):
        st = time.time()
        ## -- compute pairwise cos sim between cluster items, then replace to diagonal with zeros to ignore self similarity
        cluster_reps.to(device)
        pair_w_sim_matrix = cluster_reps @ (cluster_reps.T)
        del cluster_reps
        pair_w_sim_matrix.fill_diagonal_(0.0)
        assert pair_w_sim_matrix.shape[0] == pair_w_sim_matrix.shape[1]

        # TODO: what are you doung with image_urls?
        ## -- get paths to cluster i images
        image_urls = cluster[:, 0]

        ## -- make sure all the paths are unique this ensure that the duplicates are really stored many time times on memory
        assert not self.contains_duplicates(image_urls)

        ## -- We need upper tringular matrix because (1)we don't need to look at self sim (always=1) (2)we need the compinations not permutations
        triu_sim_mat = torch.triu(pair_w_sim_matrix, diagonal=1)

        ## -- if the max sim between one example and any other example is > 1-eps, remove this example
        M = torch.max(triu_sim_mat, dim=0)[0].cpu()
        print(f"Step time: {time.time()-st}(s)")

        return M

    def process_clusters(self, start_cluster: int, end_cluster: int):
        # print("SemDeDup params: ", self.args)
        st = time.time()

        embs = init_memmap_embs(
            self.args["embs_memory_loc"], self.args["dataset_size"], self.args["emd_size"]
        )

        step_time = []

        for cluster_id in tqdm(range(start_cluster, end_cluster)):
            step_st = time.time()

            df_file_loc = os.path.join(
                self.args["save_loc"], f"dataframes/cluster_{cluster_id}.pkl"
            )

            if os.path.exists(df_file_loc):  # and os.path.exists(dict_file_loc):
                print(f"{df_file_loc} exists, moving on")
                continue

            ## -- load cluster i representations
            cluster_i = np.load(
                os.path.join(
                    self.args["sorted_clusters_path"], f"cluster_{cluster_id}.npy"
                )
            )
            # 1) store cluster size
            cluster_size = cluster_i.shape[0]
            print("cluster_size: ", cluster_size)

            if cluster_size == 1:
                points_to_remove_df = pd.DataFrame()
                points_to_remove_df["indices"] = [0]
                for eps in self.args["eps_list"]:
                    ## We need to remove a point from the dataset when its pairwise similarity to other point is > 1-ebs
                    points_to_remove_df[f"eps={eps}"] = [False]
                if self.args["save_loc"] != "":
                    ## --save df
                    with open(df_file_loc, "wb") as file:
                        pickle.dump(points_to_remove_df, file)
                print("DONE cluster_id ", cluster_id)
                continue

            ## -- By default, we keep hard examples from groups
            clutser_items_indices = list(range(cluster_size))
            ## -- OR: shuffle cluster to keep random example from each group
            if self.args["which_to_keep"].lower() == "random":
                random.shuffle(clutser_items_indices)
                cluster_i = cluster_i[clutser_items_indices]
            ## -- OR: reverse cluster to keep easy examples
            if self.args["which_to_keep"].lower() == "easy":
                clutser_items_indices = clutser_items_indices[::-1]
                cluster_i = cluster_i[clutser_items_indices]

            ## -- indices for cluster items in the dataset
            cluster_ids = cluster_i[:, 1].astype("int32")
            cluster_reps = embs[cluster_ids]
            cluster_reps = torch.tensor(cluster_reps)

            M = self.semdedup(cluster_i, cluster_reps, self.args["device"])

            points_to_remove_df = pd.DataFrame()
            points_to_remove_df["indices"] = clutser_items_indices

            for eps in self.args["eps_list"]:
                ## -- 5) We need to remove a point from the dataset when its pairwise similarity to other point is > 1-ebs
                eps_points_to_remove = M > 1 - eps
                points_to_remove_df[f"eps={eps}"] = eps_points_to_remove

            if self.args["save_loc"] != "":
                ## --save df
                with open(df_file_loc, "wb") as file:
                    pickle.dump(points_to_remove_df, file)

            step_time.append(time.time() - step_st)
            print("DONE cluster: ", cluster_id)

        print(
            f"DONE in {((time.time()-st)/60):.2f} minutes, Average Step time {(sum(step_time)/len(step_time)):.2f}(s)"
        )
        return

    def call(self):
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(self.args)
        start_cluster = 0
        end_cluster = 19
        self.process_clusters(start_cluster, end_cluster)


In [12]:
args = {
    "seed": 5,
    "embs_memory_loc": "/workspace/CS762_Project/SemDeDup/code_alpaca_results/emb_mmap.dat",
    "dataset_size": 200,
    "emd_size": 1024,
    "save_loc": "/workspace/CS762_Project/SemDeDup/code_alpaca_results/save_location",
    "sorted_clusters_path": "/workspace/CS762_Project/SemDeDup/code_alpaca_results/sorted_clusters",
    "eps_list": [0.9, 0.95, 0.98],
    "which_to_keep": "random",
    "device": "cuda",
}

In [13]:
semdedup = SemDeDup(args)

In [14]:
semdedup.call()

{   'dataset_size': 200,
    'device': 'cuda',
    'embs_memory_loc': '/workspace/CS762_Project/SemDeDup/code_alpaca_results/emb_mmap.dat',
    'emd_size': 1024,
    'eps_list': [0.9, 0.95, 0.98],
    'save_loc': '/workspace/CS762_Project/SemDeDup/code_alpaca_results/save_location',
    'seed': 5,
    'sorted_clusters_path': '/workspace/CS762_Project/SemDeDup/code_alpaca_results/sorted_clusters',
    'which_to_keep': 'random'}


100%|██████████| 19/19 [00:00<00:00, 15010.69it/s]

/workspace/CS762_Project/SemDeDup/code_alpaca_results/save_location/dataframes/cluster_0.pkl exists, moving on
/workspace/CS762_Project/SemDeDup/code_alpaca_results/save_location/dataframes/cluster_1.pkl exists, moving on
/workspace/CS762_Project/SemDeDup/code_alpaca_results/save_location/dataframes/cluster_2.pkl exists, moving on
/workspace/CS762_Project/SemDeDup/code_alpaca_results/save_location/dataframes/cluster_3.pkl exists, moving on
/workspace/CS762_Project/SemDeDup/code_alpaca_results/save_location/dataframes/cluster_4.pkl exists, moving on
/workspace/CS762_Project/SemDeDup/code_alpaca_results/save_location/dataframes/cluster_5.pkl exists, moving on
/workspace/CS762_Project/SemDeDup/code_alpaca_results/save_location/dataframes/cluster_6.pkl exists, moving on
/workspace/CS762_Project/SemDeDup/code_alpaca_results/save_location/dataframes/cluster_7.pkl exists, moving on
/workspace/CS762_Project/SemDeDup/code_alpaca_results/save_location/dataframes/cluster_8.pkl exists, moving on
/




ZeroDivisionError: division by zero