In [10]:
import logging
import yaml
import argparse
import torch
import pickle as pkl
import numpy as np
from typing import List, Dict, Any
from uuid import uuid4
import os

# NOTE(review): hard-coded absolute, machine-specific path. The relative data and
# model paths used later in this notebook are resolved against this directory;
# presumably the `sms` package imports below rely on it as well — confirm.
os.chdir(r"C:\Users\cunn2\OneDrive\DSML\Project\thesis-repo")

from sms.src.log import configure_logging
from sms.src.vector_search.evaluate_top_k import create_augmented_data, build_model, create_embedding_dict, embeddings_to_faiss_index, evaluate_top_k

from pydantic import BaseModel
from sms.exp1.config_classes import LaunchPlanConfig, load_config_from_launchplan
from sms.exp1.run_evaluation import run_evaluation, ModelEvalConfig

# Notebook-level logger; configure_logging is asked for INFO on the console.
logger = logging.getLogger(__name__)
configure_logging(console_level=logging.INFO)

# Load the exp1 validation data. The path is relative to the working directory
# set by os.chdir above and uses Windows-style separators.
# NOTE(review): torch.load unpickles the file, so only load trusted data.
data = torch.load(r"data\exp1\val_data.pt")
# Give every item a fresh random UUID string as its key. The ids are regenerated
# on every run, so they are not stable across kernel sessions.
data_ids = [str(uuid4()) for _ in range(len(data))]
data_dict = dict(zip(data_ids, data))

class IndexConfig(BaseModel):
    """Describes which index to build and the arguments to build it with.

    The dumped fields are passed to `embeddings_to_faiss_index` as keyword
    arguments (see `run_evaluation`).
    """
    # Name of the index type, e.g. "IndexLSH" or "IndexFlatL2".
    index_type: str
    # Positional arguments for the index — presumably forwarded to the FAISS
    # constructor; confirm in embeddings_to_faiss_index.
    index_args: List[Any] = []
    # Keyword arguments for the index (same caveat as index_args).
    index_kwargs: Dict[str, Any] = {}

# Launch plan of the transformer_rel_1 run, loaded from its original YAML file.
trans_rel_lp_cfg = load_config_from_launchplan(r"sms\exp1\runs\transformer_rel_1\original_launchplan.yaml")

# Evaluation config for that run: mod_path points at the pretrain checkpoint and
# path_type='full' makes run_evaluation pass it to build_model as full_model_path.
trans_rel_1_full = ModelEvalConfig(
    name="trans_rel_1_full",
    lp_config=trans_rel_lp_cfg,
    mod_path=r"sms\exp1\runs\transformer_rel_1\pretrain_saved_model.pth",
    path_type='full',
    use_full_model=True
)

# Embedding dimension as declared in the launch plan (dims.d_projected).
dim = trans_rel_lp_cfg.model_dump()['dims']['d_projected']

# LSH index config; the second argument (32) is presumably nbits, matching the
# [dim, nbits] argument order used for IndexLSH later in this notebook.
idx_cfg = IndexConfig(index_type="IndexLSH", index_args=[dim, 32])

def run_evaluation(
    data_dict: Dict[str, np.ndarray],
    num_loops: int,
    model_configs: List[ModelEvalConfig],
    index_config: IndexConfig
    ) -> Dict[str, Dict[str, Dict[str, Dict[str, List[float]]]]]:
    """
    Evaluate top-K retrieval for every model config using a single index config.

    Note that this definition rebinds the name `run_evaluation`, which this cell
    also imports from `sms.exp1.run_evaluation`.

    Args:
        data_dict: dictionary of data keyed by id.
        num_loops: how many anchor keys are drawn (without replacement) for augmentation.
        model_configs: list of model configurations to evaluate.
        index_config: index type and arguments passed on to `embeddings_to_faiss_index`.

    Returns:
        Dictionary mapping each model config name to the output of `evaluate_top_k`.
    """
    # draw the anchors once; every model is evaluated on the same augmentations
    candidate_keys = list(data_dict.keys())
    anchors = np.random.choice(candidate_keys, size=num_loops, replace=False)
    augmented_data = create_augmented_data(data_dict, anchors)

    scores = {}
    for eval_config in model_configs:
        logger.info(f"Running evaluation for {eval_config.name}")

        lp_dump = eval_config.lp_config.model_dump()

        # the checkpoint is handed over under a different keyword depending on path_type
        if eval_config.path_type == 'full':
            weights_kwarg = {'full_model_path': eval_config.mod_path}
        else:
            weights_kwarg = {'encoder_path': eval_config.mod_path}
        model = build_model(lp_dump, **weights_kwarg, use_full_model=eval_config.use_full_model)

        embeddings_dict = create_embedding_dict(data_dict, lp_dump, model)
        logger.info(f"Created embedding dictionary for {len(embeddings_dict)} keys.")

        # one embedding dict per entry of augmented_data, keyed the same way
        augmented_embeddings = {
            source_id: create_embedding_dict(variants, lp_dump, model)
            for source_id, variants in augmented_data.items()
        }
        logger.info("Created augmented embeddings.")

        faiss_index = embeddings_to_faiss_index(embeddings_dict=embeddings_dict, **index_config.model_dump())
        logger.info("Created FAISS index.")

        scores[eval_config.name] = evaluate_top_k(
            embeddings_dict, augmented_embeddings, [1, 3, 5, 10, 25, 50, 100], faiss_index
        )
        logger.info("Evaluated top K.")
    return scores

# results = run_evaluation(data_dict, 100, [trans_rel_1_full], idx_cfg)


  data = torch.load(r"data\exp1\val_data.pt")


# Testing searches on the indices we want to use

Index types to test: IVF, PQ, HNSW, LSH

In [11]:
class IndexConfig(BaseModel):
    """Describes which index to build and the arguments to build it with.

    Declares the same fields as the IndexConfig in the previous code cell;
    running this cell rebinds the name.
    """
    # Name of the index type, e.g. "IndexFlatL2".
    index_type: str
    # Positional arguments for the index — presumably forwarded to the FAISS
    # constructor; confirm in embeddings_to_faiss_index.
    index_args: List[Any] = []
    # Keyword arguments for the index (same caveat as index_args).
    index_kwargs: Dict[str, Any] = {}

# Same launch plan and evaluation config as built in the previous code cell.
trans_rel_lp_cfg = load_config_from_launchplan(r"sms\exp1\runs\transformer_rel_1\original_launchplan.yaml")

trans_rel_1_full = ModelEvalConfig(
    name="trans_rel_1_full",
    lp_config=trans_rel_lp_cfg,
    mod_path=r"sms\exp1\runs\transformer_rel_1\pretrain_saved_model.pth",
    path_type='full',
    use_full_model=True
)

# Build the model from the checkpoint (the path literal repeats trans_rel_1_full.mod_path)
# and embed every item of data_dict.
model = build_model(trans_rel_lp_cfg.model_dump(), full_model_path=r"sms\exp1\runs\transformer_rel_1\pretrain_saved_model.pth", use_full_model=trans_rel_1_full.use_full_model)
embeddings_dict = create_embedding_dict(data_dict, trans_rel_lp_cfg.model_dump(), model)
# `dim` is not defined in this cell: it is the kernel global first assigned from
# the launch plan's d_projected in the previous code cell.
idx = embeddings_to_faiss_index(embeddings_dict, index_type="IndexFlatL2", index_args=[dim])


In [12]:
# Re-derive the embedding dimension from the data itself: size of the first axis
# of the first stored embedding (assumes embeddings are 1-D vectors — confirm).
# This overwrites the `dim` previously read from the launch plan.
dim = list(embeddings_dict.values())[0].shape[0]
print(dim)

64


In [1]:
import logging
import os
from typing import Dict, List, Optional, Tuple

import numpy as np
import torch

# Notebook-level logger used by select_keys. No logging configuration is done in
# this cell, so INFO messages may not be displayed.
logger = logging.getLogger(__name__)

# NOTE(review): hard-coded absolute, machine-specific path; the relative data
# path loaded below is resolved against it.
os.chdir(r"C:\Users\cunn2\OneDrive\DSML\Project\thesis-repo")

def select_keys(
    data_dict: Dict[str, np.ndarray],
    shuffle: bool = True,
    selection_plan: Optional[List[Tuple[Optional[int], int, str]]] = None,
) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
    """
    Partition the dataset keys into per-size subsets and pick augmentation keys from each.

    The keys are copied into a list, optionally shuffled with NumPy's global RNG,
    and then consumed front to back according to `selection_plan`. Each plan entry
    is a `(size, count, label)` tuple:

    - `size` is an int: the next `size` unused keys become the subset for `label`
      and the first `count` of them are selected. If the keys are exhausted once
      this subset has been taken, a warning is logged and the remaining plan
      entries are skipped; that last subset may hold fewer than `size` keys.
    - `size` is None: the subset for `label` is the full key list and the next
      `count` unused keys are selected.

    Args:
        data_dict: Dataset keyed by id; only the keys are used.
        shuffle: Whether to shuffle the keys before partitioning.
        selection_plan: Plan entries as described above. When omitted, the
            5k / 10k / 100k / 500k / 1m plan is used.

    Returns:
        Two dictionaries:
        - subset_keys: Maps dataset sizes to their subset of keys.
        - selected_keys: Maps dataset sizes to their selected keys for augmentation.
        Only labels that were actually processed appear in the dictionaries.
    """
    keys = list(data_dict.keys())
    if shuffle:
        np.random.shuffle(keys)
        logger.info("Shuffled the dataset keys.")

    if selection_plan is None:
        # Default plan: (subset size, number of keys to select, label).
        selection_plan = [
            (5000, 100, '5k'),
            (10000, 200, '10k'),
            (100000, 1000, '100k'),
            (500000, 2000, '500k'),
            (None, 10000, '1m')
        ]

    subset_keys = {}
    selected_keys = {}
    start = 0
    total_keys = len(keys)

    for size, count, label in selection_plan:
        if size is not None:
            end = start + size
            subset = keys[start:end]
            subset_keys[label] = subset
            selected = subset[:count]
            selected_keys[label] = selected
            print(f"Selected {len(selected)} keys for dataset size {label} from subset of {len(subset)} keys.")
            start = end
            if start >= total_keys:
                # Nothing is left for the following plan entries.
                logger.warning(f"Reached the end of the dataset at size {label}.")
                break
        else:
            # The subset is the whole dataset; the selection comes from the keys
            # not yet consumed by the sized entries.
            selected = keys[start:start + count]
            selected_keys[label] = selected
            subset_keys[label] = keys
            logger.info(f"Selected {len(selected)} keys for dataset size {label} from the entire remaining dataset.")

    return subset_keys, selected_keys

# Load the exp2 dataset; later cells use it as a mapping (ds.values(), select_keys(ds)).
# NOTE(review): torch.load unpickles the file, so only load trusted data.
ds = torch.load(r"data/exp2/million_chunks.pt")

  ds = torch.load(r"data/exp2/million_chunks.pt")


In [2]:
# Materialise the dataset values as a list for the length inspection below.
vals = list(ds.values())

In [5]:
# len() of every item, stored as an ndarray so it can be compared element-wise.
lengths = np.array([len(val) for val in vals])

In [7]:
# Number of items with fewer than 3 elements. Counted in NumPy rather than with
# the builtin sum (which iterates the boolean array one element at a time);
# np.asarray keeps the cell working even if `lengths` has been rebound to a list.
np.count_nonzero(np.asarray(lengths) < 3)

18639

In [None]:
lengths 

In [13]:
import numpy as np
import matplotlib.pyplot as plt

# Lengths of every item in vals. Kept as an ndarray (not a plain list) so the
# name `lengths` keeps the same type as in the earlier cell and vectorised
# comparisons such as `lengths < 3` still work after this cell has run.
lengths = np.array([len(item) for item in vals])

# Basic statistics
min_length = np.min(lengths)
max_length = np.max(lengths)
mean_length = np.mean(lengths)
median_length = np.median(lengths)

# Histogram drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(lengths, bins=50, edgecolor='black')
ax.set_title('Distribution of Lengths in vals')
ax.set_xlabel('Length')
ax.set_ylabel('Frequency')

# Vertical reference lines for mean and median
ax.axvline(mean_length, color='r', linestyle='dashed', linewidth=2, label=f'Mean: {mean_length:.2f}')
ax.axvline(median_length, color='g', linestyle='dashed', linewidth=2, label=f'Median: {median_length:.2f}')

ax.legend()

# Text box with the statistics, anchored to the top-right corner in axes coordinates
stats_text = f'Min: {min_length}\nMax: {max_length}\nMean: {mean_length:.2f}\nMedian: {median_length:.2f}'
ax.text(0.95, 0.95, stats_text, transform=ax.transAxes, verticalalignment='top', horizontalalignment='right', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

plt.show()

print(f"Minimum length: {min_length}")
print(f"Maximum length: {max_length}")
print(f"Mean length: {mean_length:.2f}")
print(f"Median length: {median_length:.2f}")


: 

: 

In [10]:
# a = subset_keys, b = selected_keys (the order select_keys returns them in).
# shuffle defaults to True and no RNG seed is set in the visible cells, so the
# split can differ between runs.
a, b = select_keys(ds)

Selected 100 keys for dataset size 5k from subset of 5000 keys.
Selected 200 keys for dataset size 10k from subset of 10000 keys.
Selected 1000 keys for dataset size 100k from subset of 100000 keys.
Selected 2000 keys for dataset size 500k from subset of 500000 keys.


In [8]:
# Sanity check: number of keys selected for the '5k' split.
len(b['5k'])

100

In [13]:
import faiss
# Build one index of each type over the same embeddings. The index_args lists
# are presumably forwarded to the matching faiss constructor — confirm in
# embeddings_to_faiss_index.

# Flat (exact L2 search)
idx_flat = embeddings_to_faiss_index(embeddings_dict, index_type="IndexFlatL2", index_args=[dim])
# IVF: flat L2 quantizer, with nlist set to the square root of the number of
# embeddings. np.sqrt returns a float, hence the int() casts in index_args.
quantizer = faiss.IndexFlatL2(dim)
nlist = np.sqrt(len(embeddings_dict))
idx_IVF = embeddings_to_faiss_index(embeddings_dict, index_type="IndexIVFFlat", index_args=[quantizer, int(dim), int(nlist)])
# PQ (can't do radius search)
M = 8
nbits = 8
idx_PQ = embeddings_to_faiss_index(embeddings_dict, index_type="IndexPQ", index_args=[dim, M, nbits]) 
# HNSW — note that M is rebound here and means something different from the PQ M above
M = 32
idx_HNSW = embeddings_to_faiss_index(embeddings_dict, index_type="IndexHNSWFlat", index_args=[dim, M])
# LSH (can't do radius search) — nbits is rebound here as well
nbits = 64
idx_LSH = embeddings_to_faiss_index(embeddings_dict, index_type="IndexLSH", index_args=[dim, nbits])


In [20]:
# Pick the LSH index for the inspection below (rebinds `idx`, which an earlier
# cell bound to an IndexFlatL2 index).
idx = idx_LSH

In [21]:
# Number of vectors stored in the index — assumes `.index` exposes the
# underlying faiss index; confirm against embeddings_to_faiss_index's return type.
idx.index.ntotal

3658

In [16]:
# Compare two stored embeddings: the first one and the one at position 5
# (i2 is the key that belongs to v2).
v1 = list(embeddings_dict.values())[0]
v2 = list(embeddings_dict.values())[5]
i2 = list(embeddings_dict.keys())[5]

# Norm of the difference (np.linalg.norm with default arguments), i.e. the
# distance between the two embeddings.
diff = np.linalg.norm(v1 - v2)
print(diff)

0.80692863


# exp2 code

In [None]:
import logging
import yaml
import argparse
import torch
import pickle as pkl
import numpy as np
from typing import List, Dict, Any

from sms.src.log import configure_logging
from sms.src.vector_search.evaluate_top_k import create_augmented_data, build_model, create_embedding_dict, embeddings_to_faiss_index, evaluate_top_k

from pydantic import BaseModel
from sms.exp1.config_classes import LaunchPlanConfig

# Module-level logger; configure_logging is called with its default settings here
# (the earlier notebook cell passed console_level=logging.INFO explicitly).
logger = logging.getLogger(__name__)
configure_logging()

class ModelEvalConfig(BaseModel):
    """Configuration of one model to evaluate: its name, launch plan and checkpoint location."""
    # Key under which this model's results are stored by run_evaluation.
    name: str
    # Launch plan config; run_evaluation dumps it to a dict before passing it to
    # build_model and create_embedding_dict.
    lp_config: LaunchPlanConfig
    # Checkpoint path, passed to build_model as full_model_path or encoder_path
    # depending on path_type.
    mod_path: str
    # run_evaluation only compares against 'full'; any other value is treated as 'encoder'.
    path_type: str    #'full' or 'encoder'
    # Forwarded to build_model as use_full_model.
    use_full_model: bool

class IndexConfig(BaseModel):
    """Describes which index to build and the arguments to build it with.

    The dumped fields are passed to `embeddings_to_faiss_index` as keyword
    arguments (see `run_evaluation`).
    """
    # Name of the index type, e.g. "IndexFlatL2".
    index_type: str
    # Positional arguments for the index — presumably forwarded to the FAISS
    # constructor; confirm in embeddings_to_faiss_index.
    index_args: List[Any] = []
    # Keyword arguments for the index (same caveat as index_args).
    index_kwargs: Dict[str, Any] = {}

def run_evaluation(
    data_dict: Dict[str, np.ndarray],
    num_loops: int,
    model_configs: List[ModelEvalConfig],
    index_configs: List[IndexConfig]
    ) -> Dict[str, Dict[str, Dict[str, Dict[str, List[float]]]]]:

    """
    An extension of the evaluation code in exp1.
    Runs topK evaluation for each model config against an exact IndexFlatL2
    baseline and then against each index config.

    Returns a results dictionary, which has the following structure:
    {
        model_name: {
            index_name: {
                topK: {
                    'precision': [],
                    'recall': [],
                    'f1': []
                }
            }
        }
    }
    The value stored under each index_name is whatever `evaluate_top_k` returns.
    index_name is the config's `index_type`; the baseline is stored under
    "IndexFlatL2". If the same index type occurs more than once for a model,
    later occurrences get a numeric suffix ("IndexLSH_1", "IndexLSH_2", ...).

    Args:
        data_dict: dictionary of data, where each value is a numpy array.
        num_loops: number of anchor keys sampled (without replacement) for augmentation.
        model_configs: list of model configurations.
        index_configs: list of index configurations evaluated in addition to the baseline.

    Returns:
        results: dictionary of results, where each value is a dictionary of topK evaluation results.

    Raises:
        ValueError: if num_loops exceeds the number of items in data_dict.
    """
    # Sampling without replacement cannot draw more anchors than there are items;
    # fail early with an explicit message instead of deep inside numpy.
    if num_loops > len(data_dict):
        raise ValueError(
            f"num_loops ({num_loops}) cannot exceed the number of data items ({len(data_dict)})."
        )

    top_k_values = [1, 3, 5, 10, 25, 50, 100]

    # generate random augmentations
    anchor_keys = np.random.choice(list(data_dict.keys()), size=num_loops, replace=False)
    augmented_data = create_augmented_data(data_dict, anchor_keys)

    results = {}
    for eval_config in model_configs:
        logger.info(f"Running evaluation for {eval_config.name}")

        dumped_lp_config = eval_config.lp_config.model_dump()
        bm_cfg = {'full_model_path': eval_config.mod_path} if eval_config.path_type == 'full' else {'encoder_path': eval_config.mod_path}

        model = build_model(dumped_lp_config, **bm_cfg, use_full_model=eval_config.use_full_model)
        embeddings_dict = create_embedding_dict(data_dict, dumped_lp_config, model)
        logger.info(f"Created embedding dictionary for {len(embeddings_dict)} keys.")

        # create augmented embeddings structure
        augmented_embeddings_dict = {}
        for data_id, aug_dict in augmented_data.items():
            augmented_embeddings_dict[data_id] = create_embedding_dict(aug_dict, dumped_lp_config, model)
        logger.info("Created augmented embeddings.")

        dim = next(iter(embeddings_dict.values())).shape[0]

        #TODO: record embedding dimension

        # FLATL2 baseline, evaluated first and through the same code path as the
        # caller-supplied indices
        baseline_config = IndexConfig(index_type="IndexFlatL2", index_args=[dim])

        #TODO: add to CustomFAISSINdex the details of the index, like bytes used for each embedding, databse memory usage.
        #TODO: make sure timings are recorded

        # One entry per index, so that later indices no longer overwrite the
        # results of earlier ones.
        model_results = {}
        for index_config in [baseline_config, *index_configs]:
            index_config_dict = index_config.model_dump()
            index = embeddings_to_faiss_index(embeddings_dict=embeddings_dict, **index_config_dict)
            logger.info(f"Created FAISS index with parameters {index_config_dict}")
            index_name = _unique_index_name(index_config.index_type, model_results)
            # a fresh list per call, in case evaluate_top_k modifies its argument
            model_results[index_name] = evaluate_top_k(embeddings_dict, augmented_embeddings_dict, list(top_k_values), index)
            logger.info("Evaluated top K.")
        results[eval_config.name] = model_results
    return results


def _unique_index_name(index_type: str, taken: Dict[str, Any]) -> str:
    """Return index_type, or index_type with the first free numeric suffix if it is already a key of `taken`."""
    candidate = index_type
    suffix = 1
    while candidate in taken:
        candidate = f"{index_type}_{suffix}"
        suffix += 1
    return candidate

def main(data_path: str, num_loops: int, model_config_paths: List[str], output_path: str):
    """
    Load the data and the model configs, run the evaluation and pickle the results.

    Args:
        data_path: path to a pickled data dictionary.
        num_loops: number of loops, passed on to run_evaluation.
        model_config_paths: YAML files, each holding the fields of one ModelEvalConfig.
        output_path: file the pickled results dictionary is written to.

    Raises:
        pydantic.ValidationError: if a config file does not validate as a
            ModelEvalConfig (the error is logged, then re-raised).
    """
    # The file imports only BaseModel from pydantic, so the exception class
    # needed by the except clause below is imported here.
    from pydantic import ValidationError

    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    with open(data_path, 'rb') as data_file:
        data_dict = pkl.load(data_file)

    model_configs = []
    for config_path in model_config_paths:
        with open(config_path, 'r') as file:
            config_data = yaml.safe_load(file)
        try:
            model_config = ModelEvalConfig(**config_data)
            model_configs.append(model_config)
        except ValidationError as e:
            logger.error(f"Invalid configuration in {config_path}: {e}")
            raise

    # run_evaluation requires an index_configs argument. Index configs cannot be
    # supplied through this entry point, so an empty list is passed and only the
    # IndexFlatL2 baseline built inside run_evaluation is evaluated.
    results = run_evaluation(data_dict, num_loops, model_configs, [])

    with open(output_path, 'wb') as output_file:
        pkl.dump(results, output_file)

# Command-line entry point. All four arguments are positional and must be given
# in the order they are registered below; model_config_paths takes one or more values.
# NOTE: inside a Jupyter kernel __name__ is "__main__" too, so this block runs in
# the notebook and parse_args() reads the kernel's own argv. The recorded output
# of this cell shows it exiting with SystemExit: 2 because the required
# arguments are missing.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run model evaluation.")
    parser.add_argument('data_path', type=str, help='Path to the data file.')
    parser.add_argument('num_loops', type=int, help='Number of loops for evaluation.')
    parser.add_argument('model_config_paths', type=str, nargs='+', help='Paths to model configuration files.')
    parser.add_argument('output_path', type=str, help='Path to the output file.')
    
    args = parser.parse_args()
    main(args.data_path, args.num_loops, args.model_config_paths, args.output_path)

usage: ipykernel_launcher.py [-h]
                             data_path num_loops model_config_paths
                             [model_config_paths ...] output_path
ipykernel_launcher.py: error: the following arguments are required: data_path, num_loops, model_config_paths, output_path


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
