In [1]:
import torch
import os

os.chdir("c:/Users/cunn2/OneDrive/DSML/Project/thesis-repo")

from sms.exp1.config_classes import load_config_from_launchplan
from sms.exp1.run_training import build_encoder, build_projector
from sms.exp1.models.siamese import SiameseModel

config = load_config_from_launchplan("sms/exp1/runs/run_20240926_162652/original_launchplan.yaml")

encoder = build_encoder(config.model_dump())
projector = build_projector(config.model_dump())

model = SiameseModel(encoder, projector)

print(encoder)
print(projector)
print(model)


PianoRollConvEncoder(
  (conv_layers): Sequential(
    (0): Conv2d(1, 2, kernel_size=(10, 10), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(2, 4, kernel_size=(6, 6), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): Conv2d(4, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU()
  )
  (fc): Linear(in_features=20768, out_features=64, bias=True)
)
ProjectionHead(
  (projector): Sequential(
    (0): Linear(in_features=64, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=64, bias=True)
    (5): ReLU()
  )
)
SiameseModel(
  (encoder): PianoRollConvEncoder(
    (conv_layers): S

In [2]:
pt_encoder = build_encoder(config.model_dump())
pt_encoder.load_state_dict(torch.load("sms/exp1/runs/run_20240926_162652/pretrain_saved_model.pth"))    

ft_encoder = build_encoder(config.model_dump())
ft_encoder.load_state_dict(torch.load("sms/exp1/runs/run_20240926_162652/finetune_saved_model.pth"))


  pt_encoder.load_state_dict(torch.load("sms/exp1/runs/run_20240926_162652/pretrain_saved_model.pth"))
  ft_encoder.load_state_dict(torch.load("sms/exp1/runs/run_20240926_162652/finetune_saved_model.pth"))


<All keys matched successfully>

In [3]:
data = torch.load(r"C:\Users\cunn2\OneDrive\DSML\Project\thesis-repo\data\exp1\train_data.pt")

  data = torch.load(r"C:\Users\cunn2\OneDrive\DSML\Project\thesis-repo\data\exp1\train_data.pt")


In [4]:
data[0]

array([[ 0.2, 67. ],
       [ 1. , 74. ],
       [ 2. , 76. ],
       [ 0.8, 74. ]])

In [6]:
from sms.src.synthetic_data.formatter import InputFormatter
from sms.src.synthetic_data.note_arr_mod import NoteArrayModifier
import numpy as np
import logging
from sms.src.log import configure_logging

logger = logging.getLogger(__name__)
configure_logging(console_level=logging.DEBUG)

formatter = InputFormatter(**config.model_dump()['input'])

aug_dict = {
    "use_transposition": True,
    "use_shift_selected_notes_pitch": False,
    "use_change_note_durations": False,
    "use_delete_notes": False,
    "use_insert_notes": False
}

modifier = NoteArrayModifier()

def format_data(data: np.ndarray):
    return formatter(data).astype(np.float32).copy()

anchor = data[0]
pos = modifier(anchor, aug_dict)
neg = data[18]
print(anchor)
print(pos)
print(neg)

anchor = format_data(anchor)
pos = format_data(pos)
neg = format_data(data[17])

anchor_enc = ft_encoder((torch.from_numpy(anchor)).unsqueeze(0))[0].detach().numpy()   
pos_enc = ft_encoder((torch.from_numpy(pos)).unsqueeze(0))[0].detach().numpy()
neg_enc = ft_encoder((torch.from_numpy(neg)).unsqueeze(0))[0].detach().numpy()

print(f'pos distance: {np.linalg.norm(anchor_enc - pos_enc)}')
print(f'neg distance: {np.linalg.norm(anchor_enc - neg_enc)}')


[2024-09-28 20:47:51] [DEBUG] Transposing non-rest notes by 10 semitones.


[[ 0.2 67. ]
 [ 1.  74. ]
 [ 2.  76. ]
 [ 0.8 74. ]]
tensor([[ 0.2000, 77.0000],
        [ 1.0000, 84.0000],
        [ 2.0000, 86.0000],
        [ 0.8000, 84.0000]], dtype=torch.float64)
[[ 0.75 67.  ]
 [ 0.25 69.  ]
 [ 1.   71.  ]
 [ 0.5  67.  ]
 [ 0.5  67.  ]
 [ 1.   69.  ]]
pos distance: 29.393085479736328
neg distance: 20.81243324279785


In [7]:
from typing import List

def format_data_for_conv_enc(data: np.ndarray, formatter: InputFormatter):
    return torch.from_numpy(formatter(data).astype(np.float32).copy())

def format_dataset_for_conv_enc(dataset: List[np.ndarray]):
    formatted_data = [format_data_for_conv_enc(data, formatter) for data in dataset]
    return torch.stack(formatted_data, dim=0)

data_formatted = format_dataset_for_conv_enc(data)


In [8]:
print(data_formatted.shape)
embeddings = ft_encoder(data_formatted)


torch.Size([14631, 128, 32])


In [9]:
embeddings = embeddings.detach()

In [10]:
embeddings.shape

torch.Size([14631, 64])

In [11]:
embeddings[0]

tensor([ 9.8189e+00, -1.0568e+01,  1.3061e+00, -3.7155e+00,  2.3748e+00,
        -9.1931e+00,  4.0135e+00,  7.2098e+00,  1.6388e+00,  1.6952e+01,
         3.5173e+00, -3.6373e+00,  3.9732e+00,  1.5120e+01,  1.0261e+00,
         1.1293e+01,  6.1510e+00, -4.7417e+00, -4.8012e+00, -1.4328e+01,
        -2.8075e+00, -3.0592e+00, -7.0917e+00, -7.3687e+00,  1.3666e+00,
        -2.9715e+00,  4.9831e+00,  5.1463e+00, -3.3394e+00, -6.1344e+00,
        -6.1643e+00, -1.6928e+01, -2.0705e+00,  7.5320e-01, -7.7485e+00,
        -9.8449e+00,  4.6780e+00,  3.3799e+00,  4.8790e+00, -1.0553e-02,
         2.3219e+00,  1.0051e+01, -8.1662e+00,  1.1222e+01,  1.2169e+00,
         2.3066e+00, -1.3469e+01,  4.2552e-01, -3.1998e+00, -1.3172e+00,
        -1.8167e+00,  6.6501e+00,  8.4608e-01,  1.6913e+00, -3.8155e+00,
        -4.9456e+00, -9.2409e+00,  9.4063e+00,  9.8553e+00,  3.5024e+00,
        -1.2613e+01, -2.1675e+01, -5.5904e+00, -1.0875e+00])

## evaluation

In [12]:
import faiss
import numpy as np
from typing import Dict, Any

class CustomFAISSIndex:
    def __init__(self, index_type: str, index_args: List[Any] = [], index_kwargs: Dict[str, Any] = {}):
        self.index = getattr(faiss, index_type)(*index_args, **index_kwargs)
        self.id_to_index = {}  # Maps custom IDs to FAISS indices
        self.index_to_id = {}  # Maps FAISS indices to custom IDs
        self.id_to_data = {}   # Maps custom IDs to original data

    def add_with_id(self, id, vector, original_data=None):
        if id in self.id_to_index:
            raise ValueError(f"ID {id} already exists in the index")
        
        index = self.index.ntotal
        self.index.add(np.array([vector], dtype=np.float32))
        self.id_to_index[id] = index
        self.index_to_id[index] = id
        if original_data is not None:
            self.id_to_data[id] = original_data

    def remove(self, id):
        if id not in self.id_to_index:
            raise ValueError(f"ID {id} not found in the index")
        
        index_to_remove = self.id_to_index[id]
        self.index.remove_ids(np.array([index_to_remove]))
        
        # Update mappings
        del self.index_to_id[index_to_remove]
        del self.id_to_index[id]
        if id in self.id_to_data:
            del self.id_to_data[id]
        
        # Update remaining indices
        for i in range(index_to_remove, self.index.ntotal):
            old_id = self.index_to_id[i + 1]
            self.index_to_id[i] = old_id
            self.id_to_index[old_id] = i
        del self.index_to_id[self.index.ntotal]

    def search(self, query_vector, k,):
        distances, indices = self.index.search(np.array([query_vector], dtype=np.float32), k)
        results = []
        for idx in indices[0]:
            if idx != -1 and idx in self.index_to_id:
                id = self.index_to_id[idx]
                results.append((id, self.id_to_data.get(id)))
        return results

    def get_vector(self, id):
        if id not in self.id_to_index:
            raise ValueError(f"ID {id} not found in the index")
        index = self.id_to_index[id]
        return self.index.reconstruct(index)

    def get_original_data(self, id):
        return self.id_to_data.get(id)
    
    def get_all_items(self, limit=3):
        items = []
        for id in list(self.id_to_data.keys())[:limit]:  # Limit the number of items
            vector = self.get_vector(id)
            original_data = self.get_original_data(id)
            items.append((id, vector, original_data))
        return items

    def __repr__(self):
        items = self.get_all_items(limit=3)  # Limit to 3 items
        total_items = self.index.ntotal
        repr_str = f"CustomFAISSIndex with {total_items} items:\n"
        for id, vector, original_data in items:
            repr_str += f"  ID: {id}\n"
            repr_str += f"    Vector: {vector}\n"
            repr_str += f"    Original Data: {original_data}\n"
        if total_items > 3:
            repr_str += f"  ... and {total_items - 3} more items\n"
        return repr_str



[2024-09-28 20:49:08] [DEBUG] Environment variable FAISS_OPT_LEVEL is not set, so let's pick the instruction set according to the current CPU
[2024-09-28 20:49:08] [INFO ] Loading faiss with AVX512 support.
[2024-09-28 20:49:08] [INFO ] Could not load library with AVX512 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx512'")
[2024-09-28 20:49:08] [INFO ] Loading faiss with AVX2 support.
[2024-09-28 20:49:08] [INFO ] Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
[2024-09-28 20:49:08] [INFO ] Loading faiss.
[2024-09-28 20:49:08] [INFO ] Successfully loaded faiss.


In [14]:
from uuid import uuid4

data_ids = [str(uuid4()) for _ in range(len(data))]
data_dict = dict(zip(data_ids, data))
embeddings_dict = dict(zip(data_ids, embeddings.detach().numpy()))

dim = list(embeddings_dict.values())[0].shape[0]
embedding_index = CustomFAISSIndex(index_type="IndexLSH", index_args=[dim, 256])
for key, value in embeddings_dict.items():
    embedding_index.add_with_id(key, value, data_dict[key])

### checks

In [None]:
embedding_index.get_original_data(data_ids[0])


array([[ 0.2, 67. ],
       [ 1. , 74. ],
       [ 2. , 76. ],
       [ 0.8, 74. ]])

In [None]:
# Check 1: Verify all documents are added
print("Check 1: Verify all documents are added")
for doc_id in ["doc1", "doc2", "doc3"]:
    vector = embedding_index.get_vector(doc_id)
    data = embedding_index.get_original_data(doc_id)
    print(f"{doc_id}: Vector = {vector}, Data = {data}")

# Check 2: Remove a document and verify it's gone
print("\nCheck 2: Remove a document and verify it's gone")
embedding_index.remove("doc2")
try:
    embedding_index.get_vector("doc2")
except ValueError as e:
    print(f"Expected error: {e}")

# Check 3: Verify remaining documents are still accessible
print("\nCheck 3: Verify remaining documents are still accessible")
for doc_id in ["doc1", "doc3"]:
    vector = embedding_index.get_vector(doc_id)
    data = embedding_index.get_original_data(doc_id)
    print(f"{doc_id}: Vector = {vector}, Data = {data}")

# Check 4: Add a new document and verify it's added correctly
print("\nCheck 4: Add a new document and verify it's added correctly")
embedding_index.add_with_id("doc4", np.array([4] * dim), 4)
embedding_index.add_with_id("doc5", np.array([5] * dim), 5)
vector = embedding_index.get_vector("doc4")
data = embedding_index.get_original_data("doc4")
print(f"doc4: Vector = {vector}, Data = {data}")

# Check 5: Perform a search and verify results
print("\nCheck 5: Perform a search and verify results")
query_vector = np.array([2.5] * dim)
results = embedding_index.search(query_vector, k=2)
print(f"Search results for query {query_vector}:")
for id, data in results:
    print(f"ID: {id}, Data: {data}")

# Check 6: Try to add a document with an existing ID (should raise an error)
print("\nCheck 6: Try to add a document with an existing ID")
try:
    embedding_index.add_with_id("doc1", np.array([5] * dim), 5)
except ValueError as e:
    print(f"Expected error: {e}")

# Check 7: Try to remove a non-existent document (should raise an error)
print("\nCheck 7: Try to remove a non-existent document")
try:
    embedding_index.remove("doc5")
except ValueError as e:
    print(f"Expected error: {e}")

Check 1: Verify all documents are added


NameError: name 'custom_index' is not defined

# exp1 eval loop


In [None]:
# produce vector embeddings
from uuid import uuid4
import torch
import torch.nn as nn
import faiss
import numpy as np
from typing import Callable, Optional, List, Dict
from sms.src.synthetic_data.formatter import InputFormatter
from sms.src.synthetic_data.note_arr_mod import NoteArrayModifier

from sms.exp1.run_training import build_encoder, build_projector
from sms.exp1.models.siamese import SiameseModel

data_ids = [str(uuid4()) for _ in range(len(data))]
data_dict = dict(zip(data_ids, data))
embeddings_dict = dict(zip(data_ids, embeddings.detach().numpy()))

num_loops = 1000
anchor_keys = np.random.choice(list(embeddings_dict.keys()), size=num_loops, replace=False)

def augment_chunk(chunk: np.ndarray, augmentation: str):
    """ 
    augmentation is one of the following:
        use_transposition
        use_shift_selected_notes_pitch
        use_change_note_durations
        use_delete_notes
        use_insert_notes
    """
    aug_dict = {
        "use_transposition": False,
        "use_shift_selected_notes_pitch": False,
        "use_change_note_durations": False,
        "use_delete_notes": False,
        "use_insert_notes": False
    }
    aug_dict[augmentation] = True
    modifier = NoteArrayModifier()
    return modifier(chunk, aug_dict)

def create_augmented_data(data_dict: Dict[str, np.ndarray], anchor_keys: List[str]) -> Dict[str, Dict[str, np.ndarray]]:
    """
    Create the augmented data for the given anchor keys.
    Returns a dictionary of dictionaries, where the outer dictionary is keyed by the anchor keys, and the inner dictionary 
        is keyed by the type of augmentation and contains the augmented data.
    """
    augmented_data = {}
    for key in anchor_keys:
        chunk = data_dict[key]
        augmented_data[key] = {
            "chunk_transposed": augment_chunk(chunk, "use_transposition"),
            "chunk_one_pitch_shifted": augment_chunk(chunk, "use_shift_selected_notes_pitch"),
            "chunk_note_duration_changed": augment_chunk(chunk, "use_change_note_durations"),
            "chunk_note_deleted": augment_chunk(chunk, "use_delete_notes"),
            "chunk_note_inserted": augment_chunk(chunk, "use_insert_notes")
        }
    return augmented_data

def build_model(dumped_lp_config: Dict[str, Any], model_path: str, use_pt: bool = False):
    """
    If use_pt is true, we assume the model_path contains weights for the full Siamese model.
    If use_pt is false, we assume the model_path contains weights for the encoder only.
    """
    encoder = build_encoder(dumped_lp_config)
    projector = build_projector(dumped_lp_config)
    model = SiameseModel(encoder, projector)
    if use_pt:
        model.load_state_dict(torch.load(model_path))
    else:
        model = model.get_encoder()
        model.load_state_dict(torch.load(model_path))
    return model

def create_embedding_dicts(data_dict: Dict[str, np.ndarray], dumped_lp_config: Dict[str, Any], model: Callable) -> Dict[str, np.ndarray]:
    """
    Create the embedding dictionary for the given model. The dumped_lp_config is used to determine the input format of the model.
    """
    formatter = InputFormatter(**dumped_lp_config['input'])
    formatted_data_list = [torch.from_numpy(formatter(chunk).astype(np.float32).copy()) for chunk in data_dict.values()]
    formatted_data_stacked = torch.stack(formatted_data_list, dim=0) # shape [num_chunks, *input_shape]
    embeddings_stacked = model(formatted_data_stacked)
    embeddings_dict = {key: embeddings_stacked[i].detach().numpy() for i, key in enumerate(data_dict.keys())}
    return embeddings_dict

def embeddings_to_faiss_index(
        embeddings_dict: Dict[str, np.ndarray], 
        index_type: str, 
        index_args: List[Any] = [], 
        index_kwargs: Dict[str, Any] = {}
    ) -> CustomFAISSIndex:

    embedding_index = CustomFAISSIndex(index_type=index_type, index_args=index_args, index_kwargs=index_kwargs)
    for key, value in embeddings_dict.items():
        embedding_index.add_with_id(key, value, data_dict[key])
    return embedding_index

    # For each embedding collection in embeddings_dicts, we perform the augmentation evaluation experiment num_loops times.
    # An augmentation evaluation experiment involves the following steps:
    # - Randomly select an anchor from data_dict
    # - Remove the anchor from data_dict
    # - Apply each of the five given augmentations to the anchor
    # - For each of the augmented melodies, add it to the database and perform a nearest neighbor search on the FAISS index
    # - Calculate the precision and recall of the search for each k in k_list

def evaluate_top_k(
        embedding_dict: Dict[str, Dict[str, np.ndarray]],
        augment_dict: Dict[str, Dict[str, np.ndarray]], 
        k_list: List[int], 
        index: CustomFAISSIndex
    ) -> None:
    """
    index is a CustomFAISSIndex object which has been initialized with the embeddings_dict.
    For each of the keys in augment_dict, we perform the following steps:
    - Remove the anchor (embedding_dict[key]) from the index
    - Add one of the augmentations from that key to the index
    - Perform a nearest neighbor search on the index using the anchor and record the position of the augmentation
    - Repeat for each augmentation
    
    Then we report the average precision and recall for each k in k_list.
    
    Args:
        embeddings_dict: dictionary of embeddings, keyed by data ids
        augment_dict: dictionary keyed by a subset of the ids in embeddings_dict, containing dictionaries of augmented data
        k_list: list of k values to evaluate
        num_loops: number of loops to perform
    """
    results = {aug_type: {k: {'precision': [], 'recall': []} for k in k_list} for aug_type in augment_dict[list(augment_dict.keys())[0]].keys()}
    
    for anchor_id, augmentations in augment_dict.items():
        anchor_embedding = embedding_dict[anchor_id]
        
        # Remove anchor from index
        index.remove(anchor_id)
        
        for aug_type, augmented_data in augmentations.items():
            # Add augmented data to index
            aug_id = f"{anchor_id}_aug_{aug_type}"
            index.add_with_id(aug_id, augmented_data, original_data=augmented_data)
            
            # Perform search
            search_results = index.search(anchor_embedding, max(k_list))
            
            # Calculate precision and recall for each k
            for k in k_list:
                top_k_results = search_results[:k]
                true_positives = sum(1 for id, _ in top_k_results if id == aug_id)
                
                precision = true_positives / k
                recall = 1 if true_positives > 0 else 0  # Recall is 1 if found, 0 if not
                
                results[aug_type][k]['precision'].append(precision)
                results[aug_type][k]['recall'].append(recall)
            
            # Remove augmented data from index
            index.remove(aug_id)
        
        # Add anchor back to index
        index.add_with_id(anchor_id, anchor_embedding, original_data=index.get_original_data(anchor_id))
    
    # Calculate average precision and recall
    for aug_type in results:
        for k in k_list:
            results[aug_type][k]['avg_precision'] = np.mean(results[aug_type][k]['precision'])
            results[aug_type][k]['avg_recall'] = np.mean(results[aug_type][k]['recall'])
    
    return results
            





ModuleNotFoundError: No module named 'uuid4'

https://python.langchain.com/docs/integrations/vectorstores/faiss/#similarity-search-with-filtering

In [None]:
dshape = embeddings[0].shape[0]
index = faiss.IndexFlatL2(dshape)

ids = list(range(len(embeddings)))

index.add_with_ids(embeddings.detach(), ids)


RuntimeError: Error in void __cdecl faiss::Index::add_with_ids(__int64,const float *,const __int64 *) at D:\bld\faiss-split_1723208824085\work\faiss\Index.cpp:45: add_with_ids not implemented for this type of index

In [None]:
index.add_with_ids

<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000001CF344E2A90> >

In [None]:
import numpy as np
from sklearn.metrics import precision_score, recall_score
from typing import List, Tuple
import random

# Assuming you have these functions implemented
from your_module import get_embedding, apply_augmentation

def top_k_query(query_embedding: np.ndarray, database: List[np.ndarray], k: int) -> List[int]:
    distances = [np.linalg.norm(query_embedding - db_embedding) for db_embedding in database]
    return np.argsort(distances)[:k]

def evaluate_top_k(dataset: List[np.ndarray], k: int, num_tests: int = 1000) -> Tuple[float, float]:
    all_precisions = []
    all_recalls = []
    
    augmentations = ['transpose', 'tempo_change', 'add_noise', 'remove_notes', 'add_notes', 'change_octave']
    
    for _ in range(num_tests):
        # Randomly select an anchor
        anchor_idx = random.randint(0, len(dataset) - 1)
        anchor = dataset[anchor_idx]
        
        # Remove anchor from dataset
        reduced_dataset = dataset[:anchor_idx] + dataset[anchor_idx+1:]
        
        for aug in augmentations:
            # Apply augmentation to anchor
            augmented_anchor = apply_augmentation(anchor, aug)
            
            # Get embeddings
            query_embedding = get_embedding(augmented_anchor)
            database_embeddings = [get_embedding(chunk) for chunk in reduced_dataset]
            
            # Perform top-K query
            top_k_results = top_k_query(query_embedding, database_embeddings, k)
            
            # Calculate precision and recall
            # Assuming the original anchor should be the only true positive
            true_positives = 1 if anchor_idx in top_k_results else 0
            precision = true_positives / k
            recall = true_positives / 1  # Only one relevant item
            
            all_precisions.append(precision)
            all_recalls.append(recall)
    
    avg_precision = np.mean(all_precisions)
    avg_recall = np.mean(all_recalls)
    
    return avg_precision, avg_recall

# Load your dataset of melody chunks
dataset = load_melody_chunks()  # Implement this function to load your dataset

# Perform evaluation
k_values = [1, 5, 10, 20]
for k in k_values:
    precision, recall = evaluate_top_k(dataset, k)
    print(f"Top-{k} Results:")
    print(f"Average Precision: {precision:.4f}")
    print(f"Average Recall: {recall:.4f}")
    print()

ModuleNotFoundError: No module named 'sklearn'

### evaluation code plan:

- faiss database:
  - load val_data.pt
  - get embeddings from given encoder
  - store in database with way to retrieve index and remove/add vectors
  
- evlauation