In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist
import matplotlib.pyplot as plt
import os
import ast
import tqdm
import faiss
from typing import Dict, List, Any, Tuple
from tqdm import tqdm
import random
import json

In [None]:
# Folder holding the input CSV. Set the MUSICBRAINZ_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell;
# the original hard-coded location remains the default.
data_folder = os.environ.get('MUSICBRAINZ_DATA_DIR', 'C:/Data/Musicbrainz')

In [None]:
# Load the records together with their pre-computed embeddings.
# Later cells read the 'embedding_vector' and 'TID' columns of this frame.
df = pd.read_csv(os.path.join(data_folder, 'musicbrainz_200k_with_embeddings.csv'))
df

## Convert Embedding Vectors
The embedding vectors have been stored as strings for CSV compatibility, so we first need to parse each string literal back into a list of floating point values. Having done that, we stack those lists into a single NumPy array of floating point values.

In [None]:
# Parse each stringified vector back into a Python list.
df['embedding'] = df['embedding_vector'].apply(ast.literal_eval)

In [None]:
# Stack the per-row embedding lists into one 2-D array (one row per record).
embeddings = np.stack(df["embedding"].to_numpy())

In [None]:
# Sanity check: expected to be (number of records, 768), given the 768-dim
# defaults used by the index helpers below -- TODO confirm against the output.
embeddings.shape

In [None]:
# Keep an untouched copy of the raw embeddings; it is used later to compute a
# reference cosine distance with scipy's pdist.
orig_embeddings = embeddings.copy()

In [None]:
def create_l2_index(db_vectors, db_ids, dims=768):
    """Build a flat Faiss index that ranks by L2 distance and reports caller IDs.

    :param db_vectors: vectors to add to the index. Won't be changed
    :param db_ids: IDs that correspond, row by row, to the vectors
    :param dims: dimensions of the vectors
    :returns: the populated ID-mapped index
    """
    # The ID map wraps the flat index so searches return db_ids, not row numbers.
    id_mapped = faiss.IndexIDMap(faiss.IndexFlatL2(dims))
    id_mapped.add_with_ids(db_vectors, db_ids)
    return id_mapped

In [None]:
def create_cosine_similarity_index(db_vectors, db_ids, dims=768):
    """Build a flat Faiss inner-product index over L2-normalised vectors.

    With unit-length vectors the inner product equals the cosine similarity.

    :param db_vectors: vectors to add. They are normalised by
        ``faiss.normalize_L2`` before being added, so the caller's array
        is modified.
    :param db_ids: IDs that correspond, row by row, to the vectors
    :param dims: dimensions of the vectors
    :returns: the populated ID-mapped index
    """
    # Normalise first: the inner product is only a cosine for unit vectors.
    faiss.normalize_L2(db_vectors)
    inner_product = faiss.IndexFlatIP(dims)
    id_mapped = faiss.IndexIDMap(inner_product)
    id_mapped.add_with_ids(db_vectors, db_ids)
    return id_mapped

## Create Faiss Index
To create our Faiss index we need our embedding vectors to be 32-bit floating point values and we also need a corresponding list of IDs as 64-bit integer values.

In [None]:
# astype() already returns a new array by default, so the extra .copy() only
# doubled the peak memory; `embeddings` is still left untouched when the
# vectors are later normalised.
db_vectors = embeddings.astype(np.float32)

In [None]:
# IDs handed to Faiss, in the same row order as db_vectors.
# NOTE(review): the reverse lookup built below assumes TIDs are unique -- confirm.
db_ids = df['TID'].values.astype(np.int64)

In [None]:
# Building the index also normalises db_vectors; fetch_neighbours
# depends on that because it uses rows of db_vectors as query vectors.
index = create_cosine_similarity_index(db_vectors, db_ids)

Double check that we have added all the records to the index.

In [None]:
# Should equal the number of rows in db_vectors / db_ids.
print(f'Total records in index: {index.ntotal:,}')

### Reverse Lookup Table
Create a reverse lookup table so that we can go from a **TID** back to its row index in the embedding arrays

In [None]:
# Map each TID to its row position in db_ids / db_vectors.
reverse_lookup = {tid: row_pos for row_pos, tid in enumerate(db_ids)}

Check that the reverse lookup works

In [None]:
# 14722 is used here as a TID; the lookup raises KeyError if it is not in db_ids.
reverse_lookup[14722]

## Fetching Nearest Neighbours
Let's perform a simple experiment to see if we can find the nearest neighbours to a given record.

In [None]:
def fetch_neighbours(index_num: int, 
                     threshold: float = 0.1, 
                     max_cluster_size: int = 20) -> pd.DataFrame:
    """Fetch neighbours up to a maximum cluster size and limited to a threshold value

    Queries the module-level Faiss ``index`` with row ``index_num`` of
    ``db_vectors`` and converts each returned inner product into a distance
    (``1 - inner product``). This is a cosine distance provided ``index`` is
    the inner-product index built over normalised vectors.

    :param index_num: the row position used to fetch the ID and embedding vector
    :param threshold: largest distance at which a neighbour is still accepted
    :param max_cluster_size: number of hits requested from the index; the
        query record counts as one member, so at most
        ``max_cluster_size - 1`` neighbours are returned
    :returns: a pandas DataFrame with columns ``TID`` and ``distance``,
        ordered as returned by the index search
    """
    match_ids = []
    match_distances = []
    query_id = db_ids[index_num]
    # reshape(1, -1) keeps this independent of the embedding dimension.
    query_vector = db_vectors[index_num, :].reshape(1, -1)
    similarities, similarity_ids = index.search(query_vector, max_cluster_size)
    max_neighbours = max_cluster_size - 1
    for similarity_id, similarity in zip(similarity_ids[0], similarities[0]):
        if len(match_ids) >= max_neighbours:
            break
        # Faiss pads the result with -1 when it finds fewer than k hits.
        if similarity_id == -1:
            continue
        # Drop the query by ID rather than by position: with exact duplicates
        # the query record is not guaranteed to be the first hit.
        if similarity_id == query_id:
            continue
        distance = 1 - similarity
        if distance <= threshold:
            match_ids.append(similarity_id)
            match_distances.append(distance)
    match_df = pd.DataFrame({'TID': match_ids, 'distance': match_distances})
    return match_df

In [None]:
# Row position (not a TID) of the record used for the demonstration query.
query_num = 2

In [None]:
# Show the TID that row position query_num refers to.
print(f'Searching for nearest IDs to {db_ids[query_num]}')

In [None]:
# threshold=0.25 overrides the function default of 0.1; the 'distance' column
# holds 1 - inner product for each accepted neighbour.
df_match = fetch_neighbours(index_num=query_num, threshold=0.25)
df_match

Compare this with the actual cosine distance between the two original (un-normalised) vectors

In [None]:
# Fail with a clear message instead of an IndexError from .iloc[0] when the
# query produced no neighbours within the threshold.
if df_match.empty:
    raise ValueError(f'No neighbours found for {db_ids[query_num]}; try a larger threshold')
first_neighbour = df_match['TID'].iloc[0]
# Stack the query vector and its first neighbour as two rows; the width now
# follows the data rather than a hard-coded 768.
distance_vectors = np.stack([orig_embeddings[query_num, :],
                             orig_embeddings[reverse_lookup[first_neighbour], :]]).astype('float')

In [None]:
# With two rows, pdist yields a single pairwise distance, read as cosine_dist[0].
# It should agree closely with the first 'distance' value in df_match.
cosine_dist = pdist(distance_vectors, metric='cosine')
print(f'{db_ids[query_num]}->{first_neighbour} cosine distance: {cosine_dist[0]:.3f}')

In [None]:
def create_match_group(matches_df: pd.DataFrame,
                       visited: Dict[int, bool],
                       match_group_id: str) -> List[Tuple[int, str, float]]:
    """Create the match group from the neighbours that are not yet assigned

    Each match whose row position has not been visited is marked as visited
    and added to the group; matches that were already visited are skipped.

    :param matches_df: all the matches, with columns ``TID`` and ``distance``
    :param visited: dictionary keyed by row position (as given by the
        module-level ``reverse_lookup``) for every record visited so far.
        Updated in place
    :param match_group_id: match group ID to assign
    :returns: list of ``(TID, match_group_id, distance)`` tuples for the
        newly assigned records
    """
    ret = []
    match_pairs = zip(matches_df['TID'].to_numpy(),
                      matches_df['distance'].to_numpy())
    for match_id, distance in match_pairs:
        # visited is keyed by row position, so translate the TID first.
        lookup_id = reverse_lookup[match_id]
        if lookup_id not in visited:
            visited[lookup_id] = True
            ret.append((match_id, match_group_id, distance))
    return ret

In [None]:
# Counter behind calc_next_match_group_id(). That function increments before
# returning, so with a start value of 1 the first ID handed out is '2'.
current_match_group_id = 1

In [None]:
def calc_next_match_group_id() -> str:
    """Advance the global match group counter and return its new value.

    The global ``current_match_group_id`` is incremented first, so the value
    returned is one more than the counter held before the call.

    :returns: the next match group ID as a string
    """
    global current_match_group_id
    current_match_group_id = current_match_group_id + 1
    return str(current_match_group_id)

In [None]:
# Distance cut-off passed as `threshold` to fetch_neighbours.
# Values tried previously: 0.125, 0.2, 0.25, 0.2725, 0.3.
epsilon = 0.245
# Derive the file names from epsilon (decimal point -> underscore) so that
# changing the threshold cannot leave a stale value in the output names.
epsilon_tag = str(epsilon).replace('.', '_')
results_filename = f'match_groups_200k_epsilon_{epsilon_tag}.csv'
metadata_filename = f'match_groups_200k_epsilon_{epsilon_tag}.json'
results_folder = 'results'
distance_metric = 'cosine_sim'
experiment_params = {'epsilon': epsilon,
                     'distance_metric': distance_metric,
                     'total_recs': len(df)}

In [None]:
# Greedy grouping: pick an unvisited record at random, fetch its neighbours
# within epsilon, and assign the neighbours not yet visited to a new group.
current_match_group_id = 1
chunk_size = 5000 # 5K
random_seed = 42  # fixed so the visiting order, and hence the groups, are reproducible
ids_visited = {}
total_recs = len(df)
ids = range(total_recs)
ids_all = set(ids)
max_chunk = 0
last_progress = 0
matches_found = []
# Shuffle once and walk the list, skipping rows that were visited meanwhile.
# Each seed is still drawn uniformly from the unvisited rows, but without
# rebuilding a list and a set of all remaining ids on every iteration.
visit_order = list(ids)
random.Random(random_seed).shuffle(visit_order)
# The bar itself writes to os.devnull; only the periodic snapshots are printed.
# The context managers close the bar and the file handle, even on error.
with open(os.devnull, 'w') as devnull:
    with tqdm(range(total_recs),
              file=devnull,
              desc="match group update progress") as progress_bar:
        for i_random in visit_order:
            if i_random in ids_visited:
                continue
            ids_visited[i_random] = True
            matches_df = fetch_neighbours(index_num=i_random,
                                          threshold=epsilon)
            next_match_group_id = calc_next_match_group_id()
            matches = create_match_group(matches_df=matches_df,
                                         visited=ids_visited,
                                         match_group_id=next_match_group_id)
            # Records without any new neighbour are not written to the results.
            if len(matches) > 0:
                matches_found.append((db_ids[i_random], next_match_group_id, 0))
                matches_found.extend(matches)
            so_far = len(ids_visited)
            this_update = so_far - last_progress
            progress_bar.update(this_update)
            # Print a snapshot each time another chunk_size records are done.
            if so_far // chunk_size > max_chunk:
                print(str(progress_bar))
                max_chunk = so_far // chunk_size
            last_progress = so_far
        # print the final progress chunk
        this_update = total_recs - last_progress
        progress_bar.update(this_update)
        print(str(progress_bar))
# Kept for any later cell that inspects it; empty once every row is visited.
ids_left = ids_all.difference(ids_visited)

In [None]:
# One row per record assigned to a group; the seed record of each group was
# appended with distance 0.
df_match_groups = pd.DataFrame(matches_found, columns=['TID', 'match_group_id', 'distance'])

In [None]:
# Inspect the assembled match groups.
df_match_groups

## Save the results
Save the results so we can later visualise them.

In [None]:
# Use the configured results_folder instead of repeating the literal, and
# create it on first use because to_csv does not create missing directories.
os.makedirs(results_folder, exist_ok=True)
df_match_groups.to_csv(os.path.join(results_folder, results_filename), index=False)