In [None]:
import pandas as pd
import os
import numpy as np
import umap
import ast
from typing import Dict, List, Any, Tuple
from tqdm import tqdm
import plotly.express as px
import matplotlib.pyplot as plt
import faiss
import random

In [None]:
# Folder holding the MusicBrainz CSV files. The path can be overridden with the
# MUSICBRAINZ_DATA_DIR environment variable so the notebook also runs on
# machines that do not have the default Windows location.
data_folder = os.environ.get('MUSICBRAINZ_DATA_DIR', 'C:/Data/Musicbrainz')

In [None]:
# Descriptive record columns: normalised to plain strings after loading and
# shown as hover details in the scatter plots.
match_sentence_cols = [
    'artist',
    'title',
    'album',
    'year',
    'language',
]

In [None]:
# Load the records together with their embeddings and discard the unnamed
# index column that was written into the CSV.
df = (
    pd.read_csv(os.path.join(data_folder, 'musicbrainz_200k_with_embeddings.csv'))
    .drop(columns=["Unnamed: 0"])
)
df

In [None]:
# Use pandas' nullable integer dtypes for the ID columns.
df = df.astype({'TID': 'Int64', 'CID': 'Int64', 'CTID': 'Int32'})

In [None]:
# Cast every descriptive column to str, then blank out the literal 'nan' text
# that the cast produces for missing values.
df = df.astype({col: 'str' for col in match_sentence_cols})
df[match_sentence_cols] = df[match_sentence_cols].replace('nan', '')
df.head()

In [None]:
# The embeddings are stored in the CSV as Python-literal text; parse each one
# back into a Python object.
df['embedding'] = df['embedding_vector'].map(ast.literal_eval)

In [None]:
# Stack the per-record embeddings into a single array, one row per record in
# the same order as the rows of df.
embeddings = np.stack(df["embedding"].tolist())

In [None]:
# Display the shape of the stacked embedding array as a sanity check.
embeddings.shape

In [None]:
# Keep an untouched copy of the embeddings.
orig_embeddings = embeddings.copy()

In [None]:
def create_cosine_similarity_index(db_vectors, db_ids, dims=768):
    """Build an ID-mapped Faiss inner-product index over normalised vectors.

    Because the vectors are L2-normalised before being added, the inner
    product the index computes between them equals their cosine similarity.

    :param db_vectors: vectors to index. They are normalised by this call;
        the result of the normalisation is not reassigned, so the caller's
        array is the one that gets modified
    :param db_ids: list of IDs that correspond to the vectors
    :param dims: dimensions of the vectors
    :returns: the populated index
    """
    # Wrap the flat inner-product index so results carry our own IDs.
    id_mapped_index = faiss.IndexIDMap(faiss.IndexFlatIP(dims))
    # Unit-length vectors turn the inner product into cosine similarity.
    faiss.normalize_L2(db_vectors)
    id_mapped_index.add_with_ids(db_vectors, db_ids)
    return id_mapped_index

## Create Faiss Index
To create our Faiss index we need our embedding vectors to be 32-bit floating point values and we also need a corresponding list of IDs as 64-bit integer values.

In [None]:
# astype() already returns a new array, so a separate .copy() beforehand would
# only duplicate the whole matrix a second time. A fresh array is still
# required here: the index builder normalises db_vectors, and `embeddings`
# must stay unchanged for the UMAP step later on.
db_vectors = embeddings.astype(np.float32, order='C')

In [None]:
# 64-bit integer IDs, one per row of df and therefore one per row of db_vectors.
db_ids = df['TID'].values.astype(np.int64)

In [None]:
# Pass the real vector width instead of relying on the function's default of
# 768, so the index always matches the embeddings that were loaded.
index = create_cosine_similarity_index(db_vectors, db_ids, dims=db_vectors.shape[1])

In [None]:
# Report how many vectors the index holds.
print(f'Total records in index: {index.ntotal:,}')

In [None]:
# Map each ID back to its row position in db_ids / db_vectors.
reverse_lookup = {record_id: row_pos for row_pos, record_id in enumerate(db_ids)}

In [None]:
def fetch_nearest_neighbour(index_num: int) -> pd.DataFrame:
    """Fetch the nearest neighbour of one indexed record.

    Searches the module-level Faiss ``index`` with the vector stored at row
    ``index_num`` of ``db_vectors`` and returns the closest record other than
    the query record itself.

    :param index_num: the row position in ``db_ids`` / ``db_vectors`` of the
        record to use as the query
    :returns: a pandas DataFrame with the columns ``TID`` (ID of the nearest
        record) and ``distance`` (1 minus the similarity reported by the index)
    """
    query_id = db_ids[index_num]
    # reshape(1, -1) keeps the query independent of the embedding width.
    query_vector = db_vectors[index_num, :].reshape(1, -1)
    # Ask for two hits because the query vector is itself stored in the index.
    similarities, similarity_ids = index.search(query_vector, 2)
    hit_ids = similarity_ids[0]
    hit_similarities = similarities[0]
    # Remove the query's own entry by ID rather than by position: when another
    # record has an identical vector the two hits tie, and the query may be
    # reported second, in which case skipping the first hit would return the
    # query itself. If the query is not among the hits at all, drop the last
    # hit so that exactly one neighbour is still returned.
    self_hits = np.flatnonzero(hit_ids == query_id)
    drop_pos = self_hits[0] if self_hits.size else len(hit_ids) - 1
    keep = np.arange(len(hit_ids)) != drop_pos
    match_df = pd.DataFrame({'TID': hit_ids[keep],
                             'distance': 1 - hit_similarities[keep]})
    return match_df

In [None]:
# Try the lookup on the first record and display the result.
df_match = fetch_nearest_neighbour(index_num=0)
df_match

In [None]:
# Find the nearest neighbour of every record, visiting the records in row order.
# Row order matters: the UMAP projections computed later are aligned with the
# rows of `embeddings` (and so with `df` / `db_ids`) and are assigned to the
# results positionally, so the results must be collected in that same order.
# Walking the rows directly also avoids rebuilding the set of remaining ids on
# every iteration.
chunk_size = 5000  # print a progress line every 5K records
total_recs = len(df)
matches_found = []  # one (TID, nearest TID, distance) tuple per record
# tqdm renders to os.devnull so the live bar stays silent; its text form is
# printed once per chunk instead. The context manager closes the devnull handle.
with open(os.devnull, 'w') as devnull:
    progress_bar = tqdm(total=total_recs,
                        file=devnull,
                        desc="nearest neighbours progress")
    for row_num in range(total_recs):
        matches_df = fetch_nearest_neighbour(index_num=row_num)
        matches_found.append((db_ids[row_num],
                              matches_df['TID'].iloc[0],
                              matches_df['distance'].iloc[0]))
        progress_bar.update(1)
        if (row_num + 1) % chunk_size == 0:
            print(str(progress_bar))
    # print the final progress chunk
    print(str(progress_bar))
    progress_bar.close()

In [None]:
# One row per record: its TID, the TID of its nearest neighbour and the
# distance between the two. Rows are in the order the records were visited.
df_nearest_neighbour = pd.DataFrame(matches_found, columns=['TID', 'nearest_tid', 'distance'])
df_nearest_neighbour

## 2D UMAP Embeddings
Let's reduce the 768-dimensional embedding vectors to a 2D vector space using UMAP.

In [None]:
# Fit UMAP on the embeddings and project them to 2 components; proj_2d has one
# row per row of `embeddings`, in the same order.
# NOTE(review): no random_state is passed, so the layout is presumably not
# reproducible from run to run — confirm whether that matters here.
umap_2d = umap.UMAP(n_components=2)
proj_2d = umap_2d.fit_transform(embeddings)

In [None]:
# Align the projection with the TIDs before storing it. proj_2d rows follow the
# order of `embeddings` (the same order as db_ids), whereas the rows of
# df_nearest_neighbour follow the order in which the records were visited, so
# look up each TID's row position instead of assigning positionally.
df_nearest_neighbour["embedding_reduced_2d"] = np.asarray(proj_2d)[
    df_nearest_neighbour["TID"].map(reverse_lookup).to_numpy()
].tolist()

In [None]:
# Split the 2D coordinate pairs into separate x / y columns for plotting.
df_nearest_neighbour["emb_2d_x"] = df_nearest_neighbour["embedding_reduced_2d"].map(lambda point: point[0])
df_nearest_neighbour["emb_2d_y"] = df_nearest_neighbour["embedding_reduced_2d"].map(lambda point: point[1])

## 3D UMAP Embeddings
Let's reduce the 768-dimensional embedding vectors to a 3D vector space using UMAP.

In [None]:
# Fit UMAP on the embeddings and project them to 3 components; proj_3d has one
# row per row of `embeddings`, in the same order.
# NOTE(review): no random_state is passed, so the layout is presumably not
# reproducible from run to run — confirm whether that matters here.
umap_3d = umap.UMAP(n_components=3)
proj_3d = umap_3d.fit_transform(embeddings)

In [None]:
# Align the projection with the TIDs before storing it. proj_3d rows follow the
# order of `embeddings` (the same order as db_ids), whereas the rows of
# df_nearest_neighbour follow the order in which the records were visited, so
# look up each TID's row position instead of assigning positionally.
df_nearest_neighbour["embedding_reduced_3d"] = np.asarray(proj_3d)[
    df_nearest_neighbour["TID"].map(reverse_lookup).to_numpy()
].tolist()

In [None]:
# Split the 3D coordinate triples into separate x / y / z columns for plotting.
df_nearest_neighbour["emb_3d_x"] = df_nearest_neighbour["embedding_reduced_3d"].map(lambda point: point[0])
df_nearest_neighbour["emb_3d_y"] = df_nearest_neighbour["embedding_reduced_3d"].map(lambda point: point[1])
df_nearest_neighbour["emb_3d_z"] = df_nearest_neighbour["embedding_reduced_3d"].map(lambda point: point[2])

## Join with original records
We need to join with the original records in order to show the record details when we hover the mouse over a point.

In [None]:
# The list-valued coordinate columns have been split into scalar columns, so
# they are no longer needed.
df_nearest_neighbour = df_nearest_neighbour.drop(
    columns=["embedding_reduced_2d", "embedding_reduced_3d"]
)

In [None]:
# Remove the raw embedding text and the other columns that are not carried
# into the joined result.
df = df.drop(
    columns=["embedding_vector", "CID", "CTID", "SourceID", "id", "number"]
)

In [None]:
# Index the records by TID so the nearest-neighbour frame can be joined on it.
df = df.set_index('TID')

In [None]:
# Attach the nearest-neighbour results and UMAP coordinates to each record,
# matching on TID.
neighbours_by_tid = df_nearest_neighbour.set_index('TID')
df = df.join(neighbours_by_tid)

In [None]:
# Turn the TID index back into an ordinary column.
df = df.reset_index(names='TID')
df.head()

## Save the 2D & 3D mappings
It's taken a lot of time to get here, so save the data together with the 2D and 3D mappings. That way we can simply jump to the following step and load the data if we ever want to see the 2D and 3D maps at a later point.

In [None]:
# Persist the joined records (details, nearest neighbour, 2D and 3D coordinates)
# so the plotting section can be run without repeating the steps above.
df.to_csv(os.path.join(data_folder, 'musicbrainz_200k_full_with_embeddings_2d_and_3d_distance.csv'))

# Load Embeddings with 2D & 3D mappings
To save a lot of time, load the data containing the 2D and 3D mappings.

In [None]:
# Reload the saved records with their nearest-neighbour distances and UMAP
# coordinates; this replaces whatever `df` held before.
df = pd.read_csv(os.path.join(data_folder, 'musicbrainz_200k_full_with_embeddings_2d_and_3d_distance.csv'))

In [None]:
# Restore the descriptive columns after the CSV round-trip: cast them to str
# and blank out the literal 'nan' text that the cast produces for missing values.
for col in match_sentence_cols:
    df[col] = df[col].astype('str').replace('nan', '')
# Drop the unnamed index columns left over from saving. errors='ignore' is
# needed because which of these columns exist depends on how the file was
# written: a file produced by this notebook's save step does not necessarily
# contain both, and a strict drop would raise KeyError for the missing one.
df.drop(columns=["Unnamed: 0.1", "Unnamed: 0"], inplace=True, errors='ignore')
df.head()

## 2D UMAP Embeddings
Let's plot the 2D UMAP projection of the embeddings, colouring each point by the distance to its nearest neighbour.

In [None]:
# Interactive 2D scatter of the UMAP coordinates. Each point is coloured by the
# 'distance' column and shows the descriptive record fields plus the TID on hover.
fig_2d = px.scatter(
    df,
    x='emb_2d_x',
    y='emb_2d_y',
    color=df['distance'],
    hover_data=match_sentence_cols+['TID'],
    width=1600, height=1200
)
fig_2d.show()

In [None]:
# Save a static PNG of the 2D figure; the relative path means it lands in the
# current working directory.
# NOTE(review): plotly's static image export presumably needs an extra package
# such as kaleido — confirm it is installed.
fig_2d.write_image('2d_umap_200k.png')

## 3D UMAP Embeddings
Let's plot the 3D UMAP projection of the embeddings, colouring each point by the distance to its nearest neighbour.

In [None]:
# Interactive 3D scatter of the UMAP coordinates. Each point is coloured by the
# 'distance' column and shows the descriptive record fields plus the TID on hover.
fig_3d = px.scatter_3d(
    df, 
    x='emb_3d_x', 
    y='emb_3d_y', 
    z='emb_3d_z',
    color=df['distance'],
    hover_data=match_sentence_cols+['TID'],
    width=1600, height=1200
)
# Set an explicit marker size for the 3D points.
fig_3d.update_traces(marker_size=5)
fig_3d.show()