In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from typing import Tuple
from sentence_transformers import SentenceTransformer
import tqdm
import os
from math import ceil

## Install Local Sentence Transformer
You can download the model files from Hugging Face and store them locally. You can then load the model from that local directory instead of fetching it by name.

In [None]:
# Load the sentence-transformer model from the locally stored model directory.
local_model_dir = 'C:/sentence_transformers/all-mpnet-base-v2'
model = SentenceTransformer(local_model_dir)

In [None]:
def calculate_match_string(df_in: pd.DataFrame, recnum: int) -> str:
    """Build the text that represents one record when it is embedded.

    The values of the title, length, artist, album, year and language columns
    at position ``recnum`` are formatted and joined with single spaces, in
    that order.

    :param df_in: data frame holding the columns listed above
    :param recnum: integer row position (looked up with ``.iloc``)
    :return: space-separated match string for the row
    """
    field_order = ('title', 'length', 'artist', 'album', 'year', 'language')
    # Read column by column (not via a row slice) so every value keeps the
    # dtype of its own column when it is formatted.
    return ' '.join(f'{df_in[field].iloc[recnum]}' for field in field_order)

In [None]:
# Read the source records from the Musicbrainz CSV file and display the frame.
source_csv = os.path.join('C:/Data/Musicbrainz', 'musicbrainz-200-A01.csv')
df = pd.read_csv(source_csv)
df

In [None]:
# TID, CID and CTID get pandas nullable integer dtypes; the remaining listed
# columns are all cast to 'str'.
integer_dtypes = {'TID': 'Int64', 'CID': 'Int64', 'CTID': 'Int32'}
text_columns = ['artist', 'album', 'year', 'id', 'length', 'title', 'language', 'number']
column_dtypes = {**integer_dtypes, **dict.fromkeys(text_columns, 'str')}
df = df.astype(column_dtypes)

In [None]:
# Blank out missing values in the text columns listed below.
# Two spellings are handled: the literal text 'nan' that casting NaN to str
# can produce, and values that are still genuinely missing (NaN/NA). This way
# the cleanup does not depend on how the earlier cast treated missing data.
# `replace` matches whole cell values only, so text that merely contains
# "nan" is left untouched.
nan_cols = ['artist', 'album', 'year', 'id', 'language', 'title', 'length']
for col in nan_cols:
    df[col] = df[col].fillna('').replace('nan', '')
df.head()

In [None]:
# Add the 'embedding_vector' column, filled with a single-space placeholder
# string in every row.
df['embedding_vector'] = ' '

In [None]:
# Embed every record and store each vector as the string form of a Python
# list in the 'embedding_vector' column.
#
# Records are encoded one chunk at a time: model.encode() accepts a list of
# sentences and batches them internally, which avoids one model call per row.
# NOTE(review): batched encoding may differ from per-sentence encoding at
# floating-point rounding level - confirm this is acceptable downstream.
chunk_size = 5000  # records encoded, and reported, per chunk
total_recs = len(df)
embedding_strings = []

# tqdm renders to os.devnull so that only the explicit print() lines show up
# in the cell output. Both the sink and the bar are closed when the blocks
# exit, even if encoding raises.
with open(os.devnull, 'w') as devnull:
    with tqdm.tqdm(total=total_recs,
                   file=devnull,
                   desc="Embedding progress") as progress_bar:
        for start in range(0, total_recs, chunk_size):
            stop = min(start + chunk_size, total_recs)
            match_strings = [calculate_match_string(df, i)
                             for i in range(start, stop)]
            embeddings = model.encode(match_strings, show_progress_bar=False)
            embedding_strings.extend(str(vector.tolist()) for vector in embeddings)
            progress_bar.update(stop - start)
            print(str(progress_bar))

# Assign the whole column at once, by position. calculate_match_string reads
# rows by position (.iloc), so a label-based df.loc[i, ...] write would only
# line up with it on a default 0..n-1 index.
df['embedding_vector'] = embedding_strings

In [None]:
# Write the frame, including the embedding column, to a new CSV file.
output_csv = os.path.join('C:/Data/Musicbrainz', 'musicbrainz_200k_with_embeddings.csv')
df.to_csv(output_csv)