In [None]:
from IPython.display import display, HTML
# Inject a CSS rule so the notebook container spans the full browser width.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from typing import Tuple, Any, Dict
import os

## Visualise The Match Groups
Let's load the match groups and then colour them so we can easily see how they differ.

In [None]:
# Relative folder that holds the match-group CSV produced earlier.
results_folder = 'results'
match_groups_path = os.path.join(results_folder, 'match_groups_200k_epsilon_0_245_.csv')

# Read the match groups and preview the first ten rows.
match_groups_df = pd.read_csv(match_groups_path)
match_groups_df.head(10)

In [None]:
# Index the match groups by TID so they can later be joined to the records by TID.
match_groups_df = match_groups_df.set_index('TID')

Let's load the original data so we can join it with our match groups and see what the full records look like.

In [None]:
# Folder containing the MusicBrainz CSV. Set the MUSICBRAINZ_DATA_DIR
# environment variable to point somewhere else; when it is unset the original
# hard-coded location is used, so existing setups keep working.
data_folder = os.environ.get('MUSICBRAINZ_DATA_DIR', 'C:/Data/Musicbrainz')
df = pd.read_csv(os.path.join(data_folder, 'musicbrainz-200-A01.csv'))

# These three columns are not referenced by the later cells shown here.
df = df.drop(columns=['id', 'SourceID', 'number'])
df

In [None]:
# Cast the ID columns to pandas' nullable integer dtypes.
id_dtypes = {'TID': 'Int64', 'CID': 'Int64', 'CTID': 'Int32'}
df = df.astype(id_dtypes)

In [None]:
match_cols = ['title', 'length', 'artist', 'album', 'year', 'language']  # the attributes we used to create our "Match Sentence"

# Render every match attribute as text, with missing values shown as ''.
# Missing entries are located with isna() on the original column instead of
# comparing the converted text against 'nan', so the result does not depend on
# how astype happens to render a missing value. A single assign() call also
# avoids copying the whole frame once per column.
df = df.assign(**{
    col: df[col].astype('str').mask(df[col].isna(), '')
    for col in match_cols
})
df.head()

## Join the Data with the Results
Join the data with the results so we can see all the fields.

In [None]:
# Index the source records by TID so the join below can look them up by that key.
df_indexed = df.set_index('TID')

# Left join: every match-group row is kept and gains the columns of its record.
# The suffixes only come into play if both frames share a column name.
match_groups_all_df = match_groups_df.join(
    df_indexed,
    on='TID',
    how='left',
    lsuffix='_l',
    rsuffix='_r',
)
match_groups_all_df.head(10)

In [None]:
# Turn the index back into an ordinary column named 'TID' and preview the result.
# NOTE(review): relies on the joined frame still being indexed by TID — confirm.
match_groups_all_df = match_groups_all_df.reset_index(names='TID')
match_groups_all_df.head(10)

In [None]:
# Remove the CID and CTID columns; they are not part of the displayed fields.
match_groups_all_df = match_groups_all_df.drop(columns=['CID', 'CTID'])
match_groups_all_df.head(10)

In [None]:
# Derive a 'hash' column by applying Python's built-in hash() to each match group id.
group_ids = match_groups_all_df['match_group_id']
match_groups_all_df["hash"] = group_ids.apply(hash)
match_groups_all_df

In [None]:
# Keep only the TID, the six match attributes and the hash, in this order.
sel_cols = [
    'TID',
    'title', 'length', 'artist', 'album', 'year', 'language',
    'hash',
]
match_groups_all_df = match_groups_all_df.loc[:, sel_cols]

## Visualise Top 200 Records
Since there are 200K records, displaying all of them would make the visualisation difficult and slow, so we'll simply select the top 200 records and show their match groups. That is more than enough to give us an idea of what the match groups look like. We colour each row by its match group. I use pastel colours as they are quite easy on the eyes :-)

In [None]:
num_rows_to_show = 200  # only show the top 200 records
df_show_sel = match_groups_all_df.head(num_rows_to_show)

In [None]:
# Seven pastel shades, reused cyclically when there are more groups than shades.
pastel_color_table = [
    "#FFB5E8", "#ECD4FF", "#85E3FF", "#BFFCC6",
    "#FFF5BA", "#FFCBC1", "#C4FAF8",
]
num_colors = len(pastel_color_table)

# Distinct hash values among the displayed rows, in order of first appearance.
unique_hashes = df_show_sel["hash"].unique()
num_unique_hashes = len(unique_hashes)

# One colour per distinct hash: whole passes through the palette followed by
# a partial pass for whatever is left over.
full_cycles, remainder = divmod(num_unique_hashes, num_colors)
colors = pastel_color_table * full_cycles + pastel_color_table[:remainder]

In [None]:
# Map each distinct hash to the colour at the same position in `colors`.
# zip() reads `colors` without consuming it, so this cell can be re-run on its
# own; popping from the list would leave it empty after the first run and make
# a second run fail. The hashes are already unique, so no membership check is
# needed.
color_mapping = dict(zip(unique_hashes, colors))

In [None]:
def _match_group_background(row):
    """Return one CSS background declaration per column for a single row.

    The colour is looked up in `color_mapping` using the row's 'hash' value;
    a hash with no entry yields an empty colour.
    """
    css = f"background-color: {color_mapping.get(row['hash'], '')}"
    return [css] * df_show_sel.shape[1]


# Colour each row by its match group, then hide the helper columns from view.
styled_rows = df_show_sel.style.apply(_match_group_background, axis=1)
styled_rows.hide(['TID', 'hash'], axis=1)