# Evaluate Clustering
Evaluate how well our clustering did with the given epsilon value. This will read the experimental results file.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from typing import Tuple
import os
from math import ceil
import seaborn as sns

In [None]:
# Folder holding the input dataset CSV that is loaded into `df`.
data_folder = 'C:/Data/Musicbrainz'
# Folder holding the clustering run's output (match group) files.
results_folder = 'C:/Logs/musicbrainz'
# Results file to evaluate. NOTE(review): the name suggests a 200k-record run
# with epsilon 0.245 - confirm it belongs to the dataset file loaded from
# data_folder.
results_filename = 'match_groups_200k_epsilon_0_245.csv'

In [None]:
# Load the original records (ground truth included) from the data folder.
dataset_path = os.path.join(data_folder, 'musicbrainz-200-A01.csv')
df = pd.read_csv(dataset_path)
df

Clean up the data by replacing all the empty (NaN) entries with empty strings.

In [None]:
# Cast the identifier columns to pandas' nullable integer dtypes.
# NOTE(review): CTID is cast to Int32 while TID and CID use Int64 - confirm
# the narrower width is intentional.
id_column_dtypes = {'TID': 'Int64', 'CID': 'Int64', 'CTID': 'Int32'}
df = df.astype(id_column_dtypes)

In [None]:
# Columns that are converted to text, with missing values shown as ''.
nan_cols = ['artist', 'album', 'year', 'id', 'language', 'title', 'length', 'number']
for col in nan_cols:
    # Record which cells are missing *before* stringifying, so the result does
    # not depend on how astype() happens to render a missing value.
    is_missing = df[col].isna()
    # Convert only this column (not the whole frame) and blank the missing
    # cells in a single vectorised pass.
    df[col] = df[col].astype('str').mask(is_missing, '')
df.head()

## Calculate Ground Truth Stats
Before we continue let's calculate some stats that we'll need later on.

In [None]:
# One row per ground-truth cluster: its ID and how many TIDs belong to it.
df_cluster_group = df.groupby(['CID'])
cluster_df = (
    df_cluster_group
    .agg({'CID': 'max', 'TID': 'count'})
    .reset_index(drop=True)
    .rename(columns={'CID': 'cluster_id', 'TID': 'num_ids'})
)
cluster_df

Calculate the total possible number of **TID**s that can be in a cluster group. These are the **TID**s belonging to cluster groups that contain 2 or more **TID**s.

In [None]:
# Sum the sizes of all ground-truth clusters that contain more than one TID.
multi_id_clusters = cluster_df.loc[cluster_df['num_ids'] > 1]
tot_dups_df = multi_id_clusters.agg({'num_ids': 'sum'}).reset_index()
# After reset_index the single row is (label, total); take the total.
tot_possible_dups = tot_dups_df.iloc[0, 1]
print(f'total possible duplicates: {tot_possible_dups:,}')

Calculate the total possible number of **TID**s that can't be in a cluster group, i.e. those cluster IDs that only have one **TID**.

In [None]:
# Sum the sizes of all ground-truth clusters that contain exactly one TID.
single_id_clusters = cluster_df.loc[cluster_df['num_ids'] == 1]
tot_non_dups_df = single_id_clusters.agg({'num_ids': 'sum'}).reset_index()
# After reset_index the single row is (label, total); take the total.
tot_possible_non_dups = tot_non_dups_df.iloc[0, 1]
print(f'total possible non-duplicates: {tot_possible_non_dups:,}')

## Read Results and join with the original records
First make **TID** the key on the original records, so that we can join the results with that. Then read the results and do the join.

In [None]:
# Re-index the original records by TID so that the results can be joined to
# them on that key. After this cell TID is the index, no longer a column.
df = df.set_index('TID')
df

In [None]:
# Load the match groups produced by the clustering run being evaluated.
results_path = os.path.join(results_folder, results_filename)
df_match_groups = pd.read_csv(results_path)
df_match_groups.head()

In [None]:
# Attach the ground-truth columns to each result row (left join on TID), then
# keep only the columns needed for the evaluation.
evaluation_columns = ['TID', 'CTID', 'CID', 'match_group_id', 'distance']
joined_results = df_match_groups.join(df, on='TID', how='left', lsuffix='_l', rsuffix='_r')
df_match_groups_full = joined_results[evaluation_columns]
df_match_groups_full

In [None]:
# Per match group: the distinct ground-truth cluster IDs it contains and the
# number of TIDs assigned to it.
by_match_group = df_match_groups_full.groupby('match_group_id')
df_vals_full = by_match_group.agg({'CID': 'unique', 'TID': 'count'}).reset_index()
df_vals_full

In [None]:
def _format_id_list(ids):
    """Render a sequence of cluster IDs as text, e.g. '[1,2,3]'."""
    return '[' + ','.join(str(one_id) for one_id in ids) + ']'


# Human-readable list of the cluster IDs found in each match group.
df_vals_full['cluster_ids'] = df_vals_full['CID'].apply(_format_id_list)

In [None]:
# Count the distinct ground-truth clusters per match group, then keep the
# summary columns with the most mixed match groups first.
summary_columns = ['match_group_id', 'num_ids_in_match_group',
                   'num_cluster_ids', 'cluster_ids']
df_vals_full['num_cluster_ids'] = df_vals_full['CID'].apply(len)
df_vals_full = df_vals_full.rename(columns={'TID': 'num_ids_in_match_group'})
df_vals_full = df_vals_full[summary_columns]
df_vals_full = df_vals_full.sort_values(['num_cluster_ids'], ascending=False)
df_vals_full

## Single Cluster IDs
Let's identify the single clusters that we've found and see how well we've done matching those.

In [None]:
# Match groups whose members all come from one ground-truth cluster,
# largest match groups first.
is_single_cluster = df_vals_full['num_cluster_ids'] == 1
df_vals_single_cluster = df_vals_full.loc[is_single_cluster]
df_vals_single_cluster = df_vals_single_cluster.sort_values(['num_ids_in_match_group'], ascending=False)
df_vals_single_cluster

In [None]:
def _parse_single_id(bracketed):
    """Turn the text '[<id>]' back into the integer <id>."""
    return int(str(bracketed)[1:-1])


# Each of these match groups has exactly one cluster ID, so the bracketed
# list text can be converted back into a plain integer column.
df_vals_single_cluster['cluster_id'] = df_vals_single_cluster['cluster_ids'].apply(_parse_single_id)
no_longer_needed = ['cluster_ids', 'num_cluster_ids']
df_vals_single_cluster = df_vals_single_cluster.drop(no_longer_needed, axis=1)
df_vals_single_cluster

In [None]:
# Put the found match-group sizes next to the ground-truth cluster sizes,
# keyed by cluster_id.
found_by_cluster = df_vals_single_cluster.set_index('cluster_id')
truth_by_cluster = cluster_df.set_index('cluster_id')
df_compare = found_by_cluster.join(truth_by_cluster, on='cluster_id')
df_compare

In [None]:
# Difference between the ground-truth cluster size and the size of the match
# group that was found for it; 0 means the match group has the same size.
# Plain column arithmetic is used instead of a row-wise apply(): it is
# vectorised and also works when df_compare has no rows.
df_compare['diff'] = df_compare['num_ids'] - df_compare['num_ids_in_match_group']
df_compare

## Precision & Recall
- True positives (TP): Correctly declared duplicates
- False positives (FP): Incorrectly declared duplicates
- True negatives (TN): Correctly avoided pairs
- False negatives (FN): Missed duplicates
- Precision = TP / (TP + FP)
- Recall = TP / (TP + FN)

## Calculate Classification Stats
This is really a binary classification problem, where the classes are simply "Match" and "Unmatched". First we'll calculate the TP, FP, FN and TN. That will form our confusion matrix which we can then show. And after that we can calculate the precision, recall and F1-score.

In [None]:
# True positives: TIDs in single-cluster match groups whose size equals the
# ground-truth cluster size (diff == 0).
same_size_groups = df_compare.loc[df_compare['diff'] == 0]
df_tp = same_size_groups.agg({'num_ids_in_match_group': 'sum'}).reset_index()
# After reset_index the single row is (label, total); take the total.
TP = df_tp.iloc[0, 1]
print(f'TP={TP:,}')

In [None]:
# False positives: for single-cluster match groups whose size differs from the
# ground truth (diff != 0), sum the ground-truth cluster sizes (num_ids).
# NOTE(review): a cluster split across several match groups appears once per
# match group in df_compare, so its num_ids would be added more than once, and
# match groups mixing several clusters are not counted here - confirm this is
# the intended definition.
size_mismatch_groups = df_compare.loc[df_compare['diff'] != 0]
df_fp = size_mismatch_groups.agg({'num_ids': 'sum'}).reset_index()
# After reset_index the single row is (label, total); take the total.
FP = df_fp.iloc[0, 1]
print(f'FP={FP:,}')

In [None]:
# False negatives: possible duplicates not accounted for by TP or FP.
accounted_for = TP + FP
FN = tot_possible_dups - accounted_for
print(f'FN={FN:,}')

In [None]:
# True negatives are taken to be every TID whose ground-truth cluster holds
# only that one TID. NOTE(review): this does not check whether the clustering
# actually left those TIDs ungrouped - confirm this is the intended definition.
TN = tot_possible_non_dups
print(f'TN={TN:,}')

In [None]:
# Confusion matrix laid out row by row as [[TN, FP], [FN, TP]].
cf_matrix = np.array([TN, FP, FN, TP]).reshape(2, 2)

In [None]:
def _thousands(x, _):
    """Format a colour-bar tick value with thousands separators."""
    return f'{x:,.0f}'


# Each heatmap cell is annotated with its name, raw count and share of the
# total, in the same row-major order as cf_matrix.
group_names = ['TN', 'FP', 'FN', 'TP']
flat_counts = cf_matrix.flatten()
flat_shares = flat_counts / np.sum(cf_matrix)
group_counts = [f'{value:,}' for value in flat_counts]
group_percentages = [f'{value:.2%}' for value in flat_shares]
labels = np.asarray([
    f'{v1}\n{v2}\n{v3}'
    for v1, v2, v3 in zip(group_names, group_counts, group_percentages)
]).reshape(2, 2)
ax = sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues', cbar_kws={'format': _thousands})
plt.show()

In [None]:
# Precision = TP / (TP + FP). When nothing was declared a duplicate the ratio
# is undefined; report 0.0 instead of failing or producing NaN.
declared_duplicates = TP + FP
precision = TP / declared_duplicates if declared_duplicates else 0.0
print(f'Precision: {precision:.2f}')

In [None]:
# Recall = TP / (TP + FN). When there are no duplicates to find the ratio is
# undefined; report 0.0 instead of failing or producing NaN.
actual_duplicates = TP + FN
recall = TP / actual_duplicates if actual_duplicates else 0.0
print(f'Recall: {recall:.2f}')

In [None]:
# F-score: harmonic mean of precision and recall. When both are zero the
# ratio is undefined; report 0.0 instead of failing or producing NaN.
precision_plus_recall = precision + recall
f_score = (precision * recall) / precision_plus_recall * 2 if precision_plus_recall else 0.0
print(f'F-Score: {f_score:.2f}')