# Cluster comparison based on heatmaps

In [None]:
import numpy as np
from sklearn.metrics import (adjusted_mutual_info_score, adjusted_rand_score,
                           confusion_matrix, fowlkes_mallows_score)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.datasets import make_blobs

# Names of the clustering configurations to compare. Each name is
# '<algorithm> 3 <suffix>'; the matching dataframe column is f'cluster {name}'.
# NOTE(review): the '3' presumably is the number of clusters and 'emb'/'cat'
# the feature representation -- confirm against the code that writes the CSVs.
cluster_methods = [
    f'{algorithm} 3 {suffix}'
    for algorithm in ('kmeans', 'agg', 'gmm', 'birch')
    for suffix in ('emb', 'cat')
]

# Display names looked up by position when labelling heatmap ticks: the first
# three are descriptive, the remaining ones are simply numbered '4'..'20'.
cluster_names = ["Novice", "Developing", "Proficient", *map(str, range(4, 21))]

## Medical specialist's Cluster vs BioBERT

In [None]:
# Compare the medical specialist's cluster assignments with BioBERT's: one
# row-normalised contingency heatmap per entry of `cluster_methods`.
dfm1 = pd.read_csv('medical_specialist/annotations-dpoc-medical_specialist_metrics_84.csv')
dfm2 = pd.read_csv('biobert_balanced/annotations-dpoc-biobert_metrics.csv')

# Keep only annotations that appear in both files, sorted by id so that row i
# of one frame pairs with row i of the other. Each frame is filtered with a
# mask built from its own rows: a boolean mask built from dfm1 is indexed by
# dfm1's row labels, so applying it to dfm2 would select by row label rather
# than by annotation id.
common_ids = np.intersect1d(dfm1['annotation id'], dfm2['annotation id'])
dfm1 = dfm1[dfm1['annotation id'].isin(common_ids)].sort_values('annotation id')
dfm2 = dfm2[dfm2['annotation id'].isin(common_ids)].sort_values('annotation id')

fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))

for idx, cc in enumerate(cluster_methods):
  column_name = f'cluster {cc}'

  cluster1_labels = dfm1[column_name].values
  cluster2_labels = dfm2[column_name].values

  # The matrix spans the union of both label sets; passing it explicitly keeps
  # the matrix size and the tick labels below in agreement even when one side
  # never uses some cluster.
  labels = np.union1d(cluster1_labels, cluster2_labels)
  contingency = confusion_matrix(cluster1_labels, cluster2_labels, labels=labels)

  # Row-normalise (each row = one human cluster). Rows without any sample
  # stay at 0 instead of turning into NaN through 0/0.
  row_sums = contingency.sum(axis=1, keepdims=True)
  proportion_matrix = np.divide(contingency, row_sums,
                                out=np.zeros(contingency.shape, dtype=float),
                                where=row_sums != 0)

  # Display names are assigned by position in the sorted label list.
  names = cluster_names[:len(labels)]

  ax = axes[idx // 4, idx % 4]
  sns.heatmap(proportion_matrix, annot=True, fmt='.2f', cmap='YlOrRd',
              xticklabels=[f'BERT {name}' for name in names],
              yticklabels=[f'Human {name}' for name in names],
              vmin=0, vmax=1, ax=ax)
  ax.set_title(f'Proportion Matrix for {cc}')

# Hide grid cells that received no heatmap (fewer than 8 methods).
for unused_ax in axes.flat[len(cluster_methods):]:
  unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## Medical specialist's Cluster vs BioBERT-Llama

In [None]:

# Compare the medical specialist's cluster assignments with BioBERT-Llama's:
# one row-normalised contingency heatmap per entry of `cluster_methods`.
dfm1 = pd.read_csv('medical_specialist/annotations-dpoc-medical_specialist_metrics_84.csv')
dfm2 = pd.read_csv('biobert-llama_balanced/annotations-dpoc-biobert-llama_metrics.csv')

# Keep only annotations that appear in both files, sorted by id so that row i
# of one frame pairs with row i of the other. Each frame is filtered with a
# mask built from its own rows: a boolean mask built from dfm1 is indexed by
# dfm1's row labels, so applying it to dfm2 would select by row label rather
# than by annotation id.
common_ids = np.intersect1d(dfm1['annotation id'], dfm2['annotation id'])
dfm1 = dfm1[dfm1['annotation id'].isin(common_ids)].sort_values('annotation id')
dfm2 = dfm2[dfm2['annotation id'].isin(common_ids)].sort_values('annotation id')

fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))

for idx, cc in enumerate(cluster_methods):
  column_name = f'cluster {cc}'

  cluster1_labels = dfm1[column_name].values
  cluster2_labels = dfm2[column_name].values

  # The matrix spans the union of both label sets; passing it explicitly keeps
  # the matrix size and the tick labels below in agreement even when one side
  # never uses some cluster.
  labels = np.union1d(cluster1_labels, cluster2_labels)
  contingency = confusion_matrix(cluster1_labels, cluster2_labels, labels=labels)

  # Row-normalise (each row = one human cluster). Rows without any sample
  # stay at 0 instead of turning into NaN through 0/0.
  row_sums = contingency.sum(axis=1, keepdims=True)
  proportion_matrix = np.divide(contingency, row_sums,
                                out=np.zeros(contingency.shape, dtype=float),
                                where=row_sums != 0)

  # Display names are assigned by position in the sorted label list.
  names = cluster_names[:len(labels)]

  # NOTE(review): the column prefix is 'BERT' although this cell loads the
  # BioBERT-Llama file -- confirm whether the tick label should differ.
  ax = axes[idx // 4, idx % 4]
  sns.heatmap(proportion_matrix, annot=True, fmt='.2f', cmap='YlOrRd',
              xticklabels=[f'BERT {name}' for name in names],
              yticklabels=[f'Human {name}' for name in names],
              vmin=0, vmax=1, ax=ax)
  ax.set_title(f'Proportion Matrix for {cc}')

# Hide grid cells that received no heatmap (fewer than 8 methods).
for unused_ax in axes.flat[len(cluster_methods):]:
  unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Narrow the comparison to a single clustering configuration; every cell
# executed after this one iterates over this one-element list.
cluster_methods = ['kmeans 3 cat']

## Medical specialist's Cluster vs Llama

In [None]:
import os

# Compare the medical specialist's clusters with every Llama result file found
# in 'llama/': one row-normalised contingency heatmap per (file, method) pair.
# The panel title shows the approach name and the sum of the matrix diagonal.
dfm1 = pd.read_csv('medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv')

# Not used by this cell; kept in case other cells rely on it.
list_of_approaches = ["1_tf_idf_shot","3_static_shot","4_static_shot","4_tf_idf_shot_aug","10_tf_idf_shot_aug"]

# Collect the (approach, filename) pairs first so the subplot grid can be
# sized to the number of panels instead of a hard-coded row count.
fn = sorted(os.listdir('llama'))
approach_files = []
for filename in fn:
  if filename.startswith('aligned_annotations-dpoc-llm') and filename.endswith('_metrics.csv'):
    approach = filename.split('aligned_annotations-dpoc-llm_')[1].split('_metrics.csv')[0]
    if approach != '':
      approach_files.append((approach, filename))

n_cols = 4
n_panels = len(approach_files) * len(cluster_methods)
n_rows = max(1, (n_panels + n_cols - 1) // n_cols)  # ceiling division
# Height keeps the original proportion of 40 inches for 11 rows.
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                         figsize=(20, 40 * n_rows / 11), squeeze=False)
count = 0

for approach, filename in approach_files:
  dfm2 = pd.read_csv(f'llama/{filename}')

  # Keep only annotations that appear in both files, sorted by id so rows pair
  # up. The human subset goes into its own variable: overwriting dfm1 here
  # would shrink the reference cumulatively from one file to the next.
  common_ids = np.intersect1d(dfm1['annotation id'], dfm2['annotation id'])
  dfm1_common = dfm1[dfm1['annotation id'].isin(common_ids)].sort_values('annotation id')
  dfm2 = dfm2[dfm2['annotation id'].isin(common_ids)].sort_values('annotation id')

  for cc in cluster_methods:
    column_name = f'cluster {cc}'

    cluster1_labels = dfm1_common[column_name].values
    cluster2_labels = dfm2[column_name].values

    # The matrix spans the union of both label sets; passing it explicitly
    # keeps the matrix size and the tick labels below in agreement.
    labels = np.union1d(cluster1_labels, cluster2_labels)
    contingency = confusion_matrix(cluster1_labels, cluster2_labels, labels=labels)

    # Row-normalise (each row = one human cluster). Rows without any sample
    # stay at 0 instead of turning into NaN through 0/0.
    row_sums = contingency.sum(axis=1, keepdims=True)
    proportion_matrix = np.divide(contingency, row_sums,
                                  out=np.zeros(contingency.shape, dtype=float),
                                  where=row_sums != 0)

    # Display names are assigned by position in the sorted label list.
    names = cluster_names[:len(labels)]

    ax = axes.flat[count]
    sns.heatmap(proportion_matrix, annot=True, fmt='.2f', cmap='YlOrRd',
                xticklabels=[f'Llama {name}' for name in names],
                yticklabels=[f'Human {name}' for name in names],
                vmin=0, vmax=1, ax=ax)
    diagonal_values = np.diag(proportion_matrix)
    ax.set_title(approach + f' {np.sum(diagonal_values):.2f}')

    count += 1

# Hide grid cells that received no heatmap.
for unused_ax in axes.flat[count:]:
  unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Plot the best approach of each subgroup (selected by mean F1, per the
# original note): one row-normalised contingency heatmap per
# (approach, method) pair, titled with the approach name and the sum of the
# matrix diagonal.

import os

dfm1 = pd.read_csv('medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv')
best_approaches = ["0_shot_aug","0_shot","10_random_shot_aug","10_tf_idf_custom_shot_aug","10_static_shot_aug","10_random_shot","10_tf_idf_shot_aug","10_tf_idf_shot","10_tf_idf_custom_shot"]

# Collect the (approach, filename) pairs first so the subplot grid can be
# sized to the number of panels instead of a hard-coded row count.
fn = sorted(os.listdir('llama'))
approach_files = []
for filename in fn:
  if filename.startswith('aligned_annotations-dpoc-llm') and filename.endswith('_metrics.csv'):
    approach = filename.split('aligned_annotations-dpoc-llm_')[1].split('_metrics.csv')[0]
    if approach in best_approaches:
      approach_files.append((approach, filename))

n_cols = 4
n_panels = len(approach_files) * len(cluster_methods)
n_rows = max(1, (n_panels + n_cols - 1) // n_cols)  # ceiling division
# Height keeps the original proportion of 15 inches for 3 rows.
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                         figsize=(20, 5 * n_rows), squeeze=False)
count = 0

for approach, filename in approach_files:
  dfm2 = pd.read_csv(f'llama/{filename}')

  # Keep only annotations that appear in both files, sorted by id so rows pair
  # up. The human subset goes into its own variable: overwriting dfm1 here
  # would shrink the reference cumulatively from one file to the next.
  common_ids = np.intersect1d(dfm1['annotation id'], dfm2['annotation id'])
  dfm1_common = dfm1[dfm1['annotation id'].isin(common_ids)].sort_values('annotation id')
  dfm2 = dfm2[dfm2['annotation id'].isin(common_ids)].sort_values('annotation id')

  for cc in cluster_methods:
    column_name = f'cluster {cc}'

    cluster1_labels = dfm1_common[column_name].values
    cluster2_labels = dfm2[column_name].values

    # The matrix spans the union of both label sets; passing it explicitly
    # keeps the matrix size and the tick labels below in agreement.
    labels = np.union1d(cluster1_labels, cluster2_labels)
    contingency = confusion_matrix(cluster1_labels, cluster2_labels, labels=labels)

    # Row-normalise (each row = one human cluster). Rows without any sample
    # stay at 0 instead of turning into NaN through 0/0.
    row_sums = contingency.sum(axis=1, keepdims=True)
    proportion_matrix = np.divide(contingency, row_sums,
                                  out=np.zeros(contingency.shape, dtype=float),
                                  where=row_sums != 0)

    # Display names are assigned by position in the sorted label list.
    names = cluster_names[:len(labels)]

    ax = axes.flat[count]
    sns.heatmap(proportion_matrix, annot=True, fmt='.2f', cmap='YlOrRd',
                xticklabels=[f'Llama {name}' for name in names],
                yticklabels=[f'Human {name}' for name in names],
                vmin=0, vmax=1, ax=ax)
    diagonal_values = np.diag(proportion_matrix)
    ax.set_title(approach + f' {np.sum(diagonal_values):.2f}')

    count += 1

# Hide grid cells that received no heatmap.
for unused_ax in axes.flat[count:]:
  unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


## Medical specialist's Cluster vs Random

In [None]:
# Baseline: compare the medical specialist's clusters with a generated label
# vector, one row-normalised contingency heatmap per entry of `cluster_methods`.
dfm1 = pd.read_csv('medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv')

fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))

for idx, cc in enumerate(cluster_methods):
  column_name = f'cluster {cc}'

  cluster1_labels = dfm1[column_name].values
  # make_blobs is used only for its second return value (the label vector):
  # one label per annotation, as many distinct centers as there are human
  # clusters. The fixed random_state makes the assignment reproducible.
  n_clusters = np.unique(cluster1_labels).shape[0]
  cluster2_labels = make_blobs(n_samples=len(cluster1_labels), centers=n_clusters,
                               n_features=2, random_state=42)[1]

  # The matrix spans the union of both label sets; passing it explicitly keeps
  # the matrix size and the tick labels below in agreement even when the two
  # sides do not use exactly the same label values.
  labels = np.union1d(cluster1_labels, cluster2_labels)
  contingency = confusion_matrix(cluster1_labels, cluster2_labels, labels=labels)

  # Row-normalise (each row = one human cluster). Rows without any sample
  # stay at 0 instead of turning into NaN through 0/0.
  row_sums = contingency.sum(axis=1, keepdims=True)
  proportion_matrix = np.divide(contingency, row_sums,
                                out=np.zeros(contingency.shape, dtype=float),
                                where=row_sums != 0)

  # Display names are assigned by position in the sorted label list.
  names = cluster_names[:len(labels)]

  ax = axes[idx // 4, idx % 4]
  sns.heatmap(proportion_matrix, annot=True, fmt='.2f', cmap='YlOrRd',
              xticklabels=[f'Random {name}' for name in names],
              yticklabels=[f'Human {name}' for name in names],
              vmin=0, vmax=1, ax=ax)
  ax.set_title(f'Proportion Matrix for {cc}')

# Hide grid cells that received no heatmap (fewer than 8 methods).
for unused_ax in axes.flat[len(cluster_methods):]:
  unused_ax.set_visible(False)

plt.tight_layout()
plt.show()