# Clustering Annotations

In [None]:
import sys
sys.path.append('../../annotations')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.spatial.distance import euclidean
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, Isomap, MDS

import annotation_metrics as am

# Clustering variants compared throughout the notebook. Each entry is later
# interpolated into a `cluster {method}` column name of the annotation tables.
# Every algorithm appears twice, once per suffix ('emb', then 'cat').
# NOTE(review): 'emb'/'cat' presumably stand for the embedding and category
# feature spaces, and '3' for the number of clusters -- confirm.
compared_methods = [
    f'{algorithm} 3 {space}'
    for algorithm in ('kmeans', 'agg', 'gmm', 'birch')
    for space in ('emb', 'cat')
]

In [None]:
# Medical-specialist annotation metrics table; later cells read its metric
# columns and its `cluster <method>` columns.
medical_specialist_annotations = pd.read_csv(
    'medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv'
)

In [None]:
# Correlation heatmap over selected columns of the medical-specialist table.
# Relies on `medical_specialist_annotations` loaded by the previous cell and on
# the imports in the first cell, so nothing is re-imported or re-read here.
correlation_columns = [
    'pathophysiology', 'epidemiology', 'etiology', 'history',
    'physical', 'exams', 'differential', 'therapeutic',
    'total annotations', 'used categories', 'normalized self order score',
]

# DataFrame.corr() with default arguments, i.e. pairwise Pearson correlation.
correlation_matrix = medical_specialist_annotations[correlation_columns].corr()

# Explicit figure/axes so the heatmap, title and layout all target the same
# axes instead of relying on pyplot's implicit "current axes".
fig, ax = plt.subplots(figsize=(10, 8))

sns.heatmap(
    correlation_matrix,
    annot=True,               # Print the coefficient inside each cell
    cmap='coolwarm',          # Diverging palette: blue (low) to red (high)
    center=0,                 # Neutral colour sits at zero correlation
    vmin=-1,
    vmax=1,                   # Fix the colour scale to the full [-1, 1] range
    square=True,              # Square cells
    linewidths=0.5,           # Thin separators between cells
    cbar_kws={"shrink": .8},  # Slightly reduce colorbar size
    ax=ax,
)

ax.set_title('Correlation Heatmap of Annotation Metrics', fontsize=15)

fig.tight_layout()
plt.show()

In [None]:
# One panel per clustering method: 'ideas' on x, 'normalized self order score'
# on y, points coloured by that method's `cluster <method>` column.
fig, axs = plt.subplots(4, 2, figsize=(10, 16))
axs = axs.ravel()

for panel_index, method in enumerate(compared_methods):
    ax = axs[panel_index]
    points = ax.scatter(
        medical_specialist_annotations['ideas'],
        medical_specialist_annotations['normalized self order score'],
        c=medical_specialist_annotations[f'cluster {method}'],
        cmap='viridis',
        alpha=0.6,
    )
    fig.colorbar(points, ax=ax, label='Cluster')
    ax.set_title(f'Medical specialist - {method} Clusters')
    ax.set_xlabel('ideas')
    ax.set_ylabel('normalized self order score')

plt.tight_layout()
plt.show()

# Medical Specialist Annotations
## Reading Clusters and Preparing Plot Space

In [None]:
# Feature matrix for all projections below: the columns listed in
# `am.cluster_labels`.
# NOTE: a commented-out variant of this line selected
# `am.categories_labels + ['normalized self order score', 'ideas']` instead.
medical_specialist_dimensions = medical_specialist_annotations[am.cluster_labels].values

# Configure the four 2-D reducers first; t-SNE and MDS get a fixed seed.
medical_specialist_tsne = TSNE(init='pca', n_components=2, random_state=42)
medical_specialist_pca = PCA(n_components=2)
medical_specialist_isomap = Isomap(n_components=2)
medical_specialist_mds = MDS(random_state=42, n_components=2)

# Fit each reducer on the same matrix (order: t-SNE, PCA, Isomap, MDS).
medical_specialist_tsne_results_cat = medical_specialist_tsne.fit_transform(
    medical_specialist_dimensions
)
medical_specialist_pca_results_cat = medical_specialist_pca.fit_transform(
    medical_specialist_dimensions
)
medical_specialist_isomap_results_cat = medical_specialist_isomap.fit_transform(
    medical_specialist_dimensions
)
medical_specialist_mds_results_cat = medical_specialist_mds.fit_transform(
    medical_specialist_dimensions
)

## Clusters in Embedding and Categories Space
### t-SNE, PCA, Isomap, and MDS

In [None]:
# Grid of scatter plots: one row per clustering method, one column per 2-D
# projection of the medical-specialist feature matrix. Points are coloured by
# the medical-specialist cluster label of the row's method.
fig, axs = plt.subplots(8, 4, figsize=(20, 32))
axs = axs.ravel()

# (axis-label prefix, projected coordinates) for each grid column, left to right.
projections = (
    ('t-SNE', medical_specialist_tsne_results_cat),
    ('PCA', medical_specialist_pca_results_cat),
    ('Isomap', medical_specialist_isomap_results_cat),
    ('MDS', medical_specialist_mds_results_cat),
)

for row, method in enumerate(compared_methods):
    for col, (axis_name, coords) in enumerate(projections):
        # Row-major position in the flattened 8x4 axes array.
        ax = axs[row * 4 + col]
        points = ax.scatter(
            coords[:, 0],
            coords[:, 1],
            c=medical_specialist_annotations[f'cluster {method}'],
            cmap='viridis',
            alpha=0.6,
        )
        fig.colorbar(points, ax=ax, label='Cluster')
        ax.set_title(f'Medical specialist - {method} Clusters')
        ax.set_xlabel(f'{axis_name} 1')
        ax.set_ylabel(f'{axis_name} 2')

plt.tight_layout()
plt.show()

# Llama Annotations
## Dimension Reduction Exploration

In [None]:
# Llama annotation metrics table.
llama_annotations = pd.read_csv(
    'llama/aligned_annotations-dpoc-llm_10_tf_idf_custom_shot_metrics.csv'
)

# Feature matrix for the projections below: the columns listed in
# `am.cluster_labels`.
# NOTE: a commented-out variant of this line selected
# `am.categories_labels + ['normalized self order score', 'ideas']` instead.
llama_dimensions = llama_annotations[am.cluster_labels].values

# Configure the four 2-D reducers first; t-SNE and MDS get a fixed seed.
llama_tsne = TSNE(init='pca', n_components=2, random_state=42)
llama_pca = PCA(n_components=2)
llama_isomap = Isomap(n_components=2)
llama_mds = MDS(random_state=42, n_components=2)

# Fit each reducer on the same matrix (order: t-SNE, PCA, Isomap, MDS).
llama_tsne_results_cat = llama_tsne.fit_transform(llama_dimensions)
llama_pca_results_cat = llama_pca.fit_transform(llama_dimensions)
llama_isomap_results_cat = llama_isomap.fit_transform(llama_dimensions)
llama_mds_results_cat = llama_mds.fit_transform(llama_dimensions)

## Clusters in Embedding and Categories Space
### t-SNE, PCA, Isomap, and MDS

In [None]:
# Grid of scatter plots: one row per clustering method, one column per 2-D
# projection of the Llama feature matrix. Points are coloured by the Llama
# cluster label of the row's method.
fig, axs = plt.subplots(8, 4, figsize=(20, 32))
axs = axs.ravel()

# (axis-label prefix, projected coordinates) for each grid column, left to right.
projections = (
    ('t-SNE', llama_tsne_results_cat),
    ('PCA', llama_pca_results_cat),
    ('Isomap', llama_isomap_results_cat),
    ('MDS', llama_mds_results_cat),
)

for row, method in enumerate(compared_methods):
    for col, (axis_name, coords) in enumerate(projections):
        # Row-major position in the flattened 8x4 axes array.
        ax = axs[row * 4 + col]
        points = ax.scatter(
            coords[:, 0],
            coords[:, 1],
            c=llama_annotations[f'cluster {method}'],
            cmap='viridis',
            alpha=0.6,
        )
        fig.colorbar(points, ax=ax, label='Cluster')
        ax.set_title(f'Llama - {method} Clusters')
        ax.set_xlabel(f'{axis_name} 1')
        ax.set_ylabel(f'{axis_name} 2')

plt.tight_layout()
plt.show()

# Comparison between Medical Specialist and Llama
## Distance Metrics

In [None]:
# Row-by-row comparison of the medical-specialist and Llama feature matrices.
# Row i of one matrix is compared with row i of the other.
# NOTE(review): this assumes both CSVs list the same items in the same order
# (the file names say "aligned") -- confirm.
# `np` and `euclidean` come from the import cell at the top of the notebook.
medical_specialist_array = np.array(medical_specialist_dimensions)
llama_array = np.array(llama_dimensions)

# Validate up front: everything below pairs the two matrices element by
# element, so a mismatch should fail here with a clear message rather than
# inside the distance computation.
assert len(medical_specialist_array) == len(llama_array), "Arrays must have the same length"
assert medical_specialist_array.shape == llama_array.shape, "Arrays must have the same shape"

# Euclidean distance between each pair of corresponding rows.
distances = [
    euclidean(specialist_row, llama_row)
    for specialist_row, llama_row in zip(medical_specialist_array, llama_array)
]
average_distance = np.mean(distances)
print(f"Average distance original: {average_distance}")

# Per-feature minimum and maximum over all medical-specialist rows. The
# distance between those two corner points is the diagonal of the data's
# bounding box, i.e. an upper bound on any pairwise distance in that space
# (not necessarily a distance between two actual rows).
min_medical_specialist = medical_specialist_array.min(axis=0)
max_medical_specialist = medical_specialist_array.max(axis=0)
biggest_medical_specialist_distance = euclidean(min_medical_specialist, max_medical_specialist)
print(f"Biggest distance: {biggest_medical_specialist_distance}")

print(f"Proportion of average distance in the space: {average_distance/biggest_medical_specialist_distance}")

# Mean absolute error between the two matrices: per feature (column-wise
# mean) and over every cell.
absolute_differences = np.abs(medical_specialist_array - llama_array)

feature_mae = np.mean(absolute_differences, axis=0)
overall_mae = np.mean(absolute_differences)

print("MAE per feature:", feature_mae)
print("Overall MAE:", overall_mae)


## Llama over Medical Specialist Space

In [None]:
# Grid of scatter plots: point positions come from the medical-specialist
# projections, colours from the Llama cluster labels. One row per clustering
# method, one column per projection.
# NOTE(review): pairing coordinates with labels by position assumes both
# tables list the same items in the same order -- confirm.
fig, axs = plt.subplots(8, 4, figsize=(20, 32))
axs = axs.ravel()

# (axis-label prefix, projected coordinates) for each grid column, left to right.
projections = (
    ('t-SNE', medical_specialist_tsne_results_cat),
    ('PCA', medical_specialist_pca_results_cat),
    ('Isomap', medical_specialist_isomap_results_cat),
    ('MDS', medical_specialist_mds_results_cat),
)

for row, method in enumerate(compared_methods):
    for col, (axis_name, coords) in enumerate(projections):
        # Row-major position in the flattened 8x4 axes array.
        ax = axs[row * 4 + col]
        points = ax.scatter(
            coords[:, 0],
            coords[:, 1],
            c=llama_annotations[f'cluster {method}'],
            cmap='viridis',
            alpha=0.6,
        )
        fig.colorbar(points, ax=ax, label='Cluster')
        ax.set_title(f'Llama over Medical specialist - {method}')
        ax.set_xlabel(f'{axis_name} 1')
        ax.set_ylabel(f'{axis_name} 2')

plt.tight_layout()
plt.show()

## Medical Specialist over Llama Space

In [None]:
# Grid of scatter plots: point positions come from the Llama projections,
# colours from the medical-specialist cluster labels. One row per clustering
# method, one column per projection.
# The panel titles name this overlay explicitly; they previously repeated the
# plain 'Llama - ... Clusters' title although the colours are not Llama's.
# NOTE(review): pairing coordinates with labels by position assumes both
# tables list the same items in the same order -- confirm.
fig, axs = plt.subplots(8, 4, figsize=(20, 32))
axs = axs.flatten()

# (axis-label prefix, projected coordinates) for each grid column, left to right.
projections = (
    ('t-SNE', llama_tsne_results_cat),
    ('PCA', llama_pca_results_cat),
    ('Isomap', llama_isomap_results_cat),
    ('MDS', llama_mds_results_cat),
)

for row, method in enumerate(compared_methods):
    for col, (axis_name, coords) in enumerate(projections):
        # Row-major position in the flattened 8x4 axes array.
        ax = axs[row * 4 + col]
        scatter = ax.scatter(
            coords[:, 0],
            coords[:, 1],
            c=medical_specialist_annotations[f'cluster {method}'],
            cmap='viridis',
            alpha=0.6,
        )
        fig.colorbar(scatter, ax=ax, label='Cluster')
        ax.set_title(f'Medical specialist over Llama - {method}')
        ax.set_xlabel(f'{axis_name} 1')
        ax.set_ylabel(f'{axis_name} 2')

plt.tight_layout()
plt.show()