# Clustering Computing and Comparison

## Part 1 - Clustering

In [None]:
import sys
import os
import pandas as pd
import gc

sys.path.append("../../annotations")
import annotation_metrics as am

### Baselines - Medical specialist's annotations with full and partial samples

In [None]:
# Baseline on the full specialist sample (files suffixed _435).
# Presumably the first two paths are inputs (groups, scores) and the last
# three are outputs (metrics, summary, stats) -- confirm in annotation_metrics.
am.annotation_metrics(
    'medical_specialist/annotations-dpoc-medical_specialist_groups_435.csv',
    'annotations-dpoc-medical_specialist_scores.csv',
    'medical_specialist/annotations-dpoc-medical_specialist_metrics_435.csv',
    'medical_specialist/annotations-dpoc-medical_specialist_summary_435.csv',
    'medical_specialist/annotations-dpoc-medical_specialist_stats_435.csv',
)
# Force a garbage-collection pass once the call has returned.
gc.collect()

In [None]:
# fer =  pd.read_csv('medical_specialist/annotations-dpoc-medical_specialist_groups_435.csv')
# bio =  pd.read_csv('biobert_balanced/annotations-dpoc-biobert_groups.csv')

# new_fer = fer[fer["annotation id"].isin(bio["annotation id"])]
# new_fer.to_csv('medical_specialist/annotations-dpoc-medical_specialist_groups_84.csv', index=False)

In [None]:
# Baseline on the partial specialist sample (files suffixed _84).
am.annotation_metrics(
    'medical_specialist/annotations-dpoc-medical_specialist_groups_84.csv',
    'annotations-dpoc-medical_specialist_scores.csv',
    'medical_specialist/annotations-dpoc-medical_specialist_metrics_84.csv',
    'medical_specialist/annotations-dpoc-medical_specialist_summary_84.csv',
    'medical_specialist/annotations-dpoc-medical_specialist_stats_84.csv',
)
# Force a garbage-collection pass once the call has returned.
gc.collect()

### BERT-based Annotations

In [None]:
# Same pipeline as the baselines, applied to the BioBERT group assignments
# and scored against the specialist's scores file.
am.annotation_metrics(
    'biobert_balanced/annotations-dpoc-biobert_groups.csv',
    'annotations-dpoc-medical_specialist_scores.csv',
    'biobert_balanced/annotations-dpoc-biobert_metrics.csv',
    'biobert_balanced/annotations-dpoc-biobert_summary.csv',
    'biobert_balanced/annotations-dpoc-biobert_stats.csv',
)
# Force a garbage-collection pass once the call has returned.
gc.collect()

### BERT-Llama-based Annotations

In [None]:
# Same pipeline as the baselines, applied to the BioBERT-Llama group
# assignments and scored against the specialist's scores file.
am.annotation_metrics(
    'biobert-llama_balanced/annotations-dpoc-biobert-llama_groups.csv',
    'annotations-dpoc-medical_specialist_scores.csv',
    'biobert-llama_balanced/annotations-dpoc-biobert-llama_metrics.csv',
    'biobert-llama_balanced/annotations-dpoc-biobert-llama_summary.csv',
    'biobert-llama_balanced/annotations-dpoc-biobert-llama_stats.csv',
)
# Force a garbage-collection pass once the call has returned.
gc.collect()

### Llama-based Annotations

In [None]:
# Compute metrics for the Llama self-order group files found in `directory`.
# Only the '4_static_shot' configuration is processed here; every other
# group file is skipped.
directory = '../../llm-medical_specialist_/self-order-and-clustering'
for filename in os.listdir(directory):
    if not filename.startswith('teste_progresso_self_order_groups_'):
        continue

    # The shot label is whatever sits between the common prefix and the
    # last four characters of the file name (assumed to be '.csv').
    shot = filename[len('teste_progresso_self_order_groups_'):-4]
    filepath = os.path.join(directory, filename)
    if shot != '4_static_shot':
        continue

    am.annotation_metrics(
        filepath,
        'annotations-dpoc-medical_specialist_scores.csv',
        f'llama/annotations-dpoc-llm_{shot}_metrics.csv',
        f'llama/annotations-dpoc-llm_{shot}_summary.csv',
        f'llama/annotations-dpoc-llm_{shot}_stats.csv',
    )
    gc.collect()
    print(shot, ' complete')
            
    # if filename.startswith('teste_progresso_self_order_groups_augmented_10_tf_idf_custom_shot'):
    #     shot = filename[len('teste_progresso_self_order_groups_augmented_'):-4]
    #     filepath = os.path.join(directory, filename)
    #     am.annotation_metrics(
    #         filepath,
    #         'annotations-dpoc-medical_specialist_scores.csv',
    #         f'llama/annotations-dpoc-llm_augmented_{shot}_metrics.csv',
    #         f'llama/annotations-dpoc-llm_augmented_{shot}_summary.csv',
    #         f'llama/annotations-dpoc-llm_augmented_{shot}_stats.csv')
    #     gc.collect()

#### Llama-based annotation (using only Gabriel's BioBERT 86 IDs)

In [None]:
# 86_filtered_biobert_test_ids
# Restrict every Llama group file to the annotation ids that also appear in
# the BioBERT groups file, save the filtered copy, and compute its metrics.
directory = '../../llm-medical_specialist_/self-order-and-clustering'

# The BioBERT groups file is the same for every Llama file: load it once.
groups_86 = pd.read_csv('biobert_balanced/annotations-dpoc-biobert_groups.csv').set_index('annotation id')

# Make sure the output locations exist before anything is written to them.
os.makedirs(f'{directory}/86_filtered_biobert_test_ids', exist_ok=True)
os.makedirs('llama/86_filtered', exist_ok=True)

for filename in os.listdir(directory):
    if not filename.startswith('teste_progresso_self_order_groups_'):
        continue

    # Shot label = file name without the common prefix and the last four
    # characters (assumed to be '.csv').
    shot = filename[len('teste_progresso_self_order_groups_'):-4]
    filepath = os.path.join(directory, filename)

    groups_llm = pd.read_csv(filepath).set_index('annotation id')

    # Inner alignment on the row index keeps only the annotation ids present
    # in both frames; only the Llama side is needed afterwards.
    _, aligned_groups_llm = groups_86.align(groups_llm, join='inner', axis=0)
    aligned_groups_llm = aligned_groups_llm.reset_index()

    # Save the filtered Llama groups and feed that file to the metrics step.
    filepath_86 = f'{directory}/86_filtered_biobert_test_ids/teste_progresso_self_order_groups_{shot}.csv'
    aligned_groups_llm.to_csv(filepath_86, index=False)

    am.annotation_metrics(
        filepath_86,
        'annotations-dpoc-medical_specialist_scores.csv',
        f'llama/86_filtered/annotations-dpoc-llm_{shot}_metrics.csv',
        f'llama/86_filtered/annotations-dpoc-llm_{shot}_summary.csv',
        f'llama/86_filtered/annotations-dpoc-llm_{shot}_stats.csv',
    )
    gc.collect()

## Part 2 - Cluster Comparison

### BERT-based Annotations vs Baseline

In [None]:
# Compare the BioBERT clustering against the specialist baseline (_84 files).
# Argument order as used throughout this notebook: baseline metrics,
# approach metrics, baseline summary, approach summary, then the comparison
# file (presumably the output -- confirm in compare_clusters).
am.compare_clusters(
    'medical_specialist/annotations-dpoc-medical_specialist_metrics_84.csv',
    'biobert_balanced/annotations-dpoc-biobert_metrics.csv',
    'medical_specialist/annotations-dpoc-medical_specialist_summary_84.csv',
    'biobert_balanced/annotations-dpoc-biobert_summary.csv',
    'biobert_balanced/annotations-dpoc-biobert_comparison.csv',
)

### BERT-Llama-based Annotations vs Baseline

In [None]:

# Compare the BioBERT-Llama clustering against the specialist baseline
# (_84 files); same argument order as the BioBERT comparison.
am.compare_clusters(
    'medical_specialist/annotations-dpoc-medical_specialist_metrics_84.csv',
    'biobert-llama_balanced/annotations-dpoc-biobert-llama_metrics.csv',
    'medical_specialist/annotations-dpoc-medical_specialist_summary_84.csv',
    'biobert-llama_balanced/annotations-dpoc-biobert-llama_summary.csv',
    'biobert-llama_balanced/annotations-dpoc-biobert-llama_comparison.csv',
)

### Llama-based Annotations vs Baseline

In [None]:
import pandas as pd
import os
# Ensure the input files have the same number of samples
# metrics_435 = pd.read_csv('medical_specialist/annotations-dpoc-medical_specialist_metrics_435.csv')
# demo_metrics_llm = pd.read_csv(f'llama/annotations-dpoc-llm_10_tf_idf_shot_metrics.csv')
# metrics_435.set_index('annotation id', inplace=True)
# demo_metrics_llm.set_index('annotation id', inplace=True)
# aligned_metrics_435, aligned_metrics_llm = metrics_435.align(demo_metrics_llm, join='inner', axis=0)
# aligned_metrics_435.reset_index(inplace=True)
# aligned_metrics_435.to_csv('medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv', index=False)

# Step 1 - restrict every Llama metrics file to the annotation ids that also
# exist in the specialist's 435-sample metrics, and save it as 'aligned_*'.
metrics_llm_files = [f for f in os.listdir('llama') if f.startswith('annotations-dpoc-llm') and f.endswith('_metrics.csv')]

# The baseline metrics are identical for every Llama file: load them once.
metrics_435 = pd.read_csv('medical_specialist/annotations-dpoc-medical_specialist_metrics_435.csv').set_index('annotation id')

for metrics_llm_file in metrics_llm_files:
    metrics_llm = pd.read_csv(f'llama/{metrics_llm_file}').set_index('annotation id')
    # Inner alignment on the row index keeps only the shared annotation ids;
    # only the Llama side is written back.
    _, aligned_metrics_llm = metrics_435.align(metrics_llm, join='inner', axis=0)
    aligned_metrics_llm = aligned_metrics_llm.reset_index()
    aligned_metrics_llm.to_csv(f'llama/aligned_{metrics_llm_file}', index=False)

# Step 2 - compare each aligned Llama metrics file with the aligned baseline.
# NOTE(review): the aligned baseline CSV read below is not produced by this
# cell (the code that wrote it is commented out above), so it must already
# exist on disk and may not share its annotation ids with every approach.
aligned_baseline_path = 'medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv'
aligned_llm_files = [f for f in os.listdir('llama') if f.startswith('aligned_annotations-dpoc-llm') and f.endswith('_metrics.csv')]

if aligned_llm_files:
    # Loop-invariant: the same baseline is used for every approach.
    medical_specialist_test_content = pd.read_csv(aligned_baseline_path)

for filename in aligned_llm_files:
    approach = filename.split('aligned_annotations-dpoc-llm_')[1].split('_metrics.csv')[0]
    summary_filename = f'llama/annotations-dpoc-llm_{approach}_summary.csv'
    comparison_filename = f'llama/annotations-dpoc-llm_{approach}_comparison.csv'

    print(f"Comparing approach: {filename}")
    test_content = pd.read_csv(f'llama/{filename}')

    # Diagnostic: baseline annotation ids that are missing from this
    # approach's aligned metrics file.
    differences = medical_specialist_test_content[~medical_specialist_test_content['annotation id'].isin(test_content['annotation id'])]
    print(differences['annotation id'])

    am.compare_clusters(
        aligned_baseline_path,
        f'llama/{filename}',
        'medical_specialist/annotations-dpoc-medical_specialist_summary_435.csv',
        summary_filename,
        comparison_filename,
    )
    # if filename.startswith('annotations-dpoc-llm_augmented_10_tf_idf_custom_shot') and filename.endswith('_metrics.csv'):
    #     approach = filename.split('annotations-dpoc-llm_augmented_')[1].split('_metrics.csv')[0]
    #     summary_filename = f'llama/annotations-dpoc-llm_augmented_{approach}_summary.csv'
    #     comparison_filename = f'llama/annotations-dpoc-llm_augmented_{approach}_comparison.csv'

    #     print(f"Comparing approach: Augmented {approach}")
    #     print(am.compare_clusters('medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv',
    #                               f'llama/{filename}',
    #                               'medical_specialist/annotations-dpoc-medical_specialist_summary_435.csv',
    #                               summary_filename,
    #                               comparison_filename))

# # Compare the aligned clusters
# am.compare_clusters('medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv',
#                     'llama/aligned_annotations-dpoc-llm_10_tf_idf_custom_shot_metrics.csv',
#                     'cluster 5')

#### Llama vs Baseline (using only Gabriel's 86 BioBERT test IDs)

In [None]:
# Compare every 86-filtered Llama metrics file with the specialist baseline.
# NOTE(review): the baseline files used here carry the suffix _86, whereas
# Part 1 of this notebook writes the partial baseline as _84 -- confirm that
# the _86 files exist and are the intended reference.
for filename in os.listdir('llama/86_filtered'):
    is_llm_metrics = filename.startswith('annotations-dpoc-llm_') and filename.endswith('_metrics.csv')
    if not is_llm_metrics:
        continue

    # The approach label is the part between the common prefix and suffix.
    approach = filename.split('annotations-dpoc-llm_')[1].split('_metrics.csv')[0]
    summary_filename = f'llama/86_filtered/annotations-dpoc-llm_{approach}_summary.csv'
    comparison_filename = f'llama/86_filtered/annotations-dpoc-llm_{approach}_comparison.csv'

    print(f"Comparing approach: {approach}")
    comparison_result = am.compare_clusters(
        'medical_specialist/annotations-dpoc-medical_specialist_metrics_86.csv',
        f'llama/86_filtered/{filename}',
        'medical_specialist/annotations-dpoc-medical_specialist_summary_86.csv',
        summary_filename,
        comparison_filename,
    )
    print(comparison_result)

## Part 3 - Calculating Distance and Mapping Clusters

In [None]:
from scipy.spatial.distance import cdist
import pandas as pd
import numpy as np


# Load the specialist's per-cluster statistics for both sample sizes.
medical_specialist_435 = pd.read_csv('medical_specialist/annotations-dpoc-medical_specialist_stats_435.csv')
medical_specialist_86 = pd.read_csv('medical_specialist/annotations-dpoc-medical_specialist_stats_86.csv')

# Keep only the rows produced by the 'kmeans 5' method.
is_kmeans_5_435 = medical_specialist_435['method'] == 'kmeans 5'
is_kmeans_5_86 = medical_specialist_86['method'] == 'kmeans 5'
medical_specialist_435_kmeans_5 = medical_specialist_435.loc[is_kmeans_5_435]
medical_specialist_86_kmeans_5 = medical_specialist_86.loc[is_kmeans_5_86]

# Feature columns for matching: the first seven columns whose name ends in
# 'mean' (the regex also matches names ending in 'norm_mean').
mean_columns_435 = medical_specialist_435_kmeans_5.filter(regex='mean$')
mean_columns_86 = medical_specialist_86_kmeans_5.filter(regex='mean$')
medical_specialist_categories_435_kmeans_5_cat = mean_columns_435.iloc[:, :7]
medical_specialist_categories_86_kmeans_5_cat = mean_columns_86.iloc[:, :7]


def match_clusters(df1, df2, distance_metric='euclidean'):
    """Greedily pair rows of ``df1`` with rows of ``df2`` by smallest distance.

    The pair with the globally smallest distance is taken first, both rows
    are then excluded, and the process repeats until one side is exhausted
    or no finite distance is left. Ties go to the pair that comes first in
    row-major order (lowest ``df1`` row, then lowest ``df2`` row).

    Args:
        df1: DataFrame (or 2-D array-like) with one row per cluster.
        df2: DataFrame (or 2-D array-like) with the same number of columns.
        distance_metric: Metric name or callable forwarded to
            ``scipy.spatial.distance.cdist``.

    Returns:
        A list of ``(df1_position, df2_position, distance)`` tuples in the
        order the pairs were chosen. Positions are 0-based row positions, not
        index labels. Pairs whose distance is NaN or ``+inf`` are never
        matched. An empty input yields an empty list.
    """
    # Accept DataFrames as well as plain array-likes.
    arr1 = df1.to_numpy() if hasattr(df1, 'to_numpy') else np.asarray(df1)
    arr2 = df2.to_numpy() if hasattr(df2, 'to_numpy') else np.asarray(df2)

    # Nothing to pair up when either side has no rows.
    if len(arr1) == 0 or len(arr2) == 0:
        return []

    distances = cdist(arr1, arr2, metric=distance_metric)

    # Working copy in which unusable cells are +inf: NaN distances from the
    # start, then every row/column that has already been matched.
    remaining = np.where(np.isnan(distances), np.inf, distances)
    n_cols = remaining.shape[1]

    matches = []
    # At most min(rows, cols) pairs can ever be formed.
    for _ in range(min(remaining.shape)):
        # argmin on the flattened matrix returns the first minimum in
        # row-major order.
        row, col = divmod(int(np.argmin(remaining)), n_cols)
        best = remaining[row, col]
        if best == np.inf:
            # Only unusable cells are left.
            break

        matches.append((row, col, best))

        # Exclude both matched rows from further consideration.
        remaining[row, :] = np.inf
        remaining[:, col] = np.inf

    return matches

# Pair the specialist's 'kmeans 5' clusters from the 435-sample run (df1)
# with those from the 86-sample run (df2).
matches = match_clusters(
    medical_specialist_categories_435_kmeans_5_cat,
    medical_specialist_categories_86_kmeans_5_cat,
)

# One line per pair, in the order the pairs were chosen.
print("Matches (df1_index, df2_index, distance):")
for match in matches:
    print(f"Row {match[0]} in df1 matched with Row {match[1]} in df2 (distance: {match[2]:.2f})")

### BioBERT matching Medical specialist's Clusters (Kmeans 5)

In [None]:
# Load the BioBERT per-cluster statistics and keep the 'kmeans 5' rows.
biobert_86 = pd.read_csv('biobert/annotations-dpoc-biobert_stats_86.csv')
is_kmeans_5 = biobert_86['method'] == 'kmeans 5'
biobert_categories_86_kmeans_5 = biobert_86.loc[is_kmeans_5]

# Feature columns: the first seven columns whose name ends in 'mean'.
biobert_mean_columns = biobert_categories_86_kmeans_5.filter(regex='mean$')
biobert_categories_86_kmeans_5_cat = biobert_mean_columns.iloc[:, :7]

# Specialist clusters (86-sample, kmeans 5) are df1; BioBERT clusters are df2.
biobert_matches = match_clusters(
    medical_specialist_categories_86_kmeans_5_cat,
    biobert_categories_86_kmeans_5_cat,
)

print("Matches (df1_index, df2_index, distance):")
for match in biobert_matches:
    print(f"Row {match[0]} in df1 matched with Row {match[1]} in df2 (distance: {match[2]:.2f})")

In [None]:
# Same matching as the 'kmeans 5' BioBERT cell, but against the BioBERT rows
# whose method is 'kmeans n'.
# NOTE(review): the variable names below still say 'kmeans_5' although the
# filter selects 'kmeans n'; the specialist side (df1) is the kmeans 5 set.
biobert_86 = pd.read_csv('biobert/annotations-dpoc-biobert_stats_86.csv')
is_kmeans_n = biobert_86['method'] == 'kmeans n'
biobert_categories_86_kmeans_5 = biobert_86.loc[is_kmeans_n]

# Feature columns: the first seven columns whose name ends in 'mean'.
biobert_mean_columns = biobert_categories_86_kmeans_5.filter(regex='mean$')
biobert_categories_86_kmeans_5_cat = biobert_mean_columns.iloc[:, :7]

biobert_matches = match_clusters(
    medical_specialist_categories_86_kmeans_5_cat,
    biobert_categories_86_kmeans_5_cat,
)

print("Matches (df1_index, df2_index, distance):")
for match in biobert_matches:
    print(f"Row {match[0]} in df1 matched with Row {match[1]} in df2 (distance: {match[2]:.2f})")

### Llama matching Medical specialist's Clusters

In [None]:
def gen_distance_matches(df_medical_specialist, df_comparison, distance_metric='euclidean'):
    """Match comparison clusters to specialist clusters, method by method.

    For every distinct value of ``method`` in ``df_medical_specialist``, the
    rows of both frames for that method are reduced to their first eight
    columns whose name ends in ``norm_mean`` and paired with
    ``match_clusters``.

    Args:
        df_medical_specialist: Reference statistics; needs a ``method``
            column and ``*norm_mean`` columns.
        df_comparison: Statistics of the approach under comparison, with the
            same layout.
        distance_metric: Metric forwarded to ``match_clusters``.

    Returns:
        DataFrame with columns ``cluster``, ``method``,
        ``cat_distribution_distance`` and
        ``medical_specialist_respective_cluster``: one row per comparison
        row of each method. Rows left unmatched hold NaN in the last two
        columns. Methods absent from ``df_comparison`` contribute no rows.
    """
    result_columns = ['cluster', 'method', 'cat_distribution_distance',
                      'medical_specialist_respective_cluster']
    per_method_frames = []

    for method in df_medical_specialist['method'].unique():
        reference_rows = df_medical_specialist[df_medical_specialist['method'] == method]
        comparison_rows = df_comparison[df_comparison['method'] == method]
        reference_cat = reference_rows.filter(regex='norm_mean$').iloc[:, :8]
        comparison_cat = comparison_rows.filter(regex='norm_mean$').iloc[:, :8]

        n_clusters = len(comparison_cat)
        if n_clusters == 0:
            # The comparison has no rows for this method: nothing to match.
            continue

        # NaN marks comparison rows that end up without a partner.
        distance = np.full(n_clusters, np.nan)
        respective_cluster = np.full(n_clusters, np.nan)
        for reference_pos, comparison_pos, match_distance in match_clusters(
                reference_cat, comparison_cat, distance_metric):
            distance[comparison_pos] = match_distance
            respective_cluster[comparison_pos] = reference_pos

        # NOTE(review): both cluster ids are row positions within the method
        # (0..n-1), not values read from a 'cluster' column; this assumes the
        # stats rows of each method are ordered by cluster id -- confirm.
        per_method_frames.append(pd.DataFrame({
            'cluster': np.arange(n_clusters, dtype='int64'),
            'method': method,
            'cat_distribution_distance': distance,
            'medical_specialist_respective_cluster': respective_cluster,
        }))

    if not per_method_frames:
        return pd.DataFrame(columns=result_columns)

    # A single concat keeps the numeric dtypes; growing an empty seed frame
    # inside the loop would not.
    return pd.concat(per_method_frames, ignore_index=True)[result_columns]

def _attach_distance_matches(stats, reference_stats):
    """Return ``(stats with fresh matching columns, matches)``.

    Columns that ``gen_distance_matches`` produces and that ``stats`` already
    has (from an earlier run of this notebook) are dropped before merging, so
    the recomputed values replace the stale ones instead of being discarded.
    """
    matches = gen_distance_matches(reference_stats, stats)
    merge_keys = ['method', 'cluster']
    stale_columns = [col for col in matches.columns
                     if col in stats.columns and col not in merge_keys]
    merged = stats.drop(columns=stale_columns).merge(matches, on=merge_keys, how='left')
    return merged, matches


# 10-shot TF-IDF custom run: add the matching columns and overwrite the CSV.
llama_categories_10 = pd.read_csv('llama/annotations-dpoc-llm_10_tf_idf_custom_shot_stats.csv')
llama_categories_10, distance_matches = _attach_distance_matches(llama_categories_10, medical_specialist_435)
llama_categories_10.to_csv('llama/annotations-dpoc-llm_10_tf_idf_custom_shot_stats.csv', index=False)

# Augmented 10-shot TF-IDF custom run: same treatment.
llama_categories_augmented_10 = pd.read_csv('llama/annotations-dpoc-llm_augmented_10_tf_idf_custom_shot_stats.csv')
llama_categories_augmented_10, distance_matches = _attach_distance_matches(llama_categories_augmented_10, medical_specialist_435)
llama_categories_augmented_10.to_csv('llama/annotations-dpoc-llm_augmented_10_tf_idf_custom_shot_stats.csv', index=False)



In [None]:
import os
# Add the specialist-matching columns to every '*stats.csv' file in 'llama',
# overwriting each file in place.
medical_specialist_435 = pd.read_csv('medical_specialist/annotations-dpoc-medical_specialist_stats_435.csv')
directory = 'llama'
for filename in os.listdir(directory):
    if not filename.endswith('stats.csv'):
        continue

    filepath = os.path.join(directory, filename)
    llama_categories_stats = pd.read_csv(filepath)
    print(f"Processing file: {filename}")

    # Matching relies on the '*norm_mean' columns; skip files without any.
    has_norm_mean = any(col.endswith('norm_mean') for col in llama_categories_stats.columns)
    if not has_norm_mean:
        print(f"Skipping file: {filename}...no normalized mean columns found")
        continue

    distance_matches = gen_distance_matches(medical_specialist_435, llama_categories_stats)

    # Columns present on both sides (other than the merge keys) are results
    # of an earlier run; remove them so the merge brings in the new values.
    merge_on = ['method', 'cluster']
    overlapping_cols = [
        col
        for col in set(llama_categories_stats.columns) & set(distance_matches.columns)
        if col not in merge_on
    ]
    llama_categories_stats.drop(columns=overlapping_cols, inplace=True)

    llama_categories_stats = llama_categories_stats.merge(distance_matches, on=['method', 'cluster'], how='left')
    llama_categories_stats.to_csv(filepath, index=False)
        # print(f"Processing file: {llama_categories_stats}")
        # Add your processing code here