In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import ttest_ind

In [None]:
split_path = "/opt/gpudata/mimic-cxr/mimic-cxr-2.0.0-split.csv"
split_df = pd.read_csv(split_path)

# Validation-split rows, reduced to one row per study so that the printed
# length is a count of unique studies rather than a count of rows.
# NOTE(review): assumes the split CSV has a `study_id` column (the
# cosine-similarity frame loaded below does) -- confirm against the file.
unique_study_ids = (
    split_df[split_df["split"] == "validate"]
    .drop_duplicates(subset="study_id")
)
print(len(unique_study_ids))

In [None]:
# Cosine-similarity CSV for the BioViL-T base model directory.
cosine_similarity_biovilt_path = (
    "/opt/gpudata/imadejski/search-model/ctds-search-model/data/biovilt_base_model/"
    "biovilt_validate_single_pos_queries_mimic/cosine_similarity.csv"
)
cosine_similarity_biovilt_df = pd.read_csv(cosine_similarity_biovilt_path)

# Total number of rows, followed by the number of distinct study ids.
n_similarity_rows = len(cosine_similarity_biovilt_df)
n_similarity_studies = cosine_similarity_biovilt_df["study_id"].nunique(dropna=False)
print(n_similarity_rows)
print(n_similarity_studies)

In [None]:
# Resampling summary CSV (per its file name) for the BioViL-T base model.
biovilt_resampling_path = (
    "/opt/gpudata/imadejski/search-model/ctds-search-model/data/biovilt_base_model/"
    "biovilt_validate_single_pos_queries_mimic/top_n_accuracy_results_resampling.csv"
)
biovilt_resampling_df = pd.read_csv(filepath_or_buffer=biovilt_resampling_path)

In [None]:
# Display the loaded resampling summary frame for inspection.
biovilt_resampling_df

In [None]:
# Short display names used only for this figure.
label_display_names = {
    "Enlarged Cardiomediastinum": "Enl Card",
}
embedding_type_display_names = {
    "average_cosine_similarity_max_accuracy": "Study Max, Query Average",
    "average_cosine_similarity_mean_accuracy": "Study Average, Query Average",
    "max_cosine_similarity_max_accuracy": "Study Max, Query Max",
    "max_cosine_similarity_mean_accuracy": "Study Average, Query Max",
}

# Relabel a copy instead of overwriting biovilt_resampling_df: cells further
# down filter that frame on the raw embedding_type keys, and leaving it
# untouched keeps this cell safe to re-run.
biovilt_resampling_plot_df = biovilt_resampling_df.assign(
    label=biovilt_resampling_df['label'].replace(label_display_names),
    embedding_type=biovilt_resampling_df['embedding_type'].replace(
        embedding_type_display_names
    ),
)

fig, ax = plt.subplots(figsize=(12, 8))

# One group of bars per label, one bar per embedding type within a group.
labels = biovilt_resampling_plot_df['label'].unique()
embedding_types = biovilt_resampling_plot_df['embedding_type'].unique()
x = range(len(labels))

# Width of each bar and the horizontal offset of each embedding type's bar.
bar_width = 0.2
offsets = [bar_width * i for i in range(len(embedding_types))]

for i, emb_type in enumerate(embedding_types):
    # Align rows to `labels` explicitly so every bar sits above its own tick
    # label regardless of the row order in the CSV.
    data_for_emb = (
        biovilt_resampling_plot_df[
            biovilt_resampling_plot_df['embedding_type'] == emb_type
        ]
        .set_index('label')
        .reindex(labels)
    )
    means = data_for_emb['mean']
    # matplotlib expects error-bar lengths measured from the bar height,
    # not absolute interval bounds.
    ci_lower = data_for_emb['mean'] - data_for_emb['ci_lower']
    ci_upper = data_for_emb['ci_upper'] - data_for_emb['mean']

    ax.bar(
        [p + offsets[i] for p in x],
        means,
        width=bar_width,
        label=emb_type,
        yerr=[ci_lower, ci_upper],
        capsize=5,
    )

# Axis labels, title, and legend.
ax.set_xlabel('Labels')
ax.set_ylabel('Mean Accuracy')
ax.set_title('BioViL-T Base Model Accuracy by Label and Embedding Type with Resampling')
# Centre each tick under its whole group of bars.
group_centre_offset = bar_width * (len(embedding_types) - 1) / 2
ax.set_xticks([p + group_centre_offset for p in x])
ax.set_xticklabels(labels, rotation=90)
ax.legend(title='Embedding Types')

# Tighten the layout so the rotated tick labels fit, then render the figure.
fig.tight_layout()
plt.show()

In [None]:
# "All resampling" results CSV (per its file name) for the BioViL-T base model.
biovilt_all_resampling_path = (
    "/opt/gpudata/imadejski/search-model/ctds-search-model/data/biovilt_base_model/"
    "biovilt_validate_single_pos_queries_mimic/top_n_accuracy_results_all_resampling.csv"
)
biovilt_all_resampling_df = pd.read_csv(filepath_or_buffer=biovilt_all_resampling_path)

In [None]:
# Display the loaded all-resampling frame for inspection.
biovilt_all_resampling_df

In [None]:
results = []

# Get unique labels and embedding types
labels = biovilt_all_resampling_df['label'].unique()
embedding_types = biovilt_all_resampling_df['embedding_type'].unique()

# The six unordered pairs of the four embedding types. The order inside each
# pair matters downstream: a positive t-statistic favours the first entry.
combinations =  (("max_cosine_similarity_max_accuracy", "average_cosine_similarity_max_accuracy"),
                ("max_cosine_similarity_max_accuracy", "max_cosine_similarity_mean_accuracy"),
                ("max_cosine_similarity_max_accuracy", "average_cosine_similarity_mean_accuracy"),
                ("average_cosine_similarity_max_accuracy", "max_cosine_similarity_mean_accuracy"),
                ("average_cosine_similarity_max_accuracy", "average_cosine_similarity_mean_accuracy"),
                ("max_cosine_similarity_mean_accuracy", "average_cosine_similarity_mean_accuracy")
)

# Number of pairwise tests run per label; used for the Bonferroni adjustment.
n_comparisons = len(combinations)

# Perform t-tests for each label
for label in labels:
    # Filter the data for the current label
    label_data = biovilt_all_resampling_df[biovilt_all_resampling_df['label'] == label]

    for emb_type1, emb_type2 in combinations:
        # Resampled values for each of the two embedding types being compared
        data1 = label_data[label_data['embedding_type'] == emb_type1]['value']
        data2 = label_data[label_data['embedding_type'] == emb_type2]['value']

        # Welch's t-test (no equal-variance assumption)
        t_stat, p_value = ttest_ind(data1, data2, equal_var=False)

        # Bonferroni adjustment MULTIPLIES the p-value by the number of
        # comparisons (capped at 1); dividing would make every test more
        # lenient instead of stricter. np.minimum propagates NaN p-values.
        corrected_p_value = np.minimum(p_value * n_comparisons, 1.0)

        # Store the result
        results.append({
            'Label': label,
            'Embedding Type 1': emb_type1,
            'Embedding Type 2': emb_type2,
            'T-Statistic': t_stat,
            'P-Value': p_value,
            'Bonferroni Corrected P-Value': corrected_p_value
        })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

In [None]:
# Display the pairwise t-test results table.
results_df

In [None]:
def determine_winner(row):
    """Pick the embedding type favoured by one pairwise comparison.

    Args:
        row: Mapping-like record with the keys 'Bonferroni Corrected P-Value',
            'T-Statistic', 'Embedding Type 1' and 'Embedding Type 2'.

    Returns:
        'Embedding Type 1' when the corrected p-value is below 0.05 and the
        t-statistic is positive, 'Embedding Type 2' when it is below 0.05 and
        the t-statistic is negative, otherwise None.
    """
    # Guard clause: anything that is not strictly below 0.05 (NaN included)
    # yields no winner.
    if not row['Bonferroni Corrected P-Value'] < 0.05:
        return None

    t_statistic = row['T-Statistic']
    if t_statistic > 0:
        return row['Embedding Type 1']
    if t_statistic < 0:
        return row['Embedding Type 2']
    # Zero or NaN t-statistic: no direction to report.
    return None

# Label every comparison row with its winner (None when there is none).
winner_per_row = results_df.apply(determine_winner, axis=1)
results_df['Winner'] = winner_per_row

# Tally wins per embedding type; value_counts skips the None entries.
win_counts = results_df['Winner'].value_counts()

# Show the tally.
print(win_counts)

In [None]:
# All four runs live under the same directory and share one results file name.
param_search_dir = "/opt/gpudata/imadejski/search-model/ctds-search-model/data/param_search_v3_biovilt"
resampling_filename = "top_n_accuracy_results_resampling.csv"

# Run 0: True_igl_tgl
biovilt_true_igl_tgl_resampling_path = f"{param_search_dir}/model_run_0_True_igl_tgl/{resampling_filename}"
biovilt_true_igl_tgl_resampling_df = pd.read_csv(biovilt_true_igl_tgl_resampling_path)

# Run 1: True_igl_tg
biovilt_true_igl_tg_resampling_path = f"{param_search_dir}/model_run_1_True_igl_tg/{resampling_filename}"
biovilt_true_igl_tg_resampling_df = pd.read_csv(biovilt_true_igl_tg_resampling_path)

# Run 2: True_ig_tgl
biovilt_true_ig_tgl_resampling_path = f"{param_search_dir}/model_run_2_True_ig_tgl/{resampling_filename}"
biovilt_true_ig_tgl_resampling_df = pd.read_csv(biovilt_true_ig_tgl_resampling_path)

# Run 3: True_ig_tg
biovilt_true_ig_tg_resampling_path = f"{param_search_dir}/model_run_3_True_ig_tg/{resampling_filename}"
biovilt_true_ig_tg_resampling_df = pd.read_csv(biovilt_true_ig_tg_resampling_path)

In [None]:
# Tag each results frame with the model it came from, then stack them.
biovilt_resampling_df['model'] = 'Base'
biovilt_true_ig_tg_resampling_df['model'] = 'IG_TG'
biovilt_true_igl_tg_resampling_df['model'] = 'IGL_TG'
biovilt_true_ig_tgl_resampling_df['model'] = 'IG_TGL'
biovilt_true_igl_tgl_resampling_df['model'] = 'IGL_TGL'

models_df = pd.concat([biovilt_resampling_df,
                biovilt_true_ig_tg_resampling_df,
                biovilt_true_igl_tg_resampling_df,
                biovilt_true_ig_tgl_resampling_df,
                biovilt_true_igl_tgl_resampling_df],
                ignore_index=True)

# Keep only the "study average, query average" rows. biovilt_resampling_df may
# carry the plot display name instead of the raw key if its embedding_type
# column was relabelled for plotting, so accept both spellings; matching only
# the raw key would silently drop every 'Base' row in that case.
mean_mean_embedding_names = [
    'average_cosine_similarity_mean_accuracy',
    'Study Average, Query Average',
]
# .copy() so the label rewrite below acts on an independent frame rather than
# on a slice of models_df (avoids pandas' SettingWithCopyWarning).
filtered_df = models_df[models_df['embedding_type'].isin(mean_mean_embedding_names)].copy()

filtered_df['label'] = filtered_df['label'].replace({
        "Enlarged Cardiomediastinum": "Enl Card"
    })

labels = filtered_df['label'].unique()
models = filtered_df['model'].unique()

# Create the bar plot
fig, ax = plt.subplots(figsize=(12, 6))
bar_width = 0.15
positions = range(len(labels))

# Plot bars for each model within each label group
for idx, model in enumerate(models):
    # Align rows to `labels` explicitly so every bar sits above its own tick
    # label even if the per-model CSVs list labels in different orders.
    model_data = (
        filtered_df[filtered_df['model'] == model]
        .set_index('label')
        .reindex(labels)
    )
    ax.bar(
        [p + bar_width * idx for p in positions],
        model_data['mean'],
        width=bar_width,
        # Error-bar lengths are measured from the bar height.
        yerr=[model_data['mean'] - model_data['ci_lower'], model_data['ci_upper'] - model_data['mean']],
        capsize=5,
        label=model
    )

# Add labels and title
ax.set_xlabel('Labels')
ax.set_ylabel('Mean Accuracy')
ax.set_title('Mean Accuracy for Different Models by Label')
# Centre each tick under its whole group of bars.
ax.set_xticks([p + bar_width * (len(models) - 1) / 2 for p in positions])
ax.set_xticklabels(labels, rotation=90)

# Place the legend outside the axes so it does not cover any bars.
ax.legend(title='Models', bbox_to_anchor=(1.05, 1), loc='upper left')

# Adjust layout to make room for rotated x labels
fig.tight_layout()

# Show the plot
plt.show()