In [None]:
import pandas as pd

In [None]:
# Input file locations.
cosine_path = "/home/imadejski/ctds-search-model/data/multiple_pos_cosine_similarity_new2.csv"
labels_path = "/opt/gpudata/mimic-cxr/mimic-cxr-2.0.0-chexpert.csv.gz"
split_path = "/opt/gpudata/mimic-cxr/mimic-cxr-2.0.0-split.csv.gz"

# Read the similarity table, the label table and the split table.
cosine_df = pd.read_csv(cosine_path)
labels_df = pd.read_csv(labels_path)
split_df = pd.read_csv(split_path)

# Keep only the label rows whose study_id is assigned to the 'validate' split.
valid_df = split_df.loc[split_df['split'].eq('validate')]
valid_study_ids = valid_df['study_id'].unique()
valid_labels_df = labels_df.loc[labels_df['study_id'].isin(valid_study_ids)]

In [None]:
# Label names; each is looked up as a column of the label table and as the
# prefix of a score column in the similarity table.
labels = [
    "Atelectasis",
    "Cardiomegaly",
    "Consolidation",
    "Edema",
    "Enlarged Cardiomediastinum",
    "Fracture",
    "Lung Lesion",
    "Lung Opacity",
    "No Finding",
    "Pleural Effusion",
    "Pleural Other",
    "Pneumonia",
    "Pneumothorax",
    "Support Devices",
]

# Score variants; a score column is named "<label> <embedding type>".
embedding_types = [f'cosine_similarity_{i}' for i in range(1, 5)] + [
    'average_embedding',
    'average_cosine_similarity',
    'max_cosine_similarity',
]

In [None]:
# Reshape the wide similarity table into long form: for every
# "<label> <embedding type>" column that exists, emit the identifier columns
# plus the score (as 'cosine_similarity'), tagged with its label and
# embedding type.
id_columns = ['subject_id', 'study_id', 'dicom_id']

data = []
for label in labels:
    for emb in embedding_types:
        column_name = f'{label} {emb}'
        if column_name not in cosine_df.columns:
            print(f"Warning: Column {column_name} does not exist in cosine_df")
            continue
        long_part = (
            cosine_df[id_columns + [column_name]]
            .rename(columns={column_name: 'cosine_similarity'})
            .assign(
                label=label,
                embedding_type=emb,
                # Values that cannot be parsed as numbers become NaN.
                cosine_similarity=lambda part: pd.to_numeric(
                    part['cosine_similarity'], errors='coerce'
                ),
            )
        )
        data.append(long_part)

# Fail here rather than leaving cosine_df_transformed undefined (or stale from
# an earlier kernel run) for the cells that consume it.
if not data:
    raise ValueError(
        "No data to concatenate. Please check the input DataFrame and configurations."
    )
cosine_df_transformed = pd.concat(data, ignore_index=True)

In [None]:
# For each label, count the distinct study_ids in the validation label table
# whose value for that label equals 1.
n_counts = {
    label: len(valid_labels_df.loc[valid_labels_df[label] == 1, 'study_id'].unique())
    for label in labels
}

print(n_counts)

In [None]:
def aggregate_cosine(df, method):
    """Reduce 'cosine_similarity' to one value per (study_id, label, embedding_type).

    Args:
        df: DataFrame with 'study_id', 'label', 'embedding_type' and
            'cosine_similarity' columns.
        method: 'max' or 'mean', the reduction applied within each group.

    Returns:
        DataFrame with the three grouping columns and the reduced
        'cosine_similarity' column.

    Raises:
        ValueError: if ``method`` is neither 'max' nor 'mean'.
    """
    grouped = df.groupby(['study_id', 'label', 'embedding_type'])['cosine_similarity']
    if method == 'max':
        return grouped.max().reset_index()
    if method == 'mean':
        return grouped.mean().reset_index()
    # An unknown method used to fall through and return None silently.
    raise ValueError(f"Unsupported aggregation method: {method!r} (expected 'max' or 'mean')")

# Aggregate the long-form scores twice: once taking the max, once the mean.
cosine_max, cosine_mean = (
    aggregate_cosine(cosine_df_transformed, how) for how in ('max', 'mean')
)

In [None]:
# For every label and embedding type, take the n highest-scoring studies
# (n = number of positive validation studies for the label) and record the
# fraction of them that are positive, for both the max- and mean-aggregated scores.
results = {}
for label in labels:
    n = n_counts[label]  # count of positives for the current label
    # Use the validation label table, i.e. the same table n was counted on,
    # so numerator and denominator describe the same set of studies.
    label_positives = valid_labels_df.loc[valid_labels_df[label] == 1, 'study_id']

    for emb in embedding_types:
        for agg_name, agg_df in (('max', cosine_max), ('mean', cosine_mean)):
            candidates = agg_df[(agg_df['label'] == label) & (agg_df['embedding_type'] == emb)]
            top_n = candidates.nlargest(n, 'cosine_similarity')['study_id']
            results[f'{label} {emb}_{agg_name}'] = top_n.isin(label_positives).sum() / n

print(results)

In [None]:
# Embedding types included in the saved accuracy table.
filtered_embedding_types = ['average_embedding', 'average_cosine_similarity', 'max_cosine_similarity']

# results maps label -> {"<embedding type>_<max|mean>_accuracy": score}.
results = {}
for label in labels:
    n = n_counts[label]  # count of positives for the current label
    label_positives = valid_labels_df[valid_labels_df[label] == 1]['study_id']

    label_results = {}
    for emb in filtered_embedding_types:
        for agg_name, agg_df in (('max', cosine_max), ('mean', cosine_mean)):
            # Rows for this label / embedding type, ranked by score; keep the top n.
            candidates = agg_df[(agg_df['label'] == label) & (agg_df['embedding_type'] == emb)]
            top_n = candidates.nlargest(n, 'cosine_similarity')['study_id']
            # Fraction of the top-n studies that are labelled positive.
            label_results[f'{emb}_{agg_name}_accuracy'] = top_n.isin(label_positives).sum() / n

    results[label] = label_results

# One row per label, one column per accuracy score.
results_df = pd.DataFrame.from_dict(results, orient='index')

# Persist the table, writing the label index under the 'Label' header.
results_df.to_csv('/home/imadejski/ctds-search-model/data_analysis/accuracy_results.csv', index=True, index_label='Label')

print(results_df)

In [None]:
# Embedding types included in the saved top-k table.
filtered_embedding_types = ['average_embedding', 'average_cosine_similarity', 'max_cosine_similarity']

# top_k_results maps label -> {"<embedding type>_<max|mean>_accuracy_top_<k>": score}.
top_k_results = {}
for label in labels:
    # Positive validation studies for THIS label. The scores below must be
    # checked against this set, not against a variable left over from an
    # earlier cell (which would hold only the last label's positives).
    label_positives_k = valid_labels_df[valid_labels_df[label] == 1]['study_id']

    label_results_k = {}
    for emb in filtered_embedding_types:
        max_df = cosine_max[(cosine_max['label'] == label) & (cosine_max['embedding_type'] == emb)]
        mean_df = cosine_mean[(cosine_mean['label'] == label) & (cosine_mean['embedding_type'] == emb)]

        # Fraction of the k highest-scoring studies that are labelled positive.
        for k in [5, 10, 15]:
            top_k_max = max_df.nlargest(k, 'cosine_similarity')['study_id']
            top_k_mean = mean_df.nlargest(k, 'cosine_similarity')['study_id']

            label_results_k[f'{emb}_max_accuracy_top_{k}'] = top_k_max.isin(label_positives_k).sum() / k
            label_results_k[f'{emb}_mean_accuracy_top_{k}'] = top_k_mean.isin(label_positives_k).sum() / k

    top_k_results[label] = label_results_k

# One row per label, one column per accuracy score.
top_k_results_df = pd.DataFrame.from_dict(top_k_results, orient='index')

# Persist the table, writing the label index under the 'Label' header.
top_k_results_df.to_csv('/home/imadejski/ctds-search-model/data_analysis/mimic_top_k_accuracy_results.csv', index=True, index_label='Label')

print(top_k_results_df)