In [None]:
import pandas as pd

In [None]:
# Input files: the per-series label table and the table of cosine-similarity scores.
labels_path = "/opt/gpudata/midrc-sift/labels.csv"
cosine_path = '/home/imadejski/ctds-search-model/data/midrc_multiple_pos_cossim.csv'

# Both CSVs are read with pandas' default parsing options.
labels_df = pd.read_csv(labels_path)
cosine_df = pd.read_csv(cosine_path)

In [None]:
# Finding names evaluated in this notebook. Each one is used as a column name in
# labels_df and as the prefix of the '<label> <embedding type>' columns in cosine_df.
labels = [
    'Blood Stagnation',
    'Bronchiectasis and Infection',
    'Bronchitis',
    'Bullae',
    'COVID-19 Ground Glass',
    'Calcification',
    'Chronic Bronchitis with Infection',
    'Clavicle Fracture (staleness)',
    'Diaphragmatic Hernia',
    'Emphysema',
    'Encapsulated Effusion',
    'Excessive Blood Flow',
    'Fibrogenesis',
    'Hardening of the Arteries',
    'Heart Disease-Pulmonary Congestion',
    'Honeycomb Lung Tuberculosis',
    'Hydropneumothorax',
    'Infection',
    'Interlobar Pleurisy',
    'Interstitial Pneumonia',
    'Intraperitoneal Free Air',
    'Localized Diaphragmatic Eventration',
    'Lung Cyst/Bronchial Cyst',
    'Mass',
    'Metastases',
    'Miliary Tuberculosis and Other Disseminated Tuberculosis',
    'No Finding',
    'Nodule',
    'Pleural Calcification',
    'Pleural Effusion',
    'Pleural Fibrosis with Calcification',
    'Pleural Thickening',
    'Pleurisy',
    'Pneumonia',
    'Pneumothorax',
    'Postoperative Denaturation',
    'Pulmonary Atelectasis',
    'Pulmonary Hilars Enlarged and Thicken',
    'Pulmonary Mesenchyme Denaturation',
    'Rib Deformity',
    'Rib Fracture',
    'Rib Fracture (staleness)',
    'Secondary Pulmonary Tuberculosis',
    'Subcutaneous Emphysema',
    'Tuberculous Fibrosis of Lung',
]

# Suffixes of the cosine_df score columns, one per scoring variant.
embedding_types = [
    'query_1',
    'query_2',
    'query_3',
    'query_4',
    'avg_emb',
    'max_cossim',
]

In [None]:
# Reshape the wide cosine table into long form: for every '<label> <embedding type>'
# column that exists, emit one row per cosine_df row carrying series_uid, the score
# (coerced to numeric, unparseable values become NaN), the label and the embedding type.
data = []
for label in labels:
    for emb in embedding_types:
        column_name = f'{label} {emb}'

        # Missing columns are reported and skipped rather than treated as an error.
        if column_name not in cosine_df.columns:
            print(f"Warning: Column {column_name} does not exist in cosine_df")
            continue

        long_slice = (
            cosine_df[['series_uid', column_name]]
            .rename(columns={column_name: 'cosine_similarity'})
            .assign(
                label=label,
                embedding_type=emb,
                cosine_similarity=lambda frame: pd.to_numeric(
                    frame['cosine_similarity'], errors='coerce'
                ),
            )
        )
        data.append(long_slice)

# cosine_df_transformed is only defined when at least one column was found.
if not data:
    print("No data to concatenate. Please check the input DataFrame and configurations.")
else:
    cosine_df_transformed = pd.concat(data, ignore_index=True)

In [None]:
# For each label, count the distinct series_uid values among rows where the
# label column equals 1.
n_counts = {
    label: len(labels_df.loc[labels_df[label] == 1, 'series_uid'].unique())
    for label in labels
}

print(n_counts)

In [None]:
# For every (label, embedding type) pair, take the n highest-scoring rows, where n is
# the number of positive series for that label, and record the percentage of those
# rows whose series_uid is positive for the label in labels_df.
results_list = []

for label in labels:
    # n is shared by all embedding types of this label; nothing to score when it is 0.
    n = n_counts.get(label, 0)
    if n <= 0:
        continue

    # series_uids marked positive (== 1) for this label.
    positive_uids = labels_df.loc[labels_df[label] == 1, 'series_uid'].unique()
    label_rows = cosine_df_transformed[cosine_df_transformed['label'] == label]

    for emb in embedding_types:
        candidates = label_rows[label_rows['embedding_type'] == emb]
        if candidates.empty:
            continue

        # Top n rows by cosine similarity for this label and embedding type.
        top_n = candidates.nlargest(n, 'cosine_similarity')

        # Share of the top n rows that are true positives, as a percentage of n.
        correct_count = int(top_n['series_uid'].isin(positive_uids).sum())
        correct_percentage = (correct_count / n) * 100

        # One single-row frame per pair; they are stacked once after the loops.
        results_list.append(pd.DataFrame([{
            'label': label,
            'embedding_type': emb,
            'correct_percentage': correct_percentage,
        }]))

# Stack the per-pair rows into one table.
results = pd.concat(results_list, ignore_index=True)

# Display the results
print(results)

In [None]:
# Write the `results` table to disk without the index column.
csv_file_path = '/home/imadejski/ctds-search-model/data_analysis/accuracy_results_midrc.csv'
results.to_csv(path_or_buf=csv_file_path, index=False)

In [None]:
# Compute the per-label "top-n" hit rate and write it to CSV in a single cell.
#
# Start from an empty accumulator so the cell is idempotent: without the reset,
# rows already sitting in `results_list` from an earlier cell (or an earlier run
# of this one) would be concatenated again and written to the CSV as duplicates.
results_list = []

for label in labels:
    for emb in embedding_types:
        # Filter data for the current label and embedding type
        temp_df = cosine_df_transformed[
            (cosine_df_transformed['label'] == label)
            & (cosine_df_transformed['embedding_type'] == emb)
        ]

        # n = number of positive series for this label (same n for every embedding type)
        n = n_counts.get(label, 0)

        # Nothing to score when the label has no positives or no similarity rows
        if n > 0 and not temp_df.empty:
            # Top n rows by cosine similarity for this label and embedding type
            top_n = temp_df.nlargest(n, 'cosine_similarity')

            # series_uids that have a positive label in labels_df
            positive_uids = labels_df[labels_df[label] == 1]['series_uid'].unique()

            # How many of the top n rows belong to a positive series
            correct_count = top_n[top_n['series_uid'].isin(positive_uids)].shape[0]

            # Percentage of the top n that are correct
            correct_percentage = (correct_count / n) * 100

            # One single-row frame per (label, embedding type); stacked after the loops
            results_list.append(pd.DataFrame({
                'label': [label],
                'embedding_type': [emb],
                'correct_percentage': [correct_percentage]
            }))

# Concatenate all DataFrame objects in the list into a single DataFrame
results = pd.concat(results_list, ignore_index=True)

csv_file_path = '/home/imadejski/ctds-search-model/data_analysis/accuracy_results_midrc.csv'
results.to_csv(csv_file_path, index=False)

# Display the results
print(results)

In [None]:
# Precision at fixed cut-offs: for each (label, embedding type) pair and each k,
# take the k highest-scoring rows and record the percentage (out of k) whose
# series_uid is positive for the label in labels_df.
top_k_frames = []

top_k_values = [5, 10, 15]

for label in labels:
    # series_uids that have a positive label in labels_df; the same for every
    # embedding type and k, so it is looked up once per label.
    positive_uids = labels_df[labels_df[label] == 1]['series_uid'].unique()

    for emb in embedding_types:
        # Filter data for the current label and embedding type
        temp_df = cosine_df_transformed[
            (cosine_df_transformed['label'] == label)
            & (cosine_df_transformed['embedding_type'] == emb)
        ]

        # Skip pairs that have no similarity rows at all
        if temp_df.empty:
            continue

        for k in top_k_values:
            # Top k rows by cosine similarity for this label and embedding type
            top_k = temp_df.nlargest(k, 'cosine_similarity')

            # How many of the top k rows belong to a positive series
            correct_count = top_k[top_k['series_uid'].isin(positive_uids)].shape[0]

            # The denominator is always k, even if fewer than k rows were available
            correct_percentage = (correct_count / k) * 100

            # One single-row frame per (label, embedding type, k); stacked after the loops
            top_k_frames.append(pd.DataFrame({
                'label': [label],
                'embedding_type': [emb],
                'top_k': [k],
                'correct_percentage': [correct_percentage]
            }))

# Concatenate all DataFrame objects in the list into a single DataFrame
top_k_results = pd.concat(top_k_frames, ignore_index=True)

csv_file_path = '/home/imadejski/ctds-search-model/data_analysis/accuracy_results_midrc_top_k.csv'
top_k_results.to_csv(csv_file_path, index=False)

# Display the top-k table computed in this cell (not the earlier top-n `results`)
print(top_k_results)

In [None]:
# Prevalence of each labels_df column (other than the two id columns): the column sum
# divided by the number of rows in labels_df, expressed as a percentage.
total_cases = len(labels_df)
id_columns = ('series_uid', 'image_uid')

prevalence = {
    column: (labels_df[column].sum() / total_cases) * 100
    for column in labels_df.columns
    if column not in id_columns
}

# Two-column table (label name, percentage), written to CSV without the index.
prevalence_df = pd.DataFrame(list(prevalence.items()), columns=['Label', 'Prevalence (%)'])
midrc_prevalence_path = '/home/imadejski/ctds-search-model/data_analysis/midrc_prevalence.csv'
prevalence_df.to_csv(midrc_prevalence_path, index=False)