In [None]:
import pandas as pd

Load the CheXpert labels file

In [None]:
# Path to the gzip-compressed CSV of CheXpert labels for MIMIC-CXR.
# NOTE(review): absolute, machine-specific path — not portable across hosts.
chexpert_file_path = "/opt/gpudata/mimic-cxr/mimic-cxr-2.0.0-chexpert.csv.gz"
# Later cells read a `study_id` column and one column per finding name from this frame.
chexpert_df = pd.read_csv(chexpert_file_path)

Load the split file and select the CheXpert label rows for validation-set studies

In [None]:
# Split assignment table: tags each row with a `split` value and a `study_id`.
file_path = '/opt/gpudata/mimic-cxr/mimic-cxr-2.0.0-split.csv.gz'
split_df = pd.read_csv(file_path)

# Rows assigned to the validation split.
is_validate = split_df['split'] == 'validate'
validate_df = split_df.loc[is_validate]

# Distinct studies in the validation split, then the CheXpert label rows
# belonging to those studies.
validate_study_ids = validate_df['study_id'].unique()
in_validation = chexpert_df['study_id'].isin(validate_study_ids)
validate_chexpert_df = chexpert_df.loc[in_validation]

In [None]:
def check_percent_prevalence(df, label):
    """Return the percentage of rows in ``df`` whose ``label`` column equals 1.

    Rows holding any other value (including missing values) are not counted
    as positive but still contribute to the denominator.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a column named ``label``.
    label : str
        Name of the column to inspect.

    Returns
    -------
    float
        Percentage between 0 and 100; ``0.0`` when ``df`` has no rows.

    Raises
    ------
    KeyError
        If ``label`` is not a column of ``df``.
    """
    # Evaluate the mask first so a missing column still raises KeyError,
    # even for an empty frame.
    is_positive = df[label] == 1
    total_rows = len(df)
    if total_rows == 0:
        # An empty frame has no prevalence to report; avoid ZeroDivisionError.
        return 0.0
    positive_count = int(is_positive.sum())
    return (positive_count / total_rows) * 100

In [None]:
def print_percent(df, label):
    """Print ``label`` followed by its percent prevalence in ``df``.

    The value printed is whatever ``check_percent_prevalence(df, label)``
    returns; nothing is returned from this function.
    """
    print(label, ":", check_percent_prevalence(df, label))

# Finding names used as column names throughout the notebook.
chexpert_labels = [
    "Atelectasis",
    "Cardiomegaly",
    "Consolidation",
    "Edema",
    "Enlarged Cardiomediastinum",
    "Fracture",
    "Lung Lesion",
    "Lung Opacity",
    "No Finding",
    "Pleural Effusion",
    "Pleural Other",
    "Pneumonia",
    "Pneumothorax",
    "Support Devices",
]

# Percent of validation-split label rows marked positive, per finding.
print("Validation Set Prevalence")
for finding in chexpert_labels:
    print_percent(validate_chexpert_df, finding)

In [None]:
# Same prevalence report, computed over every row of the CheXpert table
# rather than only the validation split.
print("All Sample Prevalence")
for finding in chexpert_labels:
    print_percent(chexpert_df, finding)

In [None]:
# CSV of precomputed cosine-similarity scores.
# Later cells index this frame by finding name (e.g. "Atelectasis") and read `study_id` from it.
# NOTE(review): absolute path under a user's home directory — not portable across machines.
cosine_similarity_file_path = "/home/imadejski/ctds-search-model/data/cosine_similarity_csvs/avg_cosine_similarity_single_pos_label.csv"
cossim_df = pd.read_csv(cosine_similarity_file_path)

In [None]:
# Share of rows whose "Atelectasis" similarity score exceeds the cutoff.
threshold = 0.25
above_threshold = cossim_df["Atelectasis"] > threshold
count_above_threshold = len(cossim_df[above_threshold])
total_rows = len(cossim_df)
percentage_above_threshold = (count_above_threshold / total_rows) * 100
print(percentage_above_threshold)

In [None]:
def chexpert_positive_matches(df, label, value_to_get="study_id"):
    """List the ``value_to_get`` entries of every row where ``label`` equals 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the ``label`` and ``value_to_get`` columns.
    label : str
        Column tested for equality with 1.
    value_to_get : str, default "study_id"
        Column whose values are returned.

    Returns
    -------
    list
        Values in row order; duplicates are kept.
    """
    is_positive = df[label] == 1
    return df.loc[is_positive, value_to_get].tolist()

In [None]:
def cossim_top_n_matches(df, label, top_n, value_to_get="study_id"):
    """List ``value_to_get`` for the ``top_n`` rows with the highest ``label`` score.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the ``label`` and ``value_to_get`` columns.
    label : str
        Column to rank by, in descending order.
    top_n : int
        Number of leading rows to keep after ranking.
    value_to_get : str, default "study_id"
        Column whose values are returned.

    Returns
    -------
    list
        Values ordered from highest to lowest ``label`` score.
    """
    ranked = df.sort_values(by=label, ascending=False)
    return ranked[value_to_get].head(top_n).tolist()

In [None]:
def check_accuracy(chexpert_df, cossim_df, label):
    """Percent overlap between CheXpert positives and top-ranked similarity hits.

    Collects the set of ``study_id`` values CheXpert marks positive for
    ``label``, takes that many top-scoring rows from ``cossim_df``, and
    returns ``|intersection| / |union| * 100`` of the two ID sets (a Jaccard
    index expressed as a percentage).

    Parameters
    ----------
    chexpert_df : pandas.DataFrame
        Label table passed to ``chexpert_positive_matches``.
    cossim_df : pandas.DataFrame
        Score table passed to ``cossim_top_n_matches``.
    label : str
        Column name looked up in both tables.

    Returns
    -------
    float
        Overlap percentage; ``0.0`` when both ID sets are empty.
    """
    expected_ids = set(chexpert_positive_matches(chexpert_df, label))
    # Retrieve as many candidates as there are distinct positive studies.
    retrieved_ids = set(cossim_top_n_matches(cossim_df, label, len(expected_ids)))

    combined = expected_ids | retrieved_ids
    if not combined:
        return 0.0

    shared = expected_ids & retrieved_ids
    return (len(shared) / len(combined)) * 100
    

In [None]:
# Overlap percentage per finding, validation labels vs. similarity ranking.
for label in chexpert_labels:
    overlap_pct = check_accuracy(validate_chexpert_df, cossim_df, label)
    print(label, overlap_pct)

In [None]:
# Study IDs that CheXpert marks positive for Atelectasis in the validation split.
chex_pos_list = chexpert_positive_matches(validate_chexpert_df, "Atelectasis")
# Fixed: the original referenced an undefined name `pos_list` (NameError).
chex_pos_set = set(chex_pos_list)

# Fixed: the original call omitted the required `top_n` argument and used a
# misspelled column name; "Atelectasis" is the column the other cells read from
# cossim_df. As in check_accuracy, retrieve as many top-ranked studies as there
# are distinct CheXpert positives.
cossim_pos_list = cossim_top_n_matches(cossim_df, "Atelectasis", len(chex_pos_set))