In [None]:
import pandas as pd
import requests
import torch
import numpy as np

from health_multimodal.text import TextInferenceEngine
from health_multimodal.text.utils import BertEncoderType, get_bert_inference

In [None]:
# Load the ConVIRT query table from CSV.
# NOTE(review): this frame is passed further down as the reference frame of the
# similarity computation, which reads its 'Variable' and 'Text' columns -- confirm
# the CSV provides both. The path is absolute and user-specific.
convirt_queries_df = pd.read_csv("/home/imadejski/ctds-search-model/data/convirt_queries.csv")

In [None]:
# Labels that the query phrasings below are generated for (order is preserved
# in the generated query table).
convirt_labels = [
    "Pneumothorax", "Pneumonia", "Fracture", "Cardiomegaly",
    "Pleural Effusion", "Edema", "Atelectasis", "No Finding",
]

In [None]:
def make_query_expansions(labels):
    """
    Builds the table of templated query phrasings for a collection of labels.

    Parameters:
    - labels: iterable of labels; each one is interpolated into every phrasing

    Returns:
    - DataFrame with a 'Variable' column (the label) and a 'Text' column (the
      phrasing), four rows per label, grouped by label in input order
    """
    def phrasings_for(label):
        # The four fixed phrasings, in the order they appear in the output.
        return (
            f"Findings consistent with {label}",
            f"Findings suggesting {label}",
            f"Findings are most compatible with {label}",
            f"{label} seen",
        )

    rows = [
        {"Variable": label, "Text": phrase}
        for label in labels
        for phrase in phrasings_for(label)
    ]
    return pd.DataFrame(rows)

In [None]:
# Expand every label in convirt_labels into its four templated query phrasings;
# the result has one row per (label, phrasing) with 'Variable' and 'Text' columns.
curr_queries_df = make_query_expansions(convirt_labels)

In [None]:
# Pick CUDA when it is available, otherwise fall back to CPU, and report the choice.
# NOTE(review): `device` is only printed in this cell; it is not passed to
# get_bert_inference here -- confirm where (or whether) the engine is moved to it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
# Build the text inference engine for the BIOVIL_T_BERT encoder type.
inference_engine = get_bert_inference(bert_encoder_type=BertEncoderType.BIOVIL_T_BERT)

In [None]:
# Rebind `inference_engine` to an engine built for the CXR_BERT encoder type.
# When the cells run top to bottom this replaces the BIOVIL_T_BERT engine created
# in the preceding cell, so every later cell uses this one.
# NOTE(review): confirm which encoder the saved comparison CSVs are meant to use.
inference_engine = get_bert_inference(bert_encoder_type=BertEncoderType.CXR_BERT)

In [None]:
def calculate_similarities_convirt(df1, df2, inference_engine, num_queries=5):
    """
    Scores every text in df1 against the df2 texts that share its 'Variable'.

    For each row of df1, the rows of df2 with the same 'Variable' are selected
    and inference_engine.get_pairwise_similarities(text1, text2) is evaluated
    once per matching df2 row, in df2 row order. Rows of df1 whose 'Variable'
    has no match in df2 are left out of the result.

    Parameters:
    - df1: DataFrame with 'Variable' and 'Text' columns (the texts being scored)
    - df2: DataFrame with 'Variable' and 'Text' columns (the reference texts)
    - inference_engine: object exposing get_pairwise_similarities(text1, text2);
      the returned value must support .item() to yield a scalar
    - num_queries: how many 'ConVIRT Query N' columns to emit (default 5)

    Returns:
    - result_df: DataFrame with one row per matched df1 row and the columns
      'Variable', 'Text', 'ConVIRT Query 1' .. 'ConVIRT Query <num_queries>',
      'Average Similarity' and 'Max Similarity'. Query columns beyond the
      number of matches hold NaN. If a row has more than num_queries matches,
      only the first num_queries get their own column, but the average and
      the maximum are still taken over all of them. The frame has no columns
      at all when nothing matched.
    """
    results = []

    for variable, text1 in zip(df1['Variable'], df1['Text']):
        reference_texts = df2.loc[df2['Variable'] == variable, 'Text']
        if reference_texts.empty:
            continue

        # .item() turns the similarity returned by the engine into a scalar.
        similarities = [
            inference_engine.get_pairwise_similarities(text1, text2).item()
            for text2 in reference_texts
        ]

        # Pad with NaN so every per-query column can be filled; the max(0, ...)
        # guard makes "more matches than columns" an explicit no-padding case.
        padding = [np.nan] * max(0, num_queries - len(similarities))
        comparisons = similarities + padding

        row = {'Variable': variable, 'Text': text1}
        for position in range(num_queries):
            row[f'ConVIRT Query {position + 1}'] = comparisons[position]
        # NaN-aware reductions ignore the padding added above.
        row['Average Similarity'] = np.nanmean(comparisons)
        row['Max Similarity'] = np.nanmax(comparisons)
        results.append(row)

    return pd.DataFrame(results)

In [None]:
# Score each generated query against the ConVIRT queries that share its label.
# The function defined in this notebook is calculate_similarities_convirt; the
# previous call used the undefined name calculate_similarities, which raises
# NameError on a fresh kernel.
similarity_curr_queries_df = calculate_similarities_convirt(curr_queries_df, convirt_queries_df, inference_engine)
print(similarity_curr_queries_df)

In [None]:
# Write the ConVIRT comparison table to CSV; index=False leaves the row index out.
# NOTE(review): the path is absolute and user-specific.
similarity_curr_queries_df.to_csv("/home/imadejski/ctds-search-model/data/convirt_query_comparison.csv", index=False)

In [None]:
def calculate_similarities_real(df1, df2, inference_engine):
    """
    Summarises how similar each text in df1 is to the df2 texts that share its
    'Variable'.

    For each row of df1, the rows of df2 with the same 'Variable' are selected
    and inference_engine.get_pairwise_similarities(text1, text2) is evaluated
    once per matching df2 row. Rows of df1 whose 'Variable' has no match in
    df2 are left out of the result.

    Parameters:
    - df1: DataFrame with 'Variable' and 'Text' columns (the texts being scored)
    - df2: DataFrame with 'Variable' and 'Text' columns (the reference texts)
    - inference_engine: object exposing get_pairwise_similarities(text1, text2);
      the returned value must support .item() to yield a scalar

    Returns:
    - result_df: DataFrame with one row per matched df1 row and the columns
      'Variable', 'Text', 'Average Similarity' and 'Max Similarity'. The frame
      has no columns at all when nothing matched.
    """
    results = []

    for variable, text1 in zip(df1['Variable'], df1['Text']):
        matching_texts = df2.loc[df2['Variable'] == variable, 'Text']
        if matching_texts.empty:
            continue

        # A non-empty match set always yields at least one similarity, so no
        # separate "no similarities" fallback is needed past this point.
        similarities = [
            inference_engine.get_pairwise_similarities(text1, text2).item()
            for text2 in matching_texts
        ]

        results.append({
            'Variable': variable,
            'Text': text1,
            # NaN-aware reductions skip any NaN similarity the engine returns.
            'Average Similarity': np.nanmean(similarities),
            'Max Similarity': np.nanmax(similarities),
        })

    return pd.DataFrame(results)

In [None]:
# Load the real-sentence table from CSV.
# NOTE(review): this frame is passed below as the reference frame of
# calculate_similarities_real, which reads its 'Variable' and 'Text' columns --
# confirm the CSV provides both. The path is absolute and user-specific.
real_queries_df = pd.read_csv("/home/imadejski/ctds-search-model/data/real_sentences.csv")

In [None]:
# Score each generated query against the real sentences that share its label,
# keeping the average and maximum similarity per query, then print the table.
similarity_curr_queries_real_df = calculate_similarities_real(curr_queries_df, real_queries_df, inference_engine)
print(similarity_curr_queries_real_df)

In [None]:
# Write the real-sentence comparison table to CSV; index=False leaves the row index out.
# NOTE(review): the path is absolute and user-specific.
similarity_curr_queries_real_df.to_csv("/home/imadejski/ctds-search-model/data/real_query_comparison.csv", index=False)