# Multimodal Topic Analysis

In [8]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
from glob import glob

# Define path to TFRecord files
tfrecord_folder = "../../severity_data/image_files/"  # Adjust if needed

# Find all TFRecord files recursively
tfrecord_files = glob(os.path.join(tfrecord_folder, "**/*.tfrecord"), recursive=True)

# Define feature description for parsing TFRecords
feature_description = {
    "image/id": tf.io.FixedLenFeature([], tf.string),   # DICOM file path
    "embedding": tf.io.VarLenFeature(tf.float32),       # Image embedding
}

# Function to extract image embeddings from a TFRecord file
def extract_embeddings_from_tfrecord(tfrecord_path):
    dataset = tf.data.TFRecordDataset([tfrecord_path])
    dicom_ids = []
    embeddings = []

    for raw_record in dataset:
        parsed_example = tf.io.parse_single_example(raw_record, feature_description)

        # Extract DICOM path and ID
        dicom_path = parsed_example["image/id"].numpy().decode("utf-8")
        dicom_id = dicom_path.split("/")[-1].replace(".dcm", "")  # Extract filename without extension

        # Extract image embedding
        embedding = tf.sparse.to_dense(parsed_example["embedding"]).numpy()

        dicom_ids.append(dicom_id)
        embeddings.append(embedding.tolist())

    return dicom_ids, embeddings

# Process all TFRecord files
all_dicom_ids = []
all_embeddings = []

for tfrecord_file in tfrecord_files:
    dicom_ids, embeddings = extract_embeddings_from_tfrecord(tfrecord_file)
    all_dicom_ids.extend(dicom_ids)
    all_embeddings.extend(embeddings)

# Convert to DataFrame
image_embeddings_df = pd.DataFrame({"dicom_id": all_dicom_ids, "embedding": all_embeddings})

# Save extracted embeddings to CSV
image_embeddings_df.to_csv("image_embeddings.csv", index=False)

print("Extracted image embeddings saved to image_embeddings.csv!")

2025-02-13 01:35:48.911263: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-02-13 01:35:48.967862: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-02-13 01:35:49.055560: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-02-13 01:35:49.238009: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-02-13 01:35:49.555391: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-02-13 01:35:50.147550: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-02-13 01:35:51.324321: I tensorflow/core/framework/local_rendezvous.cc:405] L

: 

In [7]:
import pandas as pd
import numpy as np

# Load datasets
image_embeddings_df = pd.read_csv("../../images/image_embeddings.csv")
text_embeddings_df = pd.read_csv("../NER_embeddings/pneumonia_type/radgraph_with_embeddings.csv")

# Merge datasets on 'dicom_id'
merged_df = pd.merge(text_embeddings_df, image_embeddings_df[['dicom_id', 'embedding']], on='dicom_id', suffixes=('_text', '_image'))

# Convert embeddings from string to list of floats
merged_df['embedding_text'] = merged_df['embedding_text'].apply(lambda x: np.array(eval(x)))
merged_df['embedding_image'] = merged_df['embedding_image'].apply(lambda x: np.array(eval(x)))

# Concatenate text and image embeddings to create multimodal embeddings
merged_df['embedding_multimodal'] = merged_df.apply(lambda row: np.concatenate([row['embedding_text'], row['embedding_image']]), axis=1)

# Extract final multimodal embeddings for BERTopic
multimodal_embeddings = np.vstack(merged_df['embedding_multimodal'].values)

ValueError: Cannot set a DataFrame with multiple columns to the single column embedding_multimodal

In [3]:
merged_df

Unnamed: 0.2,Unnamed: 0.1,subject_id_x,hadm_id,study_id,dicom_id,pneumonia_type,Severe,Unnamed: 0,subject_id_y,report_path,...,findings,impression,has_comparison,report_length,radgraph_text,extracts,processed_radgraph,embedding_text,embedding_image,embedding_multimodal
0,1,18110461,20001947,57106576,1f239460-e00a31a5-81bdb260-f2929be7-f7cb2f7d,bacterial,False,3906,18110461,../../../severity_data/report_files/p18/p18110...,...,cardiac silhouette size is normal. mediastinal...,findings concerning for multifocal pneumonia. ...,True,645,cardiac silhouette size is normal. mediastinal...,{'0': {'text': 'cardiac silhouette size is nor...,cardiac is an anatomy. silhouette modifies car...,"[0.03332185745239258, -0.005314456298947334, -...","[-1.0478514432907104, -0.9893836975097656, 0.9...","[0.03332185745239258, -0.005314456298947334, -..."
1,6,15447063,20009511,52444794,82d06e5f-4f17e47a-3a96a851-d9454252-862ff5be,bacterial,False,2557,15447063,../../../severity_data/report_files/p15/p15447...,...,mild cardiomegaly has been stable compared to ...,interval increase in consolidation at the left...,False,806,mild cardiomegaly has been stable compared to ...,{'0': {'text': 'mild cardiomegaly has been sta...,mild modifies cardiomegaly. cardiomegaly is an...,"[0.0257903840392828, -0.0005293460562825203, 0...","[0.14750519394874573, -0.8777117133140564, 1.2...","[0.0257903840392828, -0.0005293460562825203, 0..."
2,8,13243522,20013465,58791719,fc215cad-d1060eb4-32e74e2e-15e2c40b-d01e66db,bacterial,False,1530,13243522,../../../severity_data/report_files/p13/p13243...,...,right chest wall port-a-cath ends at the cavoa...,chronic changes of cystic fibrosis as describe...,True,1136,right chest wall port-a-cath ends at the cavoa...,{'0': {'text': 'right chest wall port - a - ca...,right modifies wall. chest modifies wall. wall...,"[0.038594670593738556, -0.015855595469474792, ...","[-0.6212253570556641, -0.6505540609359741, 1.1...","[0.038594670593738556, -0.015855595469474792, ..."
3,9,11423061,20014114,51779043,9b1832b0-9cf4f257-fd4bbb42-06957578-0bf69814,bacterial,True,670,11423061,../../../severity_data/report_files/p11/p11423...,...,low lung volumes limit assessment of the lung ...,low lung volumes limit assessment of the lung ...,True,685,low lung volumes limit assessment of the lung ...,{'0': {'text': 'low lung volumes limit assessm...,low is located at volumes. lung is an anatomy....,"[0.030176309868693352, -0.024805540218949318, ...","[-0.6631452441215515, -0.5168229341506958, 1.0...","[0.030176309868693352, -0.024805540218949318, ..."
4,10,15379716,20015580,58785779,120b1593-d73e54d8-6d999909-ce1d93ce-d3b26f46,bacterial,False,2511,15379716,../../../severity_data/report_files/p15/p15379...,...,,stable lingular and increased right middle lob...,True,600,,"{'0': {'text': 'nan', 'entities': {}, 'data_so...",,"[-0.035868603736162186, -0.13423331081867218, ...","[-0.08448687195777893, -1.0461037158966064, 1....","[-0.035868603736162186, -0.13423331081867218, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2591,4435,14236258,29989743,58255867,0f33dea2-1c4e6245-7b21b568-ef0299e9-03c0863a,bacterial,False,1992,14236258,../../../severity_data/report_files/p14/p14236...,...,vague opacity projecting over the right mid/lo...,"vague right mid/lower opacity, nonspecific the...",True,695,vague opacity projecting over the right mid/lo...,{'0': {'text': 'vague opacity projecting over ...,vague modifies opacity. opacity is located at ...,"[0.04184424504637718, -0.01704668439924717, 0....","[-0.12456560134887695, -0.08151178061962128, 1...","[0.04184424504637718, -0.01704668439924717, 0...."
2592,4437,10623647,29991969,59289169,f47aa7aa-8461e734-cc03c9ac-f152661b-4700bd0c,bacterial,False,287,10623647,../../../severity_data/report_files/p10/p10623...,...,bilateral patchy pulmonary opacities appear sl...,patchy bilateral mid to lower lung opacities a...,True,1760,bilateral patchy pulmonary opacities appear sl...,{'0': {'text': 'bilateral patchy pulmonary opa...,bilateral modifies pulmonary. patchy modifies ...,"[0.03257104381918907, -0.026108980178833008, -...","[-0.27105075120925903, -0.9785194993019104, 1....","[0.03257104381918907, -0.026108980178833008, -..."
2593,4438,15116068,29993812,55182265,65d1408e-bc59a65c-0be0fa6d-fb54b613-45abd8c1,bacterial,False,2406,15116068,../../../severity_data/report_files/p15/p15116...,...,extensive bronchiectasis is again noted in the...,"extensive bilateral bronchiectasis, with super...",True,1100,extensive bronchiectasis is again noted in the...,{'0': {'text': 'extensive bronchiectasis is ag...,extensive modifies bronchiectasis. bronchiecta...,"[0.03614714369177818, -0.008272165432572365, -...","[-0.37828612327575684, -0.8936127424240112, 0....","[0.03614714369177818, -0.008272165432572365, -..."
2594,4439,17025867,29996361,50696726,99011231-5f716ee8-5e61eadd-447b48c4-4ec5255a,bacterial,False,3357,17025867,../../../severity_data/report_files/p17/p17025...,...,,1. increase in density of a right lower and mi...,True,649,,"{'0': {'text': 'nan', 'entities': {}, 'data_so...",,"[-0.035868603736162186, -0.13423331081867218, ...","[0.11463172733783722, -0.3663841485977173, 0.7...","[-0.035868603736162186, -0.13423331081867218, ..."


In [None]:
topic_model.visualize_barchart(top_n_topics=100)