# Making NER Bacterial Embeddings from Processed Radgraph Text

In [None]:
import pandas as pd
import numpy as np
 
 # Load the pneumonia-type table that carries the processed RadGraph text.
 # Note: this CSV holds text columns (e.g. 'processed_radgraph'), not embeddings;
 # the path is relative to the notebook's working directory.
pneumonia_type_radgraph_path = '../../NER/bacterial/processed_pneumonia_type_radgraph.csv'
pneumonia_type_radgraph_df = pd.read_csv(pneumonia_type_radgraph_path)


In [4]:
# Preview the loaded table; the rendered output elides the middle rows and columns with "..."
pneumonia_type_radgraph_df

Unnamed: 0.2,Unnamed: 0.1,subject_id_x,hadm_id,study_id,dicom_id,pneumonia_type,Severe,Unnamed: 0,subject_id_y,report_path,...,indication,technique,comparison,findings,impression,has_comparison,report_length,radgraph_text,extracts,processed_radgraph
0,0,11146739,20000057,57595001,d46de6bb-6ee08654-dd5f127c-1c339f58-f780e465,other,False,536,11146739,../../../severity_data/report_files/p11/p11146...,...,,,none.,there are coarse interstitial markings bilater...,,True,1130,,"{'0': {'text': 'nan', 'entities': {}, 'data_so...",
1,1,18110461,20001947,57106576,1f239460-e00a31a5-81bdb260-f2929be7-f7cb2f7d,bacterial,False,3906,18110461,../../../severity_data/report_files/p18/p18110...,...,history: [REMOVED]f with cough,chest pa and lateral,[REMOVED],cardiac silhouette size is normal. mediastinal...,findings concerning for multifocal pneumonia. ...,True,645,cardiac silhouette size is normal. mediastinal...,{'0': {'text': 'cardiac silhouette size is nor...,cardiac is an anatomy. silhouette modifies car...
2,2,18346104,20002712,56389963,33125f0b-b2a6d1a2-1408ce03-33729f30-68e6fd64,other,False,4007,18346104,../../../severity_data/report_files/p18/p18346...,...,[REMOVED]f with hypoxia. evaluate for focal co...,single portable ap view of the chest.,chest radiograph from [REMOVED].,"compared with the prior radiograph, no signifi...","compared with the prior radiograph, no change ...",True,693,"compared with the prior radiograph, no signifi...",{'0': {'text': 'compared with the prior radiog...,significant modifies change. change modifies o...
3,3,19669999,20005479,56328195,927ce6d1-2f7acc58-55042c9a-e688c114-129c4009,other,True,4612,19669999,../../../severity_data/report_files/p19/p19669...,...,"hypoxia, sepsis. comparisons: [REMOVED].",,,lung volumes are low. a large hiatal hernia ca...,low lung volumes and large hiatal hernia. new ...,False,712,lung volumes are low. a large hiatal hernia ca...,{'0': {'text': 'lung volumes are low . a large...,lung is an anatomy. volumes modifies lung. low...
4,4,12351713,20007567,59692077,aa33b0f7-a2e3dc48-68b210fb-fb0a6ea5-bd1bf388,other,False,1138,12351713,../../../severity_data/report_files/p12/p12351...,...,,pa and lateral chest radiographs,prior outside chest ct from [REMOVED] and ches...,lung volumes are decreased. there is susbtanti...,low lung volumes with substantial bibasilar at...,True,569,lung volumes are decreased. there is susbtanti...,{'0': {'text': 'lung volumes are decreased . t...,lung is an anatomy. volumes modifies lung. dec...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4437,4437,10623647,29991969,59289169,f47aa7aa-8461e734-cc03c9ac-f152661b-4700bd0c,bacterial,False,287,10623647,../../../severity_data/report_files/p10/p10623...,...,history: [REMOVED]m with hypoxia // pna?,chest: frontal and lateral,[REMOVED],bilateral patchy pulmonary opacities appear sl...,patchy bilateral mid to lower lung opacities a...,True,1760,bilateral patchy pulmonary opacities appear sl...,{'0': {'text': 'bilateral patchy pulmonary opa...,bilateral modifies pulmonary. patchy modifies ...
4438,4438,15116068,29993812,55182265,65d1408e-bc59a65c-0be0fa6d-fb54b613-45abd8c1,bacterial,False,2406,15116068,../../../severity_data/report_files/p15/p15116...,...,"[REMOVED]f with cough, fever, sob // ? pna",single portable semi upright radiograph the ch...,"chest ct: [REMOVED], [REMOVED]",extensive bronchiectasis is again noted in the...,"extensive bilateral bronchiectasis, with super...",True,1100,extensive bronchiectasis is again noted in the...,{'0': {'text': 'extensive bronchiectasis is ag...,extensive modifies bronchiectasis. bronchiecta...
4439,4439,17025867,29996361,50696726,99011231-5f716ee8-5e61eadd-447b48c4-4ec5255a,bacterial,False,3357,17025867,../../../severity_data/report_files/p17/p17025...,...,cough and weakness.,,radiograph available from [REMOVED]. frontal a...,,1. increase in density of a right lower and mi...,True,649,,"{'0': {'text': 'nan', 'entities': {}, 'data_so...",
4440,4440,10053207,29999444,55471183,d3362c86-7b4199a0-a47e828f-dddd9bcb-89c90d8e,other,True,17,10053207,../../../severity_data/report_files/p10/p10053...,...,,,[REMOVED].,frontal and lateral views of the chest. prior ...,right basilar opacity likely due to at least s...,True,1150,frontal and lateral views of the chest. prior ...,{'0': {'text': 'frontal and lateral views of t...,right - sided is an anatomy. central modifies ...


In [6]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.preprocessing import normalize
import pandas as pd

class RadGraphEmbedder:
    """
    Turn RadGraph-derived text into fixed-size sentence embeddings.

    A Hugging Face tokenizer/encoder pair is loaded once in the constructor;
    `create_embeddings` then tokenizes texts in batches, pools the encoder's
    last hidden state per text, and L2-normalizes each resulting row.
    """

    # Pooling strategies accepted by the constructor (compared case-insensitively).
    _POOLING_STRATEGIES = ('mean', 'cls')

    def __init__(self, pretrained_model_name, pooling_strategy='cls', batch_size=32, max_length=512):
        """
        Initialize the RadGraphEmbedder with a specified pooling strategy.

        Parameters:
            pretrained_model_name: Name of the pretrained model to load
            pooling_strategy: Pooling strategy ('mean' or 'cls'), case-insensitive
            batch_size: Number of texts tokenized and encoded per forward pass
            max_length: Token limit per text; longer inputs are truncated

        Raises:
            ValueError: If the pooling strategy is unknown, or if batch_size or
                max_length is not positive.
        """
        # Validate the cheap arguments first so that a typo fails immediately
        # instead of after the tokenizer and model have been loaded.
        strategy = pooling_strategy.lower()  # Ensure case-insensitivity
        if strategy not in self._POOLING_STRATEGIES:
            raise ValueError("Invalid pooling strategy! Use 'mean' or 'cls'.")
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer.")
        if max_length < 1:
            raise ValueError("max_length must be a positive integer.")

        self.pooling_strategy = strategy
        self.batch_size = batch_size
        self.max_length = max_length

        # NOTE(security): trust_remote_code=True allows Python code shipped in the
        # model repository to execute locally. Only use repositories you trust.
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(pretrained_model_name, trust_remote_code=True)
        self.model.eval()  # Set to evaluation mode

    def create_embeddings(self, texts):
        """
        Create embeddings for a list of RadGraph processed extracts.

        Parameters:
            texts: Iterable of strings (structured RadGraph extracts)

        Returns:
            2-D numpy array with one row per input text. Rows are normalized
            with sklearn's `normalize` (default settings). For empty input an
            array of shape (0, hidden_size) is returned, where hidden_size is
            read from the model config (0 if the config does not expose it).
        """
        # Materialize as a plain list so slicing below always yields a list,
        # which is what the tokenizer expects for a batch.
        texts = list(texts)

        if not texts:
            # np.vstack cannot stack an empty sequence, so short-circuit here.
            hidden_size = getattr(getattr(self.model, 'config', None), 'hidden_size', 0)
            return np.empty((0, hidden_size), dtype=np.float32)

        embeddings = []

        # Inference only: no gradients are needed.
        with torch.no_grad():
            for start in range(0, len(texts), self.batch_size):
                batch_texts = texts[start:start + self.batch_size]

                inputs = self.tokenizer(
                    batch_texts,
                    padding=True,
                    truncation=True,
                    max_length=self.max_length,
                    return_tensors="pt"
                )

                outputs = self.model(**inputs)

                # Apply the selected pooling strategy
                if self.pooling_strategy == 'cls':
                    # Use the hidden state at the first token position ([CLS])
                    batch_embeddings = outputs.last_hidden_state[:, 0, :]
                else:  # Mean pooling over non-padding tokens
                    attention_mask = inputs['attention_mask']
                    batch_embeddings = self._mean_pooling(outputs.last_hidden_state, attention_mask)

                embeddings.append(batch_embeddings.numpy())

        embeddings = np.vstack(embeddings)
        embeddings = normalize(embeddings)

        return embeddings

    def _mean_pooling(self, token_embeddings, attention_mask):
        """
        Perform mean pooling on token embeddings using the attention mask.

        Positions where the mask is 0 contribute nothing to the sum, and the
        sum is divided by the per-row mask total. The divisor is clamped to a
        tiny positive value so an all-zero mask does not divide by zero.
        """
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        masked_sum = torch.sum(token_embeddings * input_mask_expanded, 1)
        token_counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return masked_sum / token_counts


def process_radgraph_extracts(file_path, pretrained_model_name, pooling_strategy='mean',
                              embeddings_path="radgraph_embeddings.pt"):
    """
    Process RadGraph extracts and create embeddings with the specified pooling strategy.

    Reads the CSV, embeds the 'processed_radgraph' column, writes the embedding
    matrix to `embeddings_path`, and returns the table with an extra
    'embedding' column holding one list of floats per row.

    Missing values in 'processed_radgraph' are embedded as the empty string, so
    those rows still receive a vector and the row order/count is unchanged.

    Parameters:
        file_path: Path to the CSV file containing RadGraph extracts
        pretrained_model_name: Name of the pretrained model to use
        pooling_strategy: Pooling strategy ('mean' or 'cls')
        embeddings_path: Where the embedding matrix is saved with torch.save

    Returns:
        DataFrame with embeddings added

    Raises:
        ValueError: If the CSV has no 'processed_radgraph' column.
    """
    # Load dataset
    df = pd.read_csv(file_path)

    # Ensure column exists (checked before the model is loaded, so this fails fast)
    if 'processed_radgraph' not in df.columns:
        raise ValueError("Column 'processed_radgraph' not found in dataset!")

    # Initialize the embedder
    embedder = RadGraphEmbedder(pretrained_model_name, pooling_strategy=pooling_strategy)

    # Replace missing text with "" and coerce every cell to str: the tokenizer
    # only accepts strings, and a stray numeric cell would otherwise abort the run.
    texts = df['processed_radgraph'].fillna("").astype(str).tolist()
    embeddings = embedder.create_embeddings(texts)

    # Save embeddings
    # NOTE(review): `embeddings` is a NumPy array, so torch.save pickles it as-is.
    # Recent torch.load defaults (weights_only=True) may refuse such a file;
    # loading may need weights_only=False - confirm against the installed version.
    torch.save(embeddings, embeddings_path)
    print(f"Embeddings saved to: {embeddings_path}")

    # Add embeddings to DataFrame (one Python list of floats per row)
    df['embedding'] = embeddings.tolist()

    return df

# Encoder checkpoint and token-pooling mode ('mean' or 'cls') used for the embeddings
pretrained_model_name = "microsoft/BiomedVLP-CXR-BERT-general"
pooling_strategy = "mean"

# CSV that provides the 'processed_radgraph' text column
processed_file_path = '../../NER/bacterial/processed_pneumonia_type_radgraph.csv'

# Embed every row of the CSV; the call also writes the embedding matrix to disk
radgraph_embeddings_df = process_radgraph_extracts(
    file_path=processed_file_path,
    pretrained_model_name=pretrained_model_name,
    pooling_strategy=pooling_strategy,
)

# Persist the table together with its 'embedding' column (no index column is written)
radgraph_embeddings_df.to_csv("radgraph_with_embeddings.csv", index=False)

Embeddings saved to: radgraph_embeddings.pt
