# Code for Generating BERTopic Embeddings from NER

### Load Radgraph Entities 

In [41]:
from pathlib import Path

import pandas as pd
import torch

# Presumably the CSV counterpart of the same database; it is not read in this cell.
csv_radgraph_path = "../../NER/ner_database.csv"
tensor_radgraph_path = "../../NER/ner_database.pt"

# NOTE(security): torch.load unpickles arbitrary Python objects, so only point it at a
# trusted file. The file holds a pickled pandas DataFrame rather than tensors, which the
# restricted weights_only=True loader (the default from PyTorch 2.6 onwards) refuses to
# load, so full unpickling is requested explicitly instead of relying on the default.
radgraph_tensor = torch.load(tensor_radgraph_path, weights_only=False)  # already a df

# Peek at the RadGraph output stored for the first report.
sample = radgraph_tensor.iloc[0]['extracts']
sample

  radgraph_tensor = torch.load(tensor_radgraph_path) # already a df


{'0': {'text': 'cardiac silhouette size is normal . mediastinal and hilar contours are unremarkable . the pulmonary vasculature is normal . ill - defined parenchymal opacities are noted bilaterally , most pronounced within both lung bases , concerning for multifocal pneumonia . no pleural effusion or pneumothorax is present . there are no acute osseous abnormalities . findings concerning for multifocal pneumonia . followup radiographs after treatment are recommended to ensure resolution of this finding .',
  'entities': {'1': {'tokens': 'cardiac',
    'label': 'Anatomy::definitely present',
    'start_ix': 0,
    'end_ix': 0,
    'relations': []},
   '2': {'tokens': 'silhouette',
    'label': 'Anatomy::definitely present',
    'start_ix': 1,
    'end_ix': 1,
    'relations': [['modify', '1']]},
   '3': {'tokens': 'size',
    'label': 'Anatomy::definitely present',
    'start_ix': 2,
    'end_ix': 2,
    'relations': [['modify', '1']]},
   '4': {'tokens': 'normal',
    'label': 'Observa

In [45]:
# Display the loaded DataFrame; the rendered view elides middle rows and long cell values.
radgraph_tensor

Unnamed: 0,study_id,subject_id,report_path,full_text,examination,indication,technique,comparison,findings,impression,has_comparison,report_length,radgraph_text,extracts
0,57106576,18110461,../data/files/p18/p18110461/s57106576.txt,FINAL REPORT EXAMINATION: CHEST (PA AND LAT) I...,chest (pa and lat),history: [REMOVED]f with cough,chest pa and lateral,[REMOVED],cardiac silhouette size is normal. mediastinal...,findings concerning for multifocal pneumonia. ...,True,645,cardiac silhouette size is normal. mediastinal...,{'0': {'text': 'cardiac silhouette size is nor...
1,52444794,15447063,../data/files/p15/p15447063/s52444794.txt,FINAL REPORT INDICATION: History of left-sided...,,history of left-sided weakness for 12 hours. p...,ap and lateral radiographs of the chest.,,mild cardiomegaly has been stable compared to ...,interval increase in consolidation at the left...,False,806,mild cardiomegaly has been stable compared to ...,{'0': {'text': 'mild cardiomegaly has been sta...
2,58791719,13243522,../data/files/p13/p13243522/s58791719.txt,FINAL REPORT EXAMINATION: Chest radiograph IND...,chest radiograph,[REMOVED]m with cystic fibrosis and fever/coug...,frontal and lateral view.,comparison is made to multiple chest radiograp...,right chest wall port-a-cath ends at the cavoa...,chronic changes of cystic fibrosis as describe...,True,1136,right chest wall port-a-cath ends at the cavoa...,{'0': {'text': 'right chest wall port - a - ca...
3,51779043,11423061,../data/files/p11/p11423061/s51779043.txt,FINAL REPORT HISTORY: Chest pain. TECHNIQUE: U...,,,upright ap view of the chest.,[REMOVED].,low lung volumes limit assessment of the lung ...,low lung volumes limit assessment of the lung ...,True,685,low lung volumes limit assessment of the lung ...,{'0': {'text': 'low lung volumes limit assessm...
4,58785779,15379716,../data/files/p15/p15379716/s58785779.txt,"FINAL REPORT INDICATION: Pneumonia, cough, and...",,"pneumonia, cough, and shortness of breath.",,chest radiograph from [REMOVED] and chest ct f...,,stable lingular and increased right middle lob...,True,600,,"{'0': {'text': 'nan', 'entities': {}, 'data_so..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2754,58255867,14236258,../data/files/p14/p14236258/s58255867.txt,FINAL REPORT INDICATION: [REMOVED]M with cough...,,[REMOVED]m with cough // r/o pna,ap and lateral views of the chest.,[REMOVED]. [REMOVED] and ct torso from [REMOVED].,vague opacity projecting over the right mid/lo...,"vague right mid/lower opacity, nonspecific the...",True,695,vague opacity projecting over the right mid/lo...,{'0': {'text': 'vague opacity projecting over ...
2755,59289169,10623647,../data/files/p10/p10623647/s59289169.txt,WET READ: [REMOVED] [REMOVED] [REMOVED] 4:26 P...,chest: frontal and lateral views,history: [REMOVED]m with hypoxia // pna?,chest: frontal and lateral,[REMOVED],bilateral patchy pulmonary opacities appear sl...,patchy bilateral mid to lower lung opacities a...,True,1760,bilateral patchy pulmonary opacities appear sl...,{'0': {'text': 'bilateral patchy pulmonary opa...
2756,55182265,15116068,../data/files/p15/p15116068/s55182265.txt,WET READ: [REMOVED] [REMOVED] [REMOVED] 11:56 ...,chest radiograph.,"[REMOVED]f with cough, fever, sob // ? pna",single portable semi upright radiograph the ch...,"chest ct: [REMOVED], [REMOVED]",extensive bronchiectasis is again noted in the...,"extensive bilateral bronchiectasis, with super...",True,1100,extensive bronchiectasis is again noted in the...,{'0': {'text': 'extensive bronchiectasis is ag...
2757,50696726,17025867,../data/files/p17/p17025867/s50696726.txt,FINAL REPORT INDICATION: Cough and weakness. C...,,cough and weakness.,,radiograph available from [REMOVED]. frontal a...,,1. increase in density of a right lower and mi...,True,649,,"{'0': {'text': 'nan', 'entities': {}, 'data_so..."


### BERTopic Using RADGRAPH ENTITIES ONLY 

In [46]:
import json

def extract_radgraph_entities(radgraph_dict):
    """
    Collects the text of every entity in a RadGraph report.

    Args:
    - radgraph_dict (dict or str): RadGraph output for one report, either already
      parsed or as a JSON string. Two layouts are accepted: the report itself
      ({"text": ..., "entities": ...}) or the 'extracts' wrapper that nests the
      report under a single key ({"0": {"text": ..., "entities": ...}}).
      "entities" may be a mapping of entity ID -> entity or a plain list of entities.

    Returns:
    - list[str]: Entity strings in their stored order, read from each entity's
      "tokens" field (falling back to "text" when "tokens" is absent). Empty when
      the input is empty or has no entities.
    """
    # The 'extracts' column already holds dicts; only decode when given a JSON string.
    if isinstance(radgraph_dict, str):
        radgraph_dict = json.loads(radgraph_dict)
    if not radgraph_dict:
        return []

    # Unwrap the {"0": {...}} layout by taking the first (report) entry.
    if "entities" not in radgraph_dict:
        radgraph_dict = next(iter(radgraph_dict.values()))

    entities = radgraph_dict.get("entities") or {}
    # RadGraph keys entities by ID; iterate the entity records, not the ID strings.
    records = entities.values() if isinstance(entities, dict) else entities
    return [
        record["tokens"] if "tokens" in record else record["text"]
        for record in records
    ]

### BERTopic Using RADGRAPH ENTITIES AND RELATIONS

In [44]:
import json

def process_radgraph_json(radgraph_json):
    """
    Converts RadGraph entities and relations into structured clinical text for BERTopic.

    Every entity contributes one sentence per recognised relation ("modify",
    "located_at", "associated_with") whose target entity exists. An entity with
    no such relation is described by its main label category instead, e.g.
    "cardiac is an anatomy.".

    Args:
    - radgraph_json (dict or str): RadGraph JSON output (single report). A string is
      parsed with json.loads. The dict is read through its "entities" mapping of
      entity ID -> {"tokens", "label", optional "relations"}, where each relation
      is a [relation_type, target_entity_id] pair.

    Returns:
    - str: Sentences joined by single spaces, in entity order ("" when there are
      no entities).
    """
    if isinstance(radgraph_json, str):
        radgraph_json = json.loads(radgraph_json)

    # Relation type -> verb phrase placed between the source and target text.
    relation_phrases = {
        "modify": "modifies",
        "located_at": "is located at",
        "associated_with": "is associated with",
    }

    entities = radgraph_json.get("entities", {})
    entity_map = {eid: entity["tokens"] for eid, entity in entities.items()}  # ID -> text

    sentences = []
    for entity in entities.values():
        entity_text = entity["tokens"]
        entity_label = entity["label"].split("::")[0]  # Keep only main category (Anatomy, Observation)

        # Collect a sentence for each relation rather than overwriting a single
        # sentence, so an entity with several relations keeps all of them.
        relation_sentences = []
        for relation_type, target_eid in entity.get("relations") or []:
            phrase = relation_phrases.get(relation_type)
            target_text = entity_map.get(target_eid)
            # Skip unknown relation types and targets missing from this report;
            # a sentence such as "x modifies ." would only add noise.
            if phrase is None or target_text is None:
                continue
            relation_sentences.append(f"{entity_text} {phrase} {target_text}.")

        if relation_sentences:
            sentences.extend(relation_sentences)
        else:
            # Base description, used only when no relation sentence was produced.
            sentences.append(f"{entity_text} is an {entity_label.lower()}.")

    return " ".join(sentences)

# Hand-written sample in the RadGraph layout, apparently abridged from the first row of the
# radgraph extracts, used to try out process_radgraph_json.
sample_radgraph_json = dict(
    text="cardiac silhouette size is normal. mediastinal and hilar contours are unremarkable. pulmonary vasculature is normal. ill-defined parenchymal opacities are noted bilaterally, most pronounced within both lung bases, concerning for multifocal pneumonia.",
    entities={
        # One row per entity: (ID, tokens, label, relations as [type, target ID] pairs).
        entity_id: {'tokens': tokens, 'label': label, 'relations': relations}
        for entity_id, tokens, label, relations in [
            ('1', 'cardiac', 'Anatomy::definitely present', []),
            ('2', 'silhouette', 'Anatomy::definitely present', [['modify', '1']]),
            ('3', 'size', 'Anatomy::definitely present', [['modify', '1']]),
            ('4', 'normal', 'Observation::definitely present', [['located_at', '1']]),
            ('5', 'pneumonia', 'Observation::definitely present', [['associated_with', '6']]),
            ('6', 'parenchymal opacities', 'Observation::definitely present', []),
        ]
    },
)

# Convert the sample and show the generated sentences.
formatted_text = process_radgraph_json(sample_radgraph_json)
print(formatted_text)

cardiac is an anatomy. silhouette modifies cardiac. size modifies cardiac. normal is located at cardiac. pneumonia is associated with parenchymal opacities. parenchymal opacities is an observation.


In [50]:
import json
import pandas as pd

def process_radgraph_extracts(extracts_json):
    """
    Converts RadGraph 'extracts' into structured clinical text for BERTopic.

    Only the first entry of the extracts mapping is processed. Each of its
    entities contributes one sentence per recognised relation ("modify",
    "located_at", "associated_with") whose target entity exists; an entity with
    no such relation is described by its main label category instead, e.g.
    "cardiac is an anatomy.".

    Args:
    - extracts_json (str or dict): RadGraph JSON stored in the 'extracts' column,
      shaped like {"0": {"text": ..., "entities": {entity ID: {...}}}}. A string
      is parsed with json.loads.

    Returns:
    - str: Sentences joined by single spaces, in entity order. "" when the
      extract is empty or its first entry has no entities.
    """
    if isinstance(extracts_json, str):
        extracts_json = json.loads(extracts_json)  # Convert string to dict if needed

    # An empty extract has no first entry to read; treat it as "no text".
    if not extracts_json:
        return ""

    # Extract only the first entry (its key is assumed to be a report ID or study ID)
    radgraph_data = next(iter(extracts_json.values()))

    # Relation type -> verb phrase placed between the source and target text.
    relation_phrases = {
        "modify": "modifies",
        "located_at": "is located at",
        "associated_with": "is associated with",
    }

    entities = radgraph_data.get("entities", {})
    entity_map = {eid: entity["tokens"] for eid, entity in entities.items()}  # ID -> text

    sentences = []
    for entity in entities.values():
        entity_text = entity["tokens"]
        entity_label = entity["label"].split("::")[0]  # Keep main category (Anatomy, Observation)

        # Collect a sentence for each relation rather than overwriting a single
        # sentence, so an entity with several relations keeps all of them.
        relation_sentences = []
        for relation_type, target_eid in entity.get("relations") or []:
            phrase = relation_phrases.get(relation_type)
            target_text = entity_map.get(target_eid)
            # Skip unknown relation types and targets missing from this report;
            # a sentence such as "x modifies ." would only add noise.
            if phrase is None or target_text is None:
                continue
            relation_sentences.append(f"{entity_text} {phrase} {target_text}.")

        if relation_sentences:
            sentences.extend(relation_sentences)
        else:
            # Base description, used only when no relation sentence was produced.
            sentences.append(f"{entity_text} is an {entity_label.lower()}.")

    return " ".join(sentences)

# Try the conversion on a single study first (the row labelled 0) and print the result.
first_study_extract = radgraph_tensor.loc[0, 'extracts']
sample_extract = process_radgraph_extracts(first_study_extract)
print(sample_extract)

# Convert every row of the 'extracts' column and keep the text in a new column,
# then display the updated frame.
processed_column = radgraph_tensor["extracts"].apply(process_radgraph_extracts)
radgraph_tensor["processed_radgraph"] = processed_column
radgraph_tensor


cardiac is an anatomy. silhouette modifies cardiac. size modifies cardiac. normal is located at cardiac. mediastinal is an anatomy. hilar is an anatomy. contours modifies hilar. unremarkable is located at hilar. pulmonary modifies vasculature. vasculature is an anatomy. normal is located at vasculature. ill - defined modifies opacities. parenchymal is an anatomy. opacities is located at lung. bilaterally modifies parenchymal. pronounced is an observation. both modifies lung. lung is an anatomy. bases modifies lung. multifocal modifies pneumonia. pneumonia is an observation. pleural is an anatomy. effusion is located at pleural. pneumothorax is an observation. acute modifies abnormalities. osseous is an anatomy. abnormalities is located at osseous. multifocal modifies pneumonia. pneumonia is an observation.


In [53]:
# Write the frame out twice: as CSV and as a torch-serialised copy.
output_dir = Path('./processed_NER_extracts')
# Neither DataFrame.to_csv nor torch.save creates a missing parent directory,
# so make sure it exists before writing (a no-op when it is already there).
output_dir.mkdir(parents=True, exist_ok=True)

radgraph_tensor.to_csv(output_dir / 'processed_NER_extracts.csv')
torch.save(radgraph_tensor, output_dir / 'processed_NER_extracts.pt')