# Radgraph Processing

Turning extracts into discernable form for BERTopic and Processing

In [6]:
import torch 
import pandas as pd 

csv_radgraph_path = "./cohort_ner_database.csv"
tensor_radgraph_path = "./cohort_ner_database.pt"

radgraph_tensor = torch.load(tensor_radgraph_path) # already a df 
sample = radgraph_tensor.iloc[1]['extracts']
sample

  radgraph_tensor = torch.load(tensor_radgraph_path) # already a df


{'0': {'text': 'heart size is mildly enlarged . there is mild unfolding of the thoracic aorta . cardiomediastinal silhouette and hilar contours are otherwise unremarkable . there is mild bibasilar atelectasis . lungs are otherwise clear . pleural surfaces are clear without effusion or pneumothorax . focus of air seen under the right hemidiaphragm , likely represents colonic interposition . no acute cardiopulmonary abnormality .',
  'entities': {'1': {'tokens': 'heart',
    'label': 'Anatomy::definitely present',
    'start_ix': 0,
    'end_ix': 0,
    'relations': []},
   '2': {'tokens': 'size',
    'label': 'Anatomy::definitely present',
    'start_ix': 1,
    'end_ix': 1,
    'relations': [['modify', '1']]},
   '3': {'tokens': 'mildly',
    'label': 'Observation::definitely present',
    'start_ix': 3,
    'end_ix': 3,
    'relations': [['modify', '4']]},
   '4': {'tokens': 'enlarged',
    'label': 'Observation::definitely present',
    'start_ix': 4,
    'end_ix': 4,
    'relations'

In [2]:
radgraph_tensor

Unnamed: 0,study_id,subject_id,report_path,full_text,examination,indication,technique,comparison,findings,impression,has_comparison,report_length,radgraph_text,extracts
0,51967283,10000980,../../../severity_data/report_files/p10/p10000...,FINAL REPORT INDICATION: [REMOVED]-year-old fe...,,[REMOVED]-year-old female with shortness of br...,,chest radiograph from [REMOVED] and [REMOVED]....,,"right upper lobe pneumonia or mass. however, g...",True,797,,"{'0': {'text': 'nan', 'entities': {}, 'data_so..."
1,54369281,10001884,../../../severity_data/report_files/p10/p10001...,FINAL REPORT EXAMINATION: Chest radiograph IND...,chest radiograph,dyspnea.,ap and lateral views of the chest.,[REMOVED],heart size is mildly enlarged. there is mild u...,no acute cardiopulmonary abnormality.,True,560,heart size is mildly enlarged. there is mild u...,{'0': {'text': 'heart size is mildly enlarged ...
2,58838312,10002428,../../../severity_data/report_files/p10/p10002...,FINAL REPORT PORTABLE CHEST: [REMOVED]. HISTOR...,,,,,single portable view of the chest is compared ...,no significant interval change with bilateral ...,False,761,single portable view of the chest is compared ...,{'0': {'text': 'single portable view of the ch...
3,52139270,10003502,../../../severity_data/report_files/p10/p10003...,FINAL REPORT HISTORY: Nausea and chllls. TECHN...,,,frontal and lateral views of the chest.,[REMOVED].,there are moderate bilateral pleural effusions...,"bilateral pleural effusions, cardiomegaly and ...",True,648,there are moderate bilateral pleural effusions...,{'0': {'text': 'there are moderate bilateral p...
4,57662923,10004322,../../../severity_data/report_files/p10/p10004...,WET READ: [REMOVED] [REMOVED] 2:03 PM Streaky ...,chest (pa and lat),history: [REMOVED]m with 2 weeks productive co...,pa and lateral views of the chest provided.,chest radiograph dated [REMOVED].,subtle streaky opacity in the left lower lobe ...,streaky left lobe opacity may reflect atelecta...,True,1345,subtle streaky opacity in the left lower lobe ...,{'0': {'text': 'subtle streaky opacity in the ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5545,51053775,19941135,../../../severity_data/report_files/p19/p19941...,FINAL REPORT EXAMINATION: CHEST (PA AND LAT) I...,chest (pa and lat),history: [REMOVED]f with cough,chest pa and lateral,none.,cardiac silhouette size is normal. mediastinal...,focal ill-defined opacity in the left lung bas...,True,694,cardiac silhouette size is normal. mediastinal...,{'0': {'text': 'cardiac silhouette size is nor...
5546,52939041,19951239,../../../severity_data/report_files/p19/p19951...,WET READ: [REMOVED] [REMOVED] [REMOVED] 6:49 P...,,history: [REMOVED]f with fever and cough // r/...,chest pa and lateral,none available,pa and lateral chest radiograph demonstrates a...,right upper lobe pneumonia.,True,585,pa and lateral chest radiograph demonstrates a...,{'0': {'text': 'pa and lateral chest radiograp...
5547,52809931,19963862,../../../severity_data/report_files/p19/p19963...,"FINAL REPORT HISTORY: Pneumonia for 9 days, ev...",,,frontal and lateral views of the chest.,none.,there is a consolidation in the right lower lo...,right lower lobe pneumonia. follow-up to resol...,True,673,there is a consolidation in the right lower lo...,{'0': {'text': 'there is a consolidation in th...
5548,53101264,19992525,../../../severity_data/report_files/p19/p19992...,FINAL REPORT CHEST RADIOGRAPH PERFORMED ON [RE...,,,,chest radiograph dated [REMOVED]. clinical his...,pa and lateral views of the chest were provide...,band-like opacity in the right lower lung coul...,True,792,pa and lateral views of the chest were provide...,{'0': {'text': 'pa and lateral views of the ch...


## Radgraph Processor - Entities and Relations

In [3]:
import json
import pandas as pd

def process_radgraph_extracts(extracts_json):
    """
    Converts RadGraph 'extracts' into structured clinical text for BERTopic.

    Args:
    - extracts_json (str or dict): RadGraph JSON stored in the 'extracts' column.

    Returns:
    - str: Reformatted text with preserved relations.
    """
    if isinstance(extracts_json, str):
        extracts_json = json.loads(extracts_json)  # Convert string to dict if needed

    # Extract only the first key (assumed report ID or study ID)
    first_key = list(extracts_json.keys())[0]
    radgraph_data = extracts_json[first_key]

    entities = radgraph_data.get("entities", {})
    sentences = []
    
    entity_map = {eid: entity["tokens"] for eid, entity in entities.items()}  # ID → Text map

    # Convert relations into readable text
    for eid, entity in entities.items():
        entity_text = entity["tokens"]
        entity_label = entity["label"].split("::")[0]  # Keep main category (Anatomy, Observation)
        relations = entity.get("relations", [])

        # Base description of the entity
        entity_sentence = f"{entity_text} is an {entity_label.lower()}."
        
        # Add relational information
        for rel in relations:
            relation_type, target_eid = rel
            target_text = entity_map.get(target_eid, "")
            
            if relation_type == "modify":
                entity_sentence = f"{entity_text} modifies {target_text}."
            elif relation_type == "located_at":
                entity_sentence = f"{entity_text} is located at {target_text}."
            elif relation_type == "associated_with":
                entity_sentence = f"{entity_text} is associated with {target_text}."
        
        sentences.append(entity_sentence)

    return " ".join(sentences)

# Apply processing to the entire 'extracts' column
# Apply to a sample (radgraph extract for the first study)
sample_extract = process_radgraph_extracts(radgraph_tensor['extracts'][0])
print(sample_extract)

radgraph_tensor["processed_radgraph"] = radgraph_tensor["extracts"].apply(process_radgraph_extracts)
radgraph_tensor




Unnamed: 0,study_id,subject_id,report_path,full_text,examination,indication,technique,comparison,findings,impression,has_comparison,report_length,radgraph_text,extracts,processed_radgraph
0,51967283,10000980,../../../severity_data/report_files/p10/p10000...,FINAL REPORT INDICATION: [REMOVED]-year-old fe...,,[REMOVED]-year-old female with shortness of br...,,chest radiograph from [REMOVED] and [REMOVED]....,,"right upper lobe pneumonia or mass. however, g...",True,797,,"{'0': {'text': 'nan', 'entities': {}, 'data_so...",
1,54369281,10001884,../../../severity_data/report_files/p10/p10001...,FINAL REPORT EXAMINATION: Chest radiograph IND...,chest radiograph,dyspnea.,ap and lateral views of the chest.,[REMOVED],heart size is mildly enlarged. there is mild u...,no acute cardiopulmonary abnormality.,True,560,heart size is mildly enlarged. there is mild u...,{'0': {'text': 'heart size is mildly enlarged ...,heart is an anatomy. size modifies heart. mild...
2,58838312,10002428,../../../severity_data/report_files/p10/p10002...,FINAL REPORT PORTABLE CHEST: [REMOVED]. HISTOR...,,,,,single portable view of the chest is compared ...,no significant interval change with bilateral ...,False,761,single portable view of the chest is compared ...,{'0': {'text': 'single portable view of the ch...,enteric modifies tube. tube is an observation....
3,52139270,10003502,../../../severity_data/report_files/p10/p10003...,FINAL REPORT HISTORY: Nausea and chllls. TECHN...,,,frontal and lateral views of the chest.,[REMOVED].,there are moderate bilateral pleural effusions...,"bilateral pleural effusions, cardiomegaly and ...",True,648,there are moderate bilateral pleural effusions...,{'0': {'text': 'there are moderate bilateral p...,moderate modifies effusions. bilateral modifie...
4,57662923,10004322,../../../severity_data/report_files/p10/p10004...,WET READ: [REMOVED] [REMOVED] 2:03 PM Streaky ...,chest (pa and lat),history: [REMOVED]m with 2 weeks productive co...,pa and lateral views of the chest provided.,chest radiograph dated [REMOVED].,subtle streaky opacity in the left lower lobe ...,streaky left lobe opacity may reflect atelecta...,True,1345,subtle streaky opacity in the left lower lobe ...,{'0': {'text': 'subtle streaky opacity in the ...,subtle modifies opacity. streaky modifies opac...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5545,51053775,19941135,../../../severity_data/report_files/p19/p19941...,FINAL REPORT EXAMINATION: CHEST (PA AND LAT) I...,chest (pa and lat),history: [REMOVED]f with cough,chest pa and lateral,none.,cardiac silhouette size is normal. mediastinal...,focal ill-defined opacity in the left lung bas...,True,694,cardiac silhouette size is normal. mediastinal...,{'0': {'text': 'cardiac silhouette size is nor...,cardiac is an anatomy. silhouette modifies car...
5546,52939041,19951239,../../../severity_data/report_files/p19/p19951...,WET READ: [REMOVED] [REMOVED] [REMOVED] 6:49 P...,,history: [REMOVED]f with fever and cough // r/...,chest pa and lateral,none available,pa and lateral chest radiograph demonstrates a...,right upper lobe pneumonia.,True,585,pa and lateral chest radiograph demonstrates a...,{'0': {'text': 'pa and lateral chest radiograp...,airspace modifies lobe. opacity is located at ...
5547,52809931,19963862,../../../severity_data/report_files/p19/p19963...,"FINAL REPORT HISTORY: Pneumonia for 9 days, ev...",,,frontal and lateral views of the chest.,none.,there is a consolidation in the right lower lo...,right lower lobe pneumonia. follow-up to resol...,True,673,there is a consolidation in the right lower lo...,{'0': {'text': 'there is a consolidation in th...,consolidation is located at lobe. right modifi...
5548,53101264,19992525,../../../severity_data/report_files/p19/p19992...,FINAL REPORT CHEST RADIOGRAPH PERFORMED ON [RE...,,,,chest radiograph dated [REMOVED]. clinical his...,pa and lateral views of the chest were provide...,band-like opacity in the right lower lung coul...,True,792,pa and lateral views of the chest were provide...,{'0': {'text': 'pa and lateral views of the ch...,severe modifies dextroscoliosis. dextroscolios...


In [5]:
radgraph_tensor.to_csv('./processed_NER_extracts.csv')
torch.save(radgraph_tensor, './processed_NER_extracts.pt')

In [7]:
radgraph_tensor = pd.read_csv('./processed_NER_extracts.csv')
radgraph_tensor.iloc[1]['processed_radgraph']

'heart is an anatomy. size modifies heart. mildly modifies enlarged. enlarged is located at heart. mild modifies unfolding. unfolding is located at aorta. thoracic modifies aorta. aorta is an anatomy. cardiomediastinal is an anatomy. silhouette modifies cardiomediastinal. hilar is an anatomy. contours modifies hilar. unremarkable is located at hilar. mild modifies atelectasis. bibasilar is an anatomy. atelectasis is located at bibasilar. lungs is an anatomy. clear is located at lungs. pleural is an anatomy. surfaces modifies pleural. clear is located at pleural. effusion is located at pleural. pneumothorax is an observation. air is located at colonic. under modifies hemidiaphragm. right modifies hemidiaphragm. hemidiaphragm is an anatomy. colonic is an anatomy. interposition is located at colonic. acute modifies abnormality. cardiopulmonary is an anatomy. abnormality is located at cardiopulmonary.'