# Installing Necessary Packages

In [87]:
!pip -q install zai-sdk
!pip -q install openai
!pip -q install google-generativeai

# Import Packages

In [88]:
import os
import sys
import re
import time
import json
import random
import logging
import warnings
import requests

import numpy as np
import pandas as pd

from google import genai
from google.genai.types import GenerateContentConfig

from abc import ABC, abstractmethod

from zai import ZaiClient
from openai import OpenAI
from tqdm import tqdm
from kaggle_secrets import UserSecretsClient

pd.set_option('display.max_colwidth', None)
warnings.filterwarnings('ignore')

# Logging

In [89]:
# Configure the root logger once: WARNING and above, printed to stdout.
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

# Re-running this cell must not stack a second handler (duplicate log lines),
# so a handler is attached only when the root logger has none yet.
if len(logger.handlers) == 0:
    formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
    handler = logging.StreamHandler(stream=sys.stdout)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

# Prompt Template (sent as the user message)

In [90]:
PROMPT_TEMPLATE = """
You are an expert clinical NLP annotator extracting structured entities from CHEST CT radiology reports.

OUTPUT: Valid JSON array only. No markdown, no explanation, no comments.

SCHEMA:
[
  {
    "general_finding": "string or None",
    "specific_finding": "string or None",
    "finding_presence": "present | absent | uncertain | None",
    "location": [],
    "degree": [],
    "measurement": "string or None",
    "comparison": "stable | improved | worsened | None"
  }
]

═══════════════════════════════════════════════════
FIELD DEFINITIONS
═══════════════════════════════════════════════════

general_finding: The CLINICAL CATEGORY of what is being described.
  USE the clinical entity name, NOT the anatomical structure, NOT a generic category.

  CORRECT examples:
    "cardiomegaly", "pericardial effusion", "pulmonary embolus",
    "atherosclerotic plaque", "coronary artery calcification",
    "lymphadenopathy", "pleural effusion", "pneumothorax",
    "catheter", "endotracheal tube", "nasogastric tube",
    "hematoma", "fracture", "intramural hematoma",
    "lung parenchyma", "thyroid gland", "heart", "aorta",
    "thoracic aorta", "aortic arch", "pulmonary artery",
    "pulmonary arteries", "trachea", "central airways",
    "airways", "liver", "gallbladder", "bone",
    "lymph nodes", "pulmonary nodules", "pulmonary nodule",
    "soft tissue", "upper abdomen", "spine",
    "cholecystitis", "aortic pathology", "consolidation",
    "vertebral artery", "prior disease", "thyroid nodules",
    "great vessels", "upper abdominal organs"

  WRONG — never use generic categories like:
    ✗ "thyroid abnormality" → use "thyroid gland" or "thyroid nodules"
    ✗ "cardiac abnormality" → use "coronary artery calcification" or "heart"
    ✗ "vascular abnormality" → use "aorta", "pulmonary artery", etc.
    ✗ "parenchymal abnormality" → use "lung parenchyma" or "consolidation"
    ✗ "pleural abnormality" → use "pleural effusion" or "pneumothorax"
    ✗ "skeletal abnormality" → use "bone" or "fracture"
    ✗ "airway abnormality" → use "airways" or "trachea"
    ✗ "lymph node abnormality" → use "lymphadenopathy" or "lymph nodes"
    ✗ "liver lesion" → use "liver"
    ✗ "abdominal abnormality" → use "upper abdomen"

  For devices: general_finding = the device name itself:
    "catheter", "endotracheal tube", "nasogastric tube"

  For absent pathology: general_finding = the pathology name:
    "pericardial effusion" (not "pericardium" or "pericardial space")
    "pulmonary embolus" (not "pulmonary arteries")
    "cholecystitis" (not "gallbladder")
    "lymphadenopathy" (not "lymph nodes")
    "pneumothorax" (not "lungs")

specific_finding: The specific observation or descriptor.
  Use concise clinical terminology, not raw sentence fragments.
  CORRECT: "aberrant origin", "subcutaneous hematoma", "filling defects"
  WRONG: "left vertebral artery arising directly from the aorta" (too verbose)

location: Array of anatomical sites. ALWAYS populate when the report states a location.
  Include laterality. Include both insertion site AND tip position for devices.

degree: Array of qualifiers (mild, moderate, severe, small, focal, diffuse, patchy, etc.).
  Include descriptors like "suspicious", "significant", "bibasilar", "subcentimeter", "scattered".
  For unremarkable/normal findings, degree = ["unremarkable"] or ["normal"].
  Use empty array [] when no qualifiers apply. Never put "None" as a string inside the array.

measurement: Exact numeric value with units from text, or "None".

comparison: Only set if report compares to prior imaging. Otherwise "None".

═══════════════════════════════════════════════════
ENTITY GRANULARITY RULES
═══════════════════════════════════════════════════

RULE 1: ONE SENTENCE DESCRIBING ONE THING = ONE ENTITY
  Each distinct clinical observation gets its own entity.

RULE 2: SPLIT SEPARATE ANATOMICAL STRUCTURES
  "The aorta and great vessels are normal" = TWO entities:
    1. general_finding: "aorta", specific_finding: "aorta course and caliber"
    2. general_finding: "great vessels", specific_finding: "great vessels course and caliber"

  "Trachea and central bronchi are patent" = TWO entities:
    1. general_finding: "trachea", specific_finding: "patency"
    2. general_finding: "bronchi", specific_finding: "patency" (or "central airways"/"patency")

  "The trachea and central airways are patent" = TWO entities:
    1. general_finding: "trachea", specific_finding: "patency"
    2. general_finding: "central airways", specific_finding: "patency"

RULE 3: SPLIT SEPARATE CLINICAL FINDINGS ON SAME STRUCTURE
  "Heart is normal in size with a trace pericardial effusion" = TWO entities:
    1. general_finding: "heart", specific_finding: "heart size", degree: ["normal"]
    2. general_finding: "pericardial effusion", specific_finding: "pericardial effusion", degree: ["trace"]

  "Normal contour and caliber with moderate atherosclerotic plaque" = TWO entities:
    1. Contour/caliber entity
    2. Atherosclerotic plaque entity

  "Thyroid gland is heterogeneous with multiple small nodules in the right lobe" = TWO entities:
    1. general_finding: "thyroid gland", specific_finding: "heterogeneous thyroid", degree: ["heterogeneous"]
    2. general_finding: "thyroid nodules", specific_finding: "nodules", location: ["right thyroid lobe"], degree: ["multiple", "small"]

RULE 4: SEPARATE ABSENT PATHOLOGY FROM RELATED ABSENT PATHOLOGY
  "No surface irregularity to suggest intimal flap, dissection, or atheromatous ulcer"
  = TWO entities:
    1. general_finding: "thoracic aorta", specific_finding: "surface irregularity", finding_presence: "absent"
    2. general_finding: "aortic pathology", specific_finding: "intimal flap, dissection, or atheromatous ulcer", finding_presence: "absent"

RULE 5: MERGE — DEVICES
  Merge device insertion site + tip position into ONE entity with multi-location array.
  "Catheter in right IJ with tip in right atrium" → location: ["right internal jugular", "proximal right atrium"]

RULE 6: MERGE — GROUPED NEGATIVES (same clinical domain)
  "No focal consolidation, effusion or edema" → ONE entity:
    general_finding: "lung parenchyma", specific_finding: "consolidation, effusion, or edema", finding_presence: "absent"
  Include applicable qualifiers: degree: ["focal"]

RULE 7: MERGE — VERSUS / AND/OR STATEMENTS
  "consolidation versus atelectasis" or "atelectasis and/or scarring" → ONE entity:
    finding_presence: "uncertain"
    Keep wording: specific_finding: "consolidation versus atelectasis"

RULE 8: MERGE — COREFERENCES
  "Ground glass opacities are seen. These involve all lobes." → ONE entity

═══════════════════════════════════════════════════
IMPRESSION HANDLING
═══════════════════════════════════════════════════

- If IMPRESSION restates a FINDINGS entity with same information → extract ONLY ONCE (use FINDINGS version)
- If IMPRESSION adds new clinical significance → merge into the existing entity's degree
- Do NOT extract IMPRESSION-only summary statements that are not specific findings
  Example: "NO ACUTE VASCULAR INJURY" → Do NOT extract this as a separate entity (it's a summary)

═══════════════════════════════════════════════════
FINDING PRESENCE
═══════════════════════════════════════════════════

present: "is seen", "demonstrates", "present", "identified", "noted", "appreciated"
absent: "no", "without", "absent", "no evidence of", "negative for"
uncertain: "versus", "and/or", "possible", "suspicious for", "cannot exclude", "may represent"

═══════════════════════════════════════════════════
SPECIAL CASES
═══════════════════════════════════════════════════

NORMAL/UNREMARKABLE: Extract with finding_presence: "present", degree: ["unremarkable"] or ["normal"]
  "thyroid gland is unremarkable" → general_finding: "thyroid gland", degree: ["unremarkable"]
  "upper abdomen is unremarkable" → general_finding: "upper abdomen", degree: ["unremarkable"]

DEVICES: general_finding = device name. Location = [insertion site, tip position]. Measurement = distance if stated.
  "ET tube tip 2.5 cm above carina" → measurement: "2.5 cm above the carina"

SEQUELAE: Extract as separate entity.
  "calcified nodules, likely sequela of prior granulomatous process" → TWO entities:
    1. The calcified nodules
    2. general_finding: "prior disease", specific_finding: "sequela of prior granulomatous process"

LYMPH NODES:
  "no lymphadenopathy" → general_finding: "lymphadenopathy", finding_presence: "absent"
  "subcentimeter nodes present" → general_finding: "lymph nodes", finding_presence: "present", degree: ["subcentimeter"]
  Only extract lymphadenopathy as present if: "enlarged", "prominent", "pathologic"

LIMITED ASSESSMENT:
  "Lack of noncontrast images limits ability to assess for intramural hematoma"
  → general_finding: "intramural hematoma", finding_presence: "uncertain"

═══════════════════════════════════════════════════
COMMON ERRORS TO AVOID
═══════════════════════════════════════════════════

✗ Using generic categories ("vascular abnormality") instead of specific clinical terms
✗ Merging separate anatomical structures into one entity (trachea + bronchi, aorta + great vessels)
✗ Merging separate clinical findings on the same structure (heart size + pericardial effusion)
✗ Missing the heart size entity when it's stated
✗ Missing the pericardial effusion entity when it's stated (even if absent)
✗ Extracting IMPRESSION-only summary lines as new entities
✗ Leaving location empty when the report specifies a location
✗ Putting "None" as a string inside degree array (use empty [] instead)
✗ Missing qualifiers like "significant", "scattered", "bibasilar" in degree
✗ Using raw sentence fragments as specific_finding instead of concise terms

═══════════════════════════════════════════════════

NOW EXTRACT FROM THIS REPORT:

<<<REPORT_TEXT>>>

Output ONLY the JSON array.



"""

# API Keys

In [91]:
# Client used to read the secrets attached to this Kaggle notebook.
user_secrets = UserSecretsClient()

# Provider id -> API key. get_ai_model() indexes this dict with the provider id,
# so a provider whose entry is commented out cannot be instantiated.
# "gemini" and "gemma" deliberately read the same secret ("gemini_api_key_0").
API_KEYS = {
    "gemini": user_secrets.get_secret("gemini_api_key_0"),
    "gemma": user_secrets.get_secret("gemini_api_key_0"),
    # Uncomment the entries below to enable the GLM / DeepSeek backends.
    #"glm": user_secrets.get_secret("glm_api_key"),
    #"deepseek": user_secrets.get_secret("deepseek_api_key"),
}

# LLM Classes

In [92]:
class AIBaseModel(ABC):
    """Common interface for the LLM backends used in this notebook.

    Subclasses wrap one provider client and implement :meth:`invoke`.

    Attributes:
        api_key: Credential passed to the provider client.
        model_name: Provider-specific model identifier.
        sleep_time: Seconds to pause between consecutive requests.
    """

    # The inference loops read ``model.sleep_time`` after every request; this
    # class-level default keeps a backend that never sets it from failing there.
    sleep_time: float = 0.0

    def __init__(self, api_key: str, model_name: str):
        """Store the credential and the model identifier."""
        self.api_key = api_key
        self.model_name = model_name

    @abstractmethod
    def invoke(self, prompt: str, **kwargs):
        """Send ``prompt`` to the model and return its text answer.

        Concrete backends in this file return ``None`` when the request fails
        or yields no text.
        """
        raise NotImplementedError

In [93]:
class GeminiModel(AIBaseModel):
    """Backend for Gemini and Gemma models served through ``google.genai``.

    ``sleep_time`` is derived from a per-model requests-per-minute budget so
    that the inference loop can pace its calls.
    """

    # Requests-per-minute budget for exact model names.
    _REQUESTS_PER_MINUTE = {
        "gemini-2.5-flash": 5,
        "gemini-3-flash-preview": 5,
        "gemini-2.5-flash-lite": 10,
        "gemini-1.5-flash": 15,
    }
    # Budget for any model whose name contains "gemma".
    _GEMMA_REQUESTS_PER_MINUTE = 30
    # Budget used when nothing above matches.
    _DEFAULT_REQUESTS_PER_MINUTE = 15

    def __init__(self, api_key: str, model_name: str = "gemini-2.5-flash"):
        """Create the API client and compute the pause between requests."""
        # Goes through the base initialiser so ``api_key`` is stored as well,
        # matching the GLM and DeepSeek backends.
        super().__init__(api_key, model_name)
        self.client = genai.Client(api_key=api_key)
        self.sleep_time = self._get_time_to_sleep()

    def _get_time_to_sleep(self):
        """Return the number of seconds to wait between two requests."""
        if self.model_name in self._REQUESTS_PER_MINUTE:
            requests_per_minute = self._REQUESTS_PER_MINUTE[self.model_name]
        elif "gemma" in self.model_name:
            requests_per_minute = self._GEMMA_REQUESTS_PER_MINUTE
        else:
            requests_per_minute = self._DEFAULT_REQUESTS_PER_MINUTE

        return 60 / requests_per_minute

    def invoke(
        self,
        prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0,
        top_p: float = 1,
        max_tokens: int = 8192,
    ):
        """Generate text for ``prompt``.

        Args:
            prompt: User content sent to the model.
            system_prompt: Optional system instruction. ``None`` sends none.
            temperature: Sampling temperature.
            top_p: Nucleus sampling parameter.
            max_tokens: Upper bound on generated tokens.

        Returns:
            The text parts of all candidates joined with newlines, or ``None``
            when the call fails or the response carries no text.
        """
        contents = prompt
        system_instruction = system_prompt
        if system_prompt and "gemma" in self.model_name:
            # NOTE(review): Gemma models on the Gemini API appear to reject
            # `system_instruction`; the instruction is folded into the user
            # content instead - confirm against the current API behaviour.
            contents = f"{system_prompt}\n\n{prompt}"
            system_instruction = None

        try:
            response = self.client.models.generate_content(
                model=self.model_name,
                contents=contents,
                config=GenerateContentConfig(
                    system_instruction=system_instruction,
                    temperature=temperature,
                    top_p=top_p,
                    max_output_tokens=max_tokens,
                ),
            )

            texts = []
            # `candidates`, `content` and `parts` may each be missing or None,
            # so every level falls back to an empty sequence.
            for candidate in getattr(response, "candidates", None) or []:
                content = getattr(candidate, "content", None)
                for part in getattr(content, "parts", None) or []:
                    text = getattr(part, "text", None)
                    if text:
                        texts.append(text)

            if not texts:
                logger.warning(f"Gemini returned no text for model {self.model_name}")
                return None
            return "\n".join(texts)
        except Exception as e:
            logger.error(f"Gemini API error: {e}")
            return None

In [94]:
class GLMModel(AIBaseModel):
    """Backend for GLM chat models accessed through ``ZaiClient``."""

    # System message used when the caller does not supply one.
    DEFAULT_SYSTEM_PROMPT = "You are a medical NLP system specialized in medical entity extraction from a given radiology report."

    def __init__(self, api_key: str, model_name: str = "glm-4.5-flash"):
        """Create the client; requests are paced with a fixed 6 second pause."""
        super().__init__(api_key, model_name)
        self.client = ZaiClient(api_key=api_key)
        self.sleep_time = 6

    def invoke(
        self,
        prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0.0,
        top_p: float = 1.0,
        max_tokens: int = 8192,
    ):
        """Run one non-streaming chat completion.

        Args:
            prompt: User message.
            system_prompt: System message; ``None`` or empty falls back to
                ``DEFAULT_SYSTEM_PROMPT``.
            temperature: Sampling temperature.
            top_p: Nucleus sampling parameter.
            max_tokens: Upper bound on generated tokens.

        Returns:
            The stripped text of the first choice, or ``None`` when the call
            fails, returns no choices, or the message has no content.
        """
        try:
            messages = [
                {
                    "role": "system",
                    # Honour the caller's system prompt instead of always
                    # sending the built-in one.
                    "content": system_prompt or self.DEFAULT_SYSTEM_PROMPT,
                },
                {
                    "role": "user",
                    "content": prompt,
                },
            ]

            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=messages,
                temperature=temperature,
                top_p=top_p,
                max_tokens=max_tokens,
                stream=False,
            )

            if not response.choices:
                return None

            content = response.choices[0].message.content
            # A message without content is "no answer", not an API error.
            return content.strip() if content is not None else None
        except Exception as e:
            logger.error(f"GLM API error: {e}")
            if hasattr(e, "status_code"):
                logger.error(f"Status code: {e.status_code}")
            if hasattr(e, "body"):
                logger.error(f"Error body: {e.body}")
            return None

In [95]:
class DeepSeekModel(AIBaseModel):
    """Backend for DeepSeek chat models via the OpenAI-compatible client."""

    # System message used when the caller does not supply one.
    DEFAULT_SYSTEM_PROMPT = "You are a medical NLP system specialized in medical entity extraction from a given radiology report."

    def __init__(self, api_key: str, model_name: str = "deepseek-chat"):
        """Create the client; requests are paced with a fixed 3 second pause."""
        super().__init__(api_key, model_name)
        self.client = OpenAI(
            api_key=api_key,
            base_url="https://api.deepseek.com"
        )
        self.sleep_time = 3

    def invoke(
        self,
        prompt: str,
        system_prompt: str | None = None,
        temperature: float = 0.0,
        top_p: float = 1.0,
        max_tokens: int = 8192,
    ):
        """Run one non-streaming chat completion.

        Args:
            prompt: User message.
            system_prompt: System message; ``None`` or empty falls back to
                ``DEFAULT_SYSTEM_PROMPT``.
            temperature: Sampling temperature.
            top_p: Nucleus sampling parameter.
            max_tokens: Upper bound on generated tokens.

        Returns:
            The stripped text of the first choice, or ``None`` when the call
            fails, returns no choices, or the message has no content.
        """
        try:
            messages = [
                {
                    "role": "system",
                    # Honour the caller's system prompt instead of always
                    # sending the built-in one.
                    "content": system_prompt or self.DEFAULT_SYSTEM_PROMPT,
                },
                {
                    "role": "user",
                    "content": prompt,
                },
            ]

            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=messages,
                temperature=temperature,
                top_p=top_p,
                max_tokens=max_tokens,
                stream=False,
            )

            # Same guard as the GLM backend: an empty choice list is "no answer".
            if not response.choices:
                return None

            content = response.choices[0].message.content
            return content.strip() if content is not None else None
        except Exception as e:
            logger.error(f"DeepSeek API error: {e}")
            return None


## Get Model

In [96]:
def get_ai_model(model: str, model_name: str):
    """Instantiate the backend for a provider id.

    Args:
        model: Provider id; one of the keys of the internal map below.
        model_name: Concrete model identifier forwarded to the backend.

    Returns:
        A backend instance built with the provider's key from ``API_KEYS``.

    Raises:
        ValueError: ``model`` is not a known provider id.
        KeyError: no API key is configured for ``model`` in ``API_KEYS``.
    """
    model_map = {
        "gemini": GeminiModel,
        # API_KEYS carries a separate "gemma" entry and GeminiModel already
        # handles Gemma model names, so the id is accepted as an alias.
        "gemma": GeminiModel,
        "glm": GLMModel,
        "deepseek": DeepSeekModel,
    }

    if model not in model_map:
        raise ValueError(f"Invalid model: {model}. Choices: {list(model_map.keys())}")

    # Providers can be disabled by commenting out their API_KEYS entry; report
    # that explicitly instead of surfacing a bare KeyError on the lookup.
    api_key = API_KEYS.get(model)
    if not api_key:
        raise KeyError(f"No API key configured for provider '{model}'; add it to API_KEYS.")

    return model_map[model](api_key, model_name)

In [97]:
def load_jsonl(path: str):
    """Read a JSON Lines file and return its records as a list.

    Lines that are empty or whitespace-only are skipped; every other line is
    decoded as one JSON document.
    """
    records = []
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue
            records.append(json.loads(raw_line))
    return records

def load_json(path: str):
    """Read a UTF-8 file holding a single JSON document and return it decoded."""
    with open(path, encoding="utf-8") as handle:
        payload = handle.read()
    return json.loads(payload)

In [98]:
def build_prompt(report: str) -> str:
    """Return PROMPT_TEMPLATE with its report placeholder filled by ``report``."""
    placeholder = "<<<REPORT_TEXT>>>"
    filled = PROMPT_TEMPLATE.replace(placeholder, report)
    return filled


In [99]:
def safe_parse_json(text: str):
    """Best-effort extraction of a JSON value from raw model output.

    Strategies are tried in order and the first success is returned:

    1. Parse the whole text after removing Markdown code fences.
    2. Parse the widest ``[...]`` span, then the widest ``{...}`` span.
    3. Starting at each ``[`` (then each ``{``), decode the first complete
       JSON value found there. This recovers output followed by prose that
       itself contains brackets, which defeats the greedy spans of step 2.

    Args:
        text: Raw model output; may be ``None`` or empty.

    Returns:
        The decoded JSON value, or ``None`` when nothing could be decoded.
    """
    if not text:
        return None

    text = re.sub(r"```json|```", "", text, flags=re.IGNORECASE).strip()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Try an array first, then an object, using the widest matching span.
    for pattern in (r"\[.*\]", r"\{.*\}"):
        m = re.search(pattern, text, re.S)
        if m:
            try:
                return json.loads(m.group())
            except ValueError:
                pass

    # Last resort: raw_decode stops at the end of the first complete value,
    # so trailing text after the JSON no longer matters.
    decoder = json.JSONDecoder()
    for opener in "[{":
        start = text.find(opener)
        while start != -1:
            try:
                value, _end = decoder.raw_decode(text, start)
                return value
            except ValueError:
                start = text.find(opener, start + 1)

    return None


In [100]:
def _checkpoint_path(output_path: str) -> str:
    """Return the path of the intermediate checkpoint file for ``output_path``."""
    # Insert "_temp" before the extension so the checkpoint can never be the
    # same file as the final output, even when the path does not end in ".json".
    root, ext = os.path.splitext(output_path)
    return f"{root}_temp{ext}"


def _run_inference(
    dataset,
    output_path: str,
    model_id: str,
    model_name: str,
    id_fields,
    show_parsed: bool,
    checkpoint_every: int = 5,
):
    """Shared extraction loop behind the dataset-specific entry points.

    For every sample the prompt is built from ``sample["report"]``, sent to the
    model, and the answer parsed with ``safe_parse_json``.

    Args:
        dataset: Sized iterable of dict samples holding "report" plus ``id_fields``.
        output_path: Destination of the final file, written as one JSON record
            per line.
        model_id: Provider id passed to ``get_ai_model``.
        model_name: Concrete model name; also stored in every record.
        id_fields: Sample keys copied verbatim to the start of each record.
        show_parsed: Print the parsed output if true, the raw output otherwise.
        checkpoint_every: Write a checkpoint (JSON array of all records so far)
            after every this many samples.

    Returns:
        The list of result records, in dataset order.
    """
    model = get_ai_model(model_id, model_name)
    temp_path = _checkpoint_path(output_path)

    results = []
    for idx, sample in tqdm(enumerate(dataset), total=len(dataset), desc="Processing Samples"):
        report = sample["report"]

        raw_output = model.invoke(
            prompt=build_prompt(report),
            system_prompt=None
        )
        parsed = safe_parse_json(raw_output)
        if parsed is None:
            # Keep the record (entities=None) but make the failure visible.
            logger.warning(f"Sample {idx}: model output could not be parsed as JSON")

        print(f"Report: {report}")
        if show_parsed:
            print(f"Parsed Output: {parsed}")
        else:
            print(f"Raw Output: {raw_output}")

        result = {field: sample[field] for field in id_fields}
        result["report"] = report
        result["model"] = model_name
        result["entities"] = parsed
        results.append(result)

        if (idx + 1) % checkpoint_every == 0:
            with open(temp_path, "w", encoding="utf-8") as f:
                f.write(json.dumps(results))

        # Pace requests according to the backend's rate limit.
        time.sleep(model.sleep_time + 0.2)

    with open(output_path, "w", encoding="utf-8") as f:
        for r in results:
            f.write(json.dumps(r) + "\n")

    return results


def run_inference_radgraph(
    dataset,
    output_path: str,
    model_id: str,
    model_name: str,
):
    """Run entity extraction over samples keyed by "dataset" and "doc_key".

    Each record holds: dataset, doc_key, report, model, entities. The parsed
    output is printed per sample. See ``_run_inference`` for file outputs.
    """
    return _run_inference(
        dataset,
        output_path,
        model_id,
        model_name,
        id_fields=("dataset", "doc_key"),
        show_parsed=True,
    )


def run_inference_ratener(
    dataset,
    output_path: str,
    model_id: str,
    model_name: str,
):
    """Run entity extraction over samples keyed by "note_id".

    Each record holds: note_id, report, model, entities. The raw model output
    is printed per sample. See ``_run_inference`` for file outputs.
    """
    return _run_inference(
        dataset,
        output_path,
        model_id,
        model_name,
        id_fields=("note_id",),
        show_parsed=False,
    )

In [101]:
# Input is a single JSON document (read with load_json). The final output is
# written by run_inference_radgraph as one JSON record per line, even though
# the file name ends in ".json".
INPUT_PATH = "/kaggle/input/random20/radgraphxl-chest-ct-sample-20.json"
OUTPUT_PATH = "/kaggle/working/chest-ct-schema-random20.json"

dataset = load_json(INPUT_PATH)

# model_id selects the backend class and its API key; model_name is the
# concrete model that backend calls. The Gemma model goes through the "gemini"
# backend, whose rate-limit table has a branch for names containing "gemma".
results = run_inference_radgraph(
    dataset=dataset,
    output_path=OUTPUT_PATH,
    model_id="gemini",
    model_name="gemma-3-27b-it",
)

Processing Samples:   0%|          | 0/18 [00:00<?, ?it/s]

Report: CT Pulmonary Embolism Chest 12 - 05 - 06 HISTORY : 73 - year - old male with history of renal cell carcinoma now with chest pain and shortness of breath . Possibly pneumonia in the left upper lobe . Evaluate for pulmonary embolism . COMPARISON : CT chest abdomen and pelvis 10 - 19 - 2006 . TECHNIQUE : Multiple contiguous axial images were obtained from the thoracic inlet to the domes of the diaphragms utilizing the standard CT pulmonary embolism chest protocol . 100 mL of Isovue - 300 was given without adverse reaction . Additional 3 - D reconstructed images were performed and reviewed . FINDINGS : Subcentimeter hypodense lesions are seen in the thyroid gland . A large subcarinal lymph node is now noted measuring 21 mm which is new since the prior study . Left hilar / AP window lymphadenopathy has increased in size from prior study , now measuring 3 . 1 x 5 . 1 cm ( previously 2 . 8 x 3 . 8 cm ) . The heart is normal in size . There is no pericardial effusion . Coronary arteria

Processing Samples:   6%|▌         | 1/18 [00:32<09:08, 32.25s/it]

Report: Addendum Begins Coronal and sagittal MPR and MIP reformations were performed on an off - line 3D workstation and corroborate the findings . Addendum Ends CT PULMONARY ANGIOGRAM : 1 / 20 / 2021 COMPARISON : None . CLINICAL HISTORY : Acute on chronic dyspnea . TECHNIQUE : After administration of IV contrast , a pulmonary CT angiogram was obtained . FINDINGS : VASCULAR : Evaluation is limited at the bases of the lungs by motion . No evidence of pulmonary embolism . The pulmonary artery measures 27 mm in diameter , which is within normal limits . There is a tight stenosis of the origin of the left carotid artery and a mild stenosis of the left subclavian artery , just distal to its origin . Diffuse aortic calcifications are seen . In addition , there is a small splenic artery aneurysm at the splenic hilum measuring 10 mm in diameter . The aneurysm is heavily calcified peripherally . CHEST AND MEDIASTINUM : There is severe centrilobular emphysema , trending towards being panlobular 

Processing Samples:  11%|█         | 2/18 [01:01<08:04, 30.26s/it]

Report: EXAM : CT Pulmonary Embolism Chest Only JANUARY 21 , 2002 2 : 12 PM CLINICAL HISTORY : R / o PE COMPARISON : 00 , December 27th CONTRAST : 111 mL of Isovue 370 TECHNIQUE : After the uneventful administration of IV contrast , contiguous axial images were acquired through the thorax during the pulmonary arterial phase at 1 . 25 mm and 5 mm slice slice thicknesses . Axial images were then acquired from the pelvis to the knees during the systemic venous phase . Sagittal , coronal , and maximum intensity projection were performed also obtained . Multiplanar reformations and maximum intensity projections were performed on an independent 3D workstation and corroborate the findings . FINDINGS : The heart size is normal without a pericardial effusion . The aorta and great vessels are normal in course and caliber . The pulmonary arterial tree is well opacified , without filling defects to suggest pulmonary embolism . The main pulmonary artery is normal in course and caliber . Scattered a

Processing Samples:  17%|█▋        | 3/18 [01:32<07:40, 30.67s/it]

Report: CHEST CT : 9 / 27 / 2014 CLINICAL HISTORY : A 68 - year - old male with history of bronchiectasis . Follow - up . COMPARISON : 4 - 10 - 14 . TECHNIQUE : Unenhanced discontiguous axial high resolution CT imaging of the chest from the thoracic inlet to the upper abdomen including prone and supine positioning . FINDINGS : Mild coronary artery calcification is again noted . There is no pericardial or pleural effusion . The unenhanced appearance of the heart and thoracic vasculature is otherwise unremarkable . No suspicious , new mediastinal , hilar , or axillary adenopathy is identified . The central airways appear patent . There has been an interval increase in centrilobular ground - glass nodules within the left lower lobe , which is incompletely visualized , and likely related to infection and / or aspiration . The visualized lungs are otherwise unchanged . This includes an approximately 2 . 7 x 1 . 4 cm opacity with associated bronchiectasis in the posterior segment of the righ

Processing Samples:  22%|██▏       | 4/18 [02:18<08:34, 36.74s/it]

Report: CT Pulmonary Embolism Chest Only 1 / 21 / 2005 11 : 48 PM HISTORY : tachycardia , history of left thigh sarcoma COMPARISON : CT chest abdomen and pelvis 01 - 16 - 2005 TECHNIQUE : Multiplanar contrast - enhanced angiogram of the chest in pulmonary arterial phase was obtained after the uneventful administration of 110 mL Isovue 370 IV contrast . Curved planar reformats , volume rendered , and maximum intensity projection images were performed on an independent 3 - D workstation . FINDINGS : Exam is limited due to respiratory motion at the lung bases . The visualized thyroid gland is unremarkable . The main pulmonary artery is of normal caliber and there are no filling defects within the pulmonary arteries . The heart is normal in size , with a normal branching pattern of the great vessels . No pericardial effusion . Visualized great vessels and the thoracic and visualized abdominal aorta are of normal caliber . There are prominent axillary lymph nodes bilaterally , similar to pr

Processing Samples:  28%|██▊       | 5/18 [02:53<07:50, 36.23s/it]

Report: STUDY : CT chest high - resolution 03 / 24 / 20 at 1150 hrs HISTORY : 70 - year - old female with history of pulmonary fibrosis . Follow - up examination . COMPARISON : None TECHNIQUE : Multiple 1 mm axial CT images of the chest were obtained every 10 mm in the supine position and every 20 mm in the prone position . FINDINGS : Visualized portions of the thyroid gland are unremarkable . No supraclavicular , axillary , or mediastinal lymphadenopathy . Lack of intravenous contrast limits evaluation of the hila . Ascending aortic caliber is normal . The main pulmonary artery measures 35 mm in diameter , which is dilated and suggestive of pulmonary arterial hypertension . Heart is normal in size . No pericardial effusion . There is coronary and thoracic aortic arteriosclerosis . Mitral and aortic annular calcification is incidentally noted . Evaluation of the lung parenchyma demonstrates extensive peribronchial thickening , irregular parenchymal lines , peripheral intralobular and s

Processing Samples:  33%|███▎      | 6/18 [03:47<08:27, 42.27s/it]

Report: CT ANGIOGRAM CHEST CLINICAL HISTORY : Hypoxia . COMPARISON STUDY : Multiple prior CT scans , most recently from 9 / 20 / 2016 . TECHNIQUE : Following the IV administration of 95 cc of Isovue 370 , CT scan of the chest was performed from the thoracic inlet to the upper abdomen using a pulmonary embolus protocol . 3D reformations consisting of curved and multiplanar reformations , maximum intensity projections , and volume rendered images were performed on an independent workstation and corroborate the findings . IV contrast was administered without complication . FINDINGS : Thyroid : Imaged portions of the thyroid gland are unremarkable . Arch : The thoracic aorta is normal in caliber and demonstrates standard 3 - vessel arch anatomy . Heart : The heart is normal in size and configuration , without pericardial effusion . Pulmonary Vasculature : There is no evidence of pulmonary embolus in the main , lobar , or segmental pulmonary arteries . Lungs / Pleura : There is a 2 - mm pul

Processing Samples:  39%|███▉      | 7/18 [04:06<06:22, 34.77s/it]

Report: Addendum Begins MPR reformations and maximum intensity projections were performed on an independent 3D workstation and corroborate the findings . Addendum Ends CT ANGIOGRAM OF THE CHEST : OCTOBER 2006 COMPARISON : CT chest , abdomen and pelvis on 12 / 28 / 1997 , PET / CT on 2003 - 11 - 20 . CLINICAL HISTORY : A 45 - year - old man with history of papillary thyroid cancer and melanoma , presenting with shortness of breath . TECHNIQUE : Multiple 1 . 25 mm axial images of the thorax were obtained after the uneventful administration of IV contrast in the pulmonary arterial phase . Coronal and sagittal reformats were performed . FINDINGS : Evaluation of the pulmonary arteries demonstrates no filling defects to suggest pulmonary embolism . Evaluation of the mediastinum demonstrates surgical clips in the thyroid bed , compatible with previous thyroidectomy . Small amount of flat , non - confluent fibrofatty soft tissue is seen in the prevascular space , presumed to represent some res

Processing Samples:  44%|████▍     | 8/18 [04:27<05:02, 30.27s/it]

Report: EXAM : CT Angio Thorax 12 / 20 / 2004 10 : 49 AM CLINICAL HISTORY : 73 - year - old male , history of aortic stenosis . COMPARISON : None . CONTRAST : Isovue - 300 , 116 ml TECHNIQUE : After the uneventful administration of IV contrast , contiguous axial images were acquired through the thorax during the systemic arterial phase at 1 . 25 mm and 5 mm slice slice thicknesses . Sagittal , coronal and maximum intensity projection reformations were performed . Additional 3 - D reformatted images were performed and reviewed on an independent workstation . FINDINGS : VASCULATURE : The aortic valve is a calcified . The patient is status post 3 vessel CABG , with all 3 grafts appear patent . The left ventricle appears enlarged . The proximal right subclavian artery is ectatic , measuring 16 mm in diameter . The aorta is normal in course and caliber . The main pulmonary artery is normal in course and caliber . While not a dedicated pulmonary embolism study , no filling defects are seen i

Processing Samples:  50%|█████     | 9/18 [04:56<04:28, 29.80s/it]

Report: CT Pulmonary Embolism Chest Only 2 / 27 / 2011 6 : 11 PM HISTORY : 75 - year - old male with concern for pulmonary embolism . COMPARISON : Chest x - ray 10 / 28 / 04 TECHNIQUE : Multiplanar contrast - enhanced angiogram of the chest in pulmonary arterial phase was obtained after the uneventful administration of IV contrast . Curved planar reformats , volume rendered , and maximum intensity projection images were performed on an independent 3 - D workstation . CONTRAST : 72 cc Isovue - 300 . FINDINGS : Superoanterior mediastinal mass measuring approximately 4 cm which appears to be in contiguity with the left thyroid lobe . Limited evaluation of the pulmonary arteries secondary to the motion with poor visualization of the subsegmental pulmonary emboli . Evaluation of the pulmonary arteries demonstrates no filling defects to suggest pulmonary emboli of the main , lobar or proximal segmental pulmonary arteries . The main pulmonary arteries prominent in size measuring 3 . 8 cm . Ev

Processing Samples:  56%|█████▌    | 10/18 [05:31<04:10, 31.36s/it]

Report: EXAM : CT Pulmonary Embolism Chest Only DATE : 5 / 18 / 2017 7 : 08 PM CLINICAL HISTORY : Syncope COMPARISON : CT chest , abdomen and pelvis with contrast 2017 - 05 - 03 . CONTRAST : 83 cc Omnipaque 350 TECHNIQUE : Following uneventful administration of iodinated intravenous contrast media , multidetector helical CT data acquisition was performed through the thorax during pulmonary arterial phase of contrast . Data were reconstructed at 1 . 25 - mm and 5 - mm axial section thicknesses . Multiplanar reformatted , 3 - D volume rendered , and maximum intensity projection images were reviewed on an independent workstation . FINDINGS : There has been prior left mastectomy . There is re - demonstration of nodularity and skin irregularity involving the left breast region . Redemonstration of the right breast prosthesis . No interval change in multiple prominent left axillary lymph nodes . Evaluation of the pulmonary arterial tree demonstrates no intraluminal filling defects identified

Processing Samples:  61%|██████    | 11/18 [06:04<03:43, 31.89s/it]

Report: EXAM : CT Pulmonary Embolism Chest Only 5 / 18 / 2016 3 : 42 PM CLINICAL HISTORY : 28 - year - old female with postoperative shortness of breath and known pleural effusion referred to evaluate for pulmonary embolus . COMPARISON : None CONTRAST : 80 mL Isovue 370 TECHNIQUE : After the uneventful administration of IV contrast , contiguous axial images were acquired through the chest in the pulmonary arterial phase . Data set was reconstructed into multiple slice thicknesses and reformatted in the sagittal and coronal planes . 3 - D reformations were performed on an independent workstation . DOSE : Based on a 32 cm body phantom , the estimated radiation dose ( CTDIvol mGy ) for each series in this exam are : 1 . 4 , 4 . 2 , 12 . 1 mGy . The estimated cumulative dose ( DLP mGy - cm ) is : 333 mGy - cm . NOTE : The radiation dose indicators for CT -- the ' volume CT Dose Index ' ( CTDIvol ) given in milli - Gray ( mGy ), and the Dose Length Product ( DLP ) given in mGy - centimeters

Processing Samples:  67%|██████▋   | 12/18 [06:33<03:07, 31.17s/it]

Report: Addendum Begins Coronal and sagittal MPR reformations and maximum intensity projections were performed on an off - line 3D workstation . Addendum Ends CT PULMONARY EMBOLISM : 2001 / 04 / 08 . CLINICAL HISTORY : This is a 74 - year - old man with a history of rectal cancer status post preoperative XRT and chemotherapy , now status post abdominal and perineal resection with end - colostomy complicated by an NSTEMI . There is concern for pulmonary embolism . COMPARISON : PET / CT 01 - 03 - 25 . TECHNIQUE : 1 . 25 mm and 5 mm contiguous axial images were taken through the chest following the uneventful administration of intravenous contrast ( 120 cc of Omnipaque 350 ) . Timing was performed during the pulmonary arterial phase . FINDINGS : The main pulmonary artery and its branches were not optimally opacified , with maximum opacification of the main pulmonary artery of 215HU . While this study was slightly limited for evaluation of subsegmental pulmonary embolism , the segmental pu

Processing Samples:  72%|███████▏  | 13/18 [07:10<02:43, 32.77s/it]

Report: EXAM : CT Pulmonary Embolism Chest Only March 9 7 : 51 AM CLINICAL HISTORY : PEA arrest , concern for PE COMPARISON : 7 / 18 / 2005 CONTRAST : 75 cc of Isovue - 300 TECHNIQUE : After the uneventful administration of IV contrast , contiguous axial images were acquired through the thorax during the pulmonary arterial phase . Sagittal , coronal , maximum intensity projection , curved planar , and volume rendered reformations were performed on an independent 3 - D workstation . Based on a 32 cm body phantom , the estimated radiation dose ( CTDIvol mGy ) for each series in this exam are : 9 . 4 , 9 . 2 mGy . The estimated cumulative dose ( DLP mGy - cm ) is : 339 mGy - cm . NOTE : The radiation dose indicators for CT -- the ' volume CT Dose Index ' ( CTDIvol ) given in milli - Gray ( mGy ), and the Dose Length Product ( DLP ) given in mGy - centimeters ( mGy . cm ) -- are generated from the CT scanner to estimate radiation exposure based on technical study parameters and a reference

Processing Samples:  78%|███████▊  | 14/18 [07:51<02:20, 35.21s/it]

Report: CT Angio Thorax 5 - 17 - 2006 6 : 41 AM CLINICAL HISTORY : Gunshot victim COMPARISON : None CONTRAST : 150 mL Isovue 300 TECHNIQUE : Noncontrast CT images were acquired through the chest , abdomen , and pelvis . After the uneventful administration of IV contrast , contiguous CT images were acquired through the chest with retrospective cardiac gating . Additional CT images were acquired of the abdomen and pelvis in venous phase . Standard reformats were performed . Curved planar reformats , volume rendered , and maximum intensity projection images were performed on an independent 3 - D workstation . FINDINGS : Gunshot track # 1 : Tract enters the right anterior shoulder , causes a comminuted fracture of the proximal right humerus and humeral head . Tract continues anterior to the scapula with associated subcutaneous emphysema , before proceeding through the lateral aspect of the right third rib with associated fracturing . Tract generates lung parenchymal laceration / contusion 

Processing Samples:  83%|████████▎ | 15/18 [08:37<01:56, 38.72s/it]

Report: Addendum Begins *** Preliminary measurements were made during primary interpretation . The following protocol measurements were obtained in the 3D Laboratory on an independent workstation : **** **** Start of 3D Lab Measurements Report **** MRN : 246252373 Name : Mr Kyla French AccNO : 8513506 Scan Date : 2015 JUNE 27TH Protocol : Vessel Narrowing Measurements Diameters in mm Major Minor Orthogonal diameter at narrowing - proximal desc aorta , flow lumen 12 . 60 12 . 50 Orthogonal diameter just distal to narrowing - desc aorta 24 . 80 21 . 70 Outer wall of vessel at narrowing - proximal desc aorta 14 . 40 14 . 20 Protocol : Lesion Lesion1 diameter of os in mm 6 . 00 6 . 00 Lesion1 length 18 . 90 Lesion1 depth in mm 12 . 50 Aorta at . - ascending aorta , level of pseudoaneurysm 39 . 70 30 . 40 **** End of 3D Lab Measurements Report **** Addendum Ends CT ANGIOGRAM OF THE CHEST , ABDOMEN AND PELVIS : 6 / 27 / 2015 TECHNIQUE : Continuous noncontrast axial images of the chest were o

Processing Samples:  89%|████████▉ | 16/18 [09:23<01:21, 40.79s/it]

Report: EXAM : CT Pulmonary Embolism Chest Only DATE : 8 - 4 - 2019 5 : 32 PM CLINICAL HISTORY : Hypoxia , dyspnea on exertion in a patient with chest pain and hypotension . COMPARISON : Chest x - ray from earlier same day august 04th CONTRAST : 115 cc of Isovue - 300 TECHNIQUE : Following uneventful administration of iodinated intravenous contrast media , multidetector helical CT data acquisition was performed through the thorax during pulmonary arterial phase of contrast . Multiplanar reformatted , 3 - D volume rendered , and maximum intensity projection images were reviewed on an independent workstation . Based on a 32 cm body phantom , the estimated radiation dose ( CTDIvol mGy ) for each series in this exam are : 22 . 5 , 28 . 9 mGy . The estimated cumulative dose ( DLP mGy - cm ) is : 853 . 9 mGy - cm . NOTE : The radiation dose indicators for CT -- the ' volume CT Dose Index ' ( CTDIvol ) given in milli - Gray ( mGy ), and the Dose Length Product ( DLP ) given in mGy - centimete

Processing Samples:  94%|█████████▍| 17/18 [09:53<00:37, 37.48s/it]

Report: EXAM : CT Angio Thorax , 1 - 12 - 17 9 : 24 AM HISTORY : 25 years Male presents to the emergency department after a motor vehicle collision . COMPARISON : Chest radiograph 1 - 12 - 17 TECHNIQUE : After the uneventful administration of intravenous contrast , multiple contiguous axial CT images were acquired through the chest during the arterial phase after contrast administration . Coronal and sagittal reformatted images , as well as maximum intensity images were provided . CONTRAST : Omnipaque 350 , 120 mL . DOSE : Based on a 32 cm body phantom , the estimated radiation dose ( CTDIvol mGy ) for each series in this exam are : 32 . 83 , 16 . 83 , 18 . 19 , 8 . 93 mGy . The estimated cumulative dose ( DLP mGy - cm ) is : 1806 mGy - cm . NOTE : The radiation dose indicators for CT -- the ' volume CT Dose Index ' ( CTDIvol ) given in milli - Gray ( mGy ), and the Dose Length Product ( DLP ) given in mGy - centimeters ( mGy . cm ) -- are generated from the CT scanner to estimate radi

Processing Samples: 100%|██████████| 18/18 [10:26<00:00, 34.81s/it]


## Saving as Pretty JSON 

In [102]:
# Re-serialise the line-delimited inference output as indented JSON so that
# individual records are easy to read by eye. `json` comes from the notebook's
# import cell, so it is not re-imported here.
INPUT_PATH = "/kaggle/working/chest-ct-schema-random20.json"
# NOTE(review): the file written below keeps a ".jsonl" extension, but indented
# records separated by blank lines are NOT valid JSON Lines (one record per
# line). It is meant for human inspection only; the path is left unchanged so
# that anything already pointing at it keeps working.
OUTPUT_PATH = "/kaggle/working/chest-ct-schema.pretty-random20.jsonl"

with open(INPUT_PATH, "r", encoding="utf-8") as fin, open(OUTPUT_PATH, "w", encoding="utf-8") as fout:
    for line_no, raw_line in enumerate(fin, start=1):
        stripped = raw_line.strip()
        if not stripped:
            # Blank lines carry no record; skip them.
            continue

        try:
            record = json.loads(stripped)
        except json.JSONDecodeError as exc:
            # Say which input line is broken instead of surfacing a bare
            # decoder error. JSONDecodeError is itself a ValueError, so code
            # that catches ValueError keeps working.
            raise ValueError(
                f"{INPUT_PATH}, line {line_no}: invalid JSON ({exc})"
            ) from exc

        # Write each record indented, followed by an empty line as a separator.
        fout.write(json.dumps(record, ensure_ascii=False, indent=2) + "\n\n")

print("Saved ->", OUTPUT_PATH)


Saved -> /kaggle/working/chest-ct-schema.pretty-random20.jsonl


In [103]:
# NOTE(review): disabled cell. Every line below is commented out, so running
# this cell does nothing. It looks like a RadGraph inference run kept for
# reference. Before re-enabling, confirm that `load_jsonl` and
# `run_inference_radgraph` are defined earlier in the notebook, and be aware
# that it would rebind INPUT_PATH / OUTPUT_PATH set by the previous cell.
# INPUT_PATH = "/kaggle/input/radgraph/stanford-radgraph-XL-sentence.jsonl"
# OUTPUT_PATH = "/kaggle/working/stanford-radgraph-XL-mapped.jsonl"

# dataset = load_jsonl(INPUT_PATH)[1:]

# results = run_inference_radgraph(
#     dataset=dataset,
#     output_path=OUTPUT_PATH,
#     model_id="deepseek",
#     model_name="deepseek-chat",
# )