In [None]:
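# NER: candidate entity labels for radiology reports
#   - severity
#   - absence
#   - presence
#   - anatomy
#   - observation
#
# Further items noted (TODO: define how these are modelled):
#   - anatomical
#   - relations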
 


In [None]:
# References:
#   - CheXpert competition page: https://stanfordmlgroup.github.io/competitions/chexpert/
#   - RaTE-NER-Deberta model on the Hugging Face Hub: https://huggingface.co/Angelakeke/RaTE-NER-Deberta

In [2]:
# Import the required modules and classes
from sparknlp.base import DocumentAssembler, Pipeline
from sparknlp.annotator import (
    Tokenizer,
    SentenceDetector,
    BertEmbeddings
)

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import torch
import pandas as pd

class MedicalNERPipeline:
    """Small wrapper around a Hugging Face ``"ner"`` pipeline.

    The constructor loads the tokenizer and token-classification model named
    by ``model_name`` and wires them into ``pipeline("ner", ...)`` using
    ``aggregation_strategy="simple"``. The resulting callable is stored on
    ``self.nlp``.

    NOTE(review): the default checkpoint looks like a general clinical BERT
    encoder rather than a model fine-tuned for NER -- confirm that its
    token-classification labels are meaningful before relying on the output.
    """

    def __init__(self, model_name="emilyalsentzer/Bio_ClinicalBERT"):
        """Load tokenizer and model for ``model_name`` and build the pipeline."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)

        # Collect the pipeline options first so the construction reads as one call.
        pipeline_options = {
            "model": self.model,
            "tokenizer": self.tokenizer,
            "aggregation_strategy": "simple",
        }
        self.nlp = pipeline("ner", **pipeline_options)

    def process_text(self, text):
        """Run the NER pipeline on one text and return whatever it yields."""
        return self.nlp(text)

    def process_batch(self, texts):
        """Run ``process_text`` on every item of ``texts``, in order.

        Returns a list with one pipeline result per input text.
        """
        return [self.process_text(single_text) for single_text in texts]

def process_radiology_reports(reports, ner_pipeline=None):
    """
    Run NER over a collection of radiology reports and organize the findings.

    Args:
        reports: Iterable of report strings. Any iterable is accepted
            (including generators); a single ``str`` is treated as one report.
        ner_pipeline: Optional object exposing ``process_batch(texts)`` that
            returns one list of entity dicts per text. When omitted, a
            ``MedicalNERPipeline()`` with its default model
            (Bio_ClinicalBERT) is created. Pass an existing instance to avoid
            reloading the model on every call.

    Returns:
        A list with one dict per report, in input order:
        ``{'report': <report text>, 'findings': [<finding>, ...]}`` where each
        finding has the keys ``text``, ``label``, ``score``, ``start`` and
        ``end`` copied from the entity's ``word``, ``entity_group``,
        ``score``, ``start`` and ``end`` fields.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(reports, str):
        reports = [reports]

    # The reports are traversed twice (once for NER, once when pairing each
    # report with its entities), so a one-shot iterator must be materialized
    # first; otherwise the second pass is empty and no results are returned.
    reports = list(reports)

    # Nothing to do: skip the (expensive) model load entirely.
    if not reports:
        return []

    # Initialize pipeline only when the caller did not supply one
    if ner_pipeline is None:
        ner_pipeline = MedicalNERPipeline()

    # Process reports
    results = ner_pipeline.process_batch(reports)

    # Extract findings and organize results
    organized_results = []
    for report, entities in zip(reports, results):
        findings = [
            {
                'text': entity['word'],
                'label': entity['entity_group'],
                'score': entity['score'],
                'start': entity['start'],
                'end': entity['end']
            }
            for entity in entities
        ]
        organized_results.append({
            'report': report,
            'findings': findings,
        })

    return organized_results

if __name__ == "__main__":
    # Two sample chest-radiograph reports used to demonstrate the pipeline.
    reports = [
        """Chest radiograph demonstrates bilateral lower lobe infiltrates 
        with small pleural effusions. No pneumothorax identified.""",
        """Heart size is normal. Lungs are clear without focal consolidation. 
        No pleural effusion or pneumothorax.""",
    ]

    # Run NER over the samples
    results = process_radiology_reports(reports)

    # Show a 50-character preview of each report followed by its findings
    for result in results:
        preview = result['report'][:50]
        print("\nReport:", preview, "...")
        print("Findings:")
        for finding in result['findings']:
            print(f"- {finding['text']} ({finding['label']}: {finding['score']:.3f})")