# First, cleaning our medical knowledge base

In [23]:
import os

def get_dataset_paths(start_dir=None, max_levels=5):
    """Find the dataset root and build the absolute paths used by the later cells.

    Starting at ``start_dir`` and walking towards the filesystem root, the first
    directory that either contains a ``medicalKnowledgeBase`` folder, or contains
    ``mimic-iv-ext-direct-1.0.0/medicalKnowledgeBase``, determines the dataset root.

    Args:
        start_dir: Directory to start searching from. Defaults to the current
            working directory.
        max_levels: How many directories to inspect (the start directory plus
            its ancestors). Defaults to 5.

    Returns:
        A dict with the keys ``root``, ``kb``, ``patient_cases``, ``finished``,
        ``output_processed`` and ``output_combined``, or ``None`` when no dataset
        root was found (an error message is printed in that case).
    """
    dataset_dir_name = "mimic-iv-ext-direct-1.0.0"
    kb_dir_name = "medicalKnowledgeBase"

    current_dir = os.getcwd() if start_dir is None else os.path.abspath(start_dir)
    print(f"Searching for dataset root from: {current_dir}")

    check_path = current_dir
    dataset_root = None

    for _ in range(max_levels):
        # Case 1: we are inside the dataset folder itself.
        # isdir (not exists) so a stray *file* with that name is not mistaken for the root.
        if os.path.isdir(os.path.join(check_path, kb_dir_name)):
            dataset_root = check_path
            break

        # Case 2: we are in the parent of the dataset folder.
        nested_root = os.path.join(check_path, dataset_dir_name)
        if os.path.isdir(os.path.join(nested_root, kb_dir_name)):
            dataset_root = nested_root
            break

        parent = os.path.dirname(check_path)
        if parent == check_path:
            # Reached the filesystem root; nothing further up to inspect.
            break
        check_path = parent

    if not dataset_root:
        print("ERROR: Could not find dataset root containing 'medicalKnowledgeBase'.")
        return None

    print(f"Dataset root found at: {dataset_root}")
    return {
        "root": dataset_root,
        "kb": os.path.join(dataset_root, kb_dir_name, "Diagnosis_flowchart"),
        "patient_cases": os.path.join(dataset_root, "patient_cases"),
        "finished": os.path.join(dataset_root, "Finished"),
        "output_processed": os.path.join(dataset_root, "patient_cases_processed.json"),
        "output_combined": os.path.join(dataset_root, "combined_rag_data.json"),
    }

# Resolve the dataset locations once; the cells below read this module-level
# `paths` value and skip their work when it is None (dataset root not found).
paths = get_dataset_paths()

Searching for dataset root from: c:\Users\Muhammad Abu Huraira\Documents\Assignments and Submissions\Semester 7\NLP\A04\NLP_Project04_RAG\mimic-iv-ext-direct-1.0.0\My_dataset
Dataset root found at: c:\Users\Muhammad Abu Huraira\Documents\Assignments and Submissions\Semester 7\NLP\A04\NLP_Project04_RAG\mimic-iv-ext-direct-1.0.0


In [24]:
import os
import json

if paths:
    folder_path = paths["kb"]
    print(f"Reading KB from: {folder_path}")
    
    all_chunks = []

    def Convertorfunction(data, parent_key=""):
        """Flatten a nested dict into a list of ``{"medicalKB": "<path>: <value>"}`` chunks.

        Keys are joined with ``/`` to form the path of each entry. String values
        are emitted as-is, list values are emitted as their JSON text, and dict
        values are flattened recursively. Values of any other type are skipped.

        Args:
            data: Dict to flatten.
            parent_key: Path prefix accumulated by the recursion ("" at the top level).

        Returns:
            A list of single-key dicts, in the iteration order of ``data``.
        """
        items = []
        for key, value in data.items():
            new_key = f"{parent_key}/{key}" if parent_key else key

            if isinstance(value, str):
                items.append({"medicalKB": f"{new_key}: {value}"})

            elif isinstance(value, list):
                # Render the real list instead of a hard-coded "[]" so non-empty
                # lists keep their contents; an empty list still renders as "[]".
                rendered = json.dumps(value, ensure_ascii=False)
                items.append({"medicalKB": f"{new_key}: {rendered}"})

            elif isinstance(value, dict):
                items.extend(Convertorfunction(value, new_key))

        return items

    # Read every *.json file in the KB folder and collect its flattened chunks.
    if not os.path.exists(folder_path):
        print(f"Folder not found: {folder_path}")
    else:
        json_file_names = [
            name for name in os.listdir(folder_path) if name.endswith(".json")
        ]
        for file_name in json_file_names:
            file_path = os.path.join(folder_path, file_name)

            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            all_chunks.extend(Convertorfunction(data))

        # --- Save output (currently disabled) ---
        # Saving to the dataset root for consistency
        # output_path = os.path.join(paths["root"], "ragFile.json")
        # with open(output_path, "w", encoding="utf-8") as f:
        #     json.dump(all_chunks, f, indent=2, ensure_ascii=False)

        print(f"Done! Processed {len(all_chunks)} chunks.")
else:
    print("Paths not initialized. Run the first cell.")

Reading KB from: c:\Users\Muhammad Abu Huraira\Documents\Assignments and Submissions\Semester 7\NLP\A04\NLP_Project04_RAG\mimic-iv-ext-direct-1.0.0\medicalKnowledgeBase\Diagnosis_flowchart
Done! Processed 192 chunks.


In [25]:
import os
import json

if paths:
    folder_path = paths["patient_cases"]
    output_file = paths["output_processed"]

    print(f"Scanning directory: {folder_path}")

    processed_cases = []

    def process_patient_case(file_path, file_name, disease_group, specific_disease=None):
        """Load one patient-case JSON file and split it into inputs and reasoning.

        Top-level keys starting with "input" (case-insensitive) are treated as
        inputs; all remaining keys are kept verbatim as the reasoning part.
        Inputs are normalised to the six slots ``input1`` .. ``input6``: a
        missing, falsy or blank value becomes "NA", and strings are stripped.

        Returns:
            The case entry dict, or None when the file is not valid JSON
            (a message is printed in that case).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                data = json.load(handle)
        except json.JSONDecodeError:
            print(f"Error decoding JSON: {file_path}")
            return None

        # Partition the top-level keys into inputs and reasoning
        inputs = {k: v for k, v in data.items() if k.lower().startswith("input")}
        reasoning = {k: v for k, v in data.items() if not k.lower().startswith("input")}

        # Standardize to input1..input6
        cleaned_inputs = {}
        for slot in (f"input{number}" for number in range(1, 7)):
            # First key matching the slot name, ignoring case
            raw = next((v for k, v in inputs.items() if k.lower() == slot), None)

            if not raw or (isinstance(raw, str) and raw.strip() == ""):
                raw = "NA"

            cleaned_inputs[slot] = raw.strip() if isinstance(raw, str) else raw

        return {
            "file_name": file_name,
            "disease_group": disease_group,
            "specific_disease": specific_disease if specific_disease else "NA",
            "reasoning": reasoning,
            "inputs": cleaned_inputs
        }

    # Walk through the directory
    if os.path.exists(folder_path):
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                if not file.endswith(".json"):
                    continue

                file_path = os.path.join(root, file)

                # Folder layout relative to the scan root: Finished/<group>[/<disease>]
                rel_path = os.path.relpath(root, folder_path)
                path_parts = rel_path.split(os.sep)

                # Only files below a top-level 'Finished' folder are cases
                if len(path_parts) <= 1 or path_parts[0] != "Finished":
                    continue

                disease_group = path_parts[1]
                specific_disease = path_parts[2] if len(path_parts) > 2 else None

                case_data = process_patient_case(file_path, file, disease_group, specific_disease)
                if case_data:
                    processed_cases.append(case_data)

        # Save the processed data (currently disabled)
        # with open(output_file, "w", encoding="utf-8") as f:
        #     json.dump(processed_cases, f, indent=2, ensure_ascii=False)

        print(f"Processed {len(processed_cases)} cases.")
    else:
        print(f"Error: Directory not found: {folder_path}")
else:
    print("Paths not initialized.")

Scanning directory: c:\Users\Muhammad Abu Huraira\Documents\Assignments and Submissions\Semester 7\NLP\A04\NLP_Project04_RAG\mimic-iv-ext-direct-1.0.0\patient_cases
Processed 511 cases.


# Combined Output: Knowledge Base + Patient Cases
This cell creates a single JSON file with:
- Upper part: Medical Knowledge Base entries
- Lower part: Patient case studies with reasoning flattened to a single string

In [26]:
import os
import json
import re

if paths:
    kb_folder_path = paths["kb"]
    # Note: this cell walks the 'Finished' folder (paths["finished"]) directly,
    # whereas the earlier cell walks paths["patient_cases"] and expects a
    # 'Finished' subfolder inside it. Switch to paths["patient_cases"] here only
    # if the folder-depth handling further down is adjusted to match.
    patient_cases_folder = paths["finished"]
    output_file = paths["output_combined"]

    print(f"Knowledge Base Path: {kb_folder_path}")
    print(f"Patient Cases Path: {patient_cases_folder}")

    # Function to clean anonymization placeholders (___) from text
    def clean_text(text):
        """Remove ``___`` placeholders from ``text`` and tidy the spacing left behind.

        Non-string values are returned unchanged. For strings, each ``___``
        (with any surrounding whitespace) becomes a single space, runs of spaces
        are collapsed, spaces directly before ``, . : ;`` are removed, and the
        result is stripped.
        """
        if not isinstance(text, str):
            return text

        # Applied in order: placeholder removal, space collapsing, punctuation tidy-up
        substitutions = (
            (r'\s*___\s*', ' '),
            (r' +', ' '),
            (r' +([,.:;])', r'\1'),
        )
        result = text
        for pattern, replacement in substitutions:
            result = re.sub(pattern, replacement, result)
        return result.strip()

    # Running counter behind the sequential KB_#### ids (shared across calls)
    kb_id_counter = 0

    def Convertorfunction(data, parent_key=""):
        """Turn a nested knowledge-base dict into a flat list of id/topic/knowledge entries.

        A dict that holds at least one string value and no dict or non-empty
        list values is treated as a single "concept": one entry is produced whose
        topic is ``parent_key`` and whose knowledge is the dict of its string
        values. Otherwise each key is handled on its own: strings and non-empty
        lists become one entry each (topic is the ``/``-joined key path), dicts
        are processed recursively, and empty lists and other types are skipped.

        Every entry consumes the next value of the module-level ``kb_id_counter``.
        """
        global kb_id_counter

        values = list(data.values())
        has_nested = any(
            isinstance(v, dict) or (isinstance(v, list) and len(v) > 0)
            for v in values
        )
        has_text = any(isinstance(v, str) for v in values)

        if has_text and not has_nested:
            # Concept node: keep all of its string attributes together in one entry
            kb_id_counter += 1
            return [{
                "id": f"KB_{kb_id_counter:04d}",
                "topic": parent_key,
                "knowledge": {k: v for k, v in data.items() if isinstance(v, str)}
            }]

        entries = []
        for key, value in data.items():
            topic = f"{parent_key}/{key}" if parent_key else key

            if isinstance(value, dict):
                entries.extend(Convertorfunction(value, topic))
                continue

            # Empty lists carry no knowledge, so they get no entry
            if isinstance(value, list) and len(value) == 0:
                continue

            if isinstance(value, (str, list)):
                kb_id_counter += 1
                entries.append({
                    "id": f"KB_{kb_id_counter:04d}",
                    "topic": topic,
                    "knowledge": value
                })

        return entries

    knowledge_base_entries = []

    if os.path.exists(kb_folder_path):
        # os.listdir() returns names in arbitrary order. The KB_#### ids are handed
        # out sequentially while files are processed, so iterate in sorted order to
        # make the id assigned to each entry reproducible across runs and machines.
        for file_name in sorted(os.listdir(kb_folder_path)):
            if file_name.endswith(".json"):
                file_path = os.path.join(kb_folder_path, file_name)
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                knowledge_base_entries.extend(Convertorfunction(data))
        print(f"Processed {len(knowledge_base_entries)} knowledge base entries")
    else:
        print(f"Warning: Knowledge base folder not found at {kb_folder_path}")

    def flatten_reasoning(obj, prefix=""):
        """Flatten a nested dict into one string of `` -> ``-joined key paths.

        Each non-empty nested dict is descended into; every other value ends the
        path at its key (the value itself is not included). Paths are joined with
        `` | ``. Anything that is not a dict yields an empty string.
        """
        if not isinstance(obj, dict):
            return ""

        segments = []
        for key, value in obj.items():
            path = f"{prefix} -> {key}" if prefix else key
            is_branch = isinstance(value, dict) and bool(value)
            segments.append(flatten_reasoning(value, path) if is_branch else path)

        # Drop empty segments before joining
        return " | ".join(segment for segment in segments if segment)

    def process_patient_case(file_path, file_name, disease_group, specific_disease=None):
        """Load one patient-case JSON file and build its entry for the combined output.

        Top-level keys starting with "input" (case-insensitive) are treated as
        inputs; all other keys form the reasoning structure, which is flattened
        to a single string with ``flatten_reasoning``. Inputs are normalised to
        ``input1`` .. ``input6``: missing, falsy or blank values become "NA", and
        string values are stripped and passed through ``clean_text``.

        Args:
            file_path: Path of the JSON file to read.
            file_name: Base name of the file; without its trailing ".json" it
                becomes the entry id.
            disease_group: Disease group label stored in the entry.
            specific_disease: Optional specific disease label ("NA" when falsy).

        Returns:
            ``{"id": ..., "patient_case": {...}}``, or None when the file is not
            valid JSON (a message is printed in that case).
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError:
                print(f"Error decoding JSON: {file_path}")
                return None

        inputs = {}
        reasoning = {}
        for key, value in data.items():
            if key.lower().startswith("input"):
                inputs[key] = value
            else:
                reasoning[key] = value

        cleaned_inputs = {}
        for i in range(1, 7):
            key = f"input{i}"
            # First key matching the slot name, ignoring case
            val = next((v for k, v in inputs.items() if k.lower() == key), None)

            if not val or (isinstance(val, str) and val.strip() == ""):
                val = "NA"

            if isinstance(val, str):
                # Clean the ___ placeholders from input text
                cleaned_inputs[key] = clean_text(val.strip())
            else:
                cleaned_inputs[key] = val

        reasoning_string = flatten_reasoning(reasoning)

        # Use the filename without its trailing ".json" extension as the ID.
        # Only the suffix is removed: str.replace(".json", "") would also delete a
        # ".json" occurring in the middle of the name.
        extension = ".json"
        case_id = file_name[:-len(extension)] if file_name.endswith(extension) else file_name

        case_entry = {
            "id": case_id,
            "patient_case": {
                "disease_group": disease_group,
                "specific_disease": specific_disease if specific_disease else "NA",
                "reasoning": reasoning_string,
                "inputs": cleaned_inputs
            }
        }

        return case_entry

    patient_cases_entries = []

    if not os.path.exists(patient_cases_folder):
        print(f"Warning: Patient cases folder not found at {patient_cases_folder}")
    else:
        for root, dirs, files in os.walk(patient_cases_folder):
            for file in files:
                if not file.endswith(".json"):
                    continue

                file_path = os.path.join(root, file)

                # Folder layout relative to the scan root: <group>[/<disease>]
                rel_path = os.path.relpath(root, patient_cases_folder)
                path_parts = rel_path.split(os.sep)

                # Files sitting directly in the scan root have no disease group
                if len(path_parts) < 1 or path_parts[0] == ".":
                    continue

                disease_group = path_parts[0]
                specific_disease = path_parts[1] if len(path_parts) > 1 else None

                case_data = process_patient_case(file_path, file, disease_group, specific_disease)
                if case_data:
                    patient_cases_entries.append(case_data)
        print(f"Processed {len(patient_cases_entries)} patient cases")

    # Knowledge base entries form the upper part, patient cases the lower part
    combined_data = [*knowledge_base_entries, *patient_cases_entries]

    # Write the combined list to the output JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(combined_data, f, indent=2, ensure_ascii=False)

    # Report what was written
    summary_lines = [
        "\n=== Summary ===",
        f"Total Knowledge Base entries: {len(knowledge_base_entries)} (IDs: KB_0001 to KB_{kb_id_counter:04d})",
        f"Total Patient Cases: {len(patient_cases_entries)}",
        f"Combined entries: {len(combined_data)}",
        f"Saved to: {output_file}",
    ]
    for summary_line in summary_lines:
        print(summary_line)
else:
    print("Paths not initialized.")

Knowledge Base Path: c:\Users\Muhammad Abu Huraira\Documents\Assignments and Submissions\Semester 7\NLP\A04\NLP_Project04_RAG\mimic-iv-ext-direct-1.0.0\medicalKnowledgeBase\Diagnosis_flowchart
Patient Cases Path: c:\Users\Muhammad Abu Huraira\Documents\Assignments and Submissions\Semester 7\NLP\A04\NLP_Project04_RAG\mimic-iv-ext-direct-1.0.0\Finished
Processed 96 knowledge base entries
Processed 511 patient cases

=== Summary ===
Total Knowledge Base entries: 96 (IDs: KB_0001 to KB_0096)
Total Patient Cases: 511
Combined entries: 607
Saved to: c:\Users\Muhammad Abu Huraira\Documents\Assignments and Submissions\Semester 7\NLP\A04\NLP_Project04_RAG\mimic-iv-ext-direct-1.0.0\combined_rag_data.json
