In [1]:
# convert pdf to text file

import fitz  # PyMuPDF
import os

def pdf_to_text_file(pdf_path, output_txt_path):
    # Open the PDF file
    doc = fitz.open(pdf_path)
    
    # Extract all text
    full_text = ""
    for page in doc:
        full_text += page.get_text()
    
    # Save to a text file
    with open(output_txt_path, "w", encoding="utf-8") as f:
        f.write(full_text)
    
    doc.close()
    print(f"Text saved to: {output_txt_path}")

# Example usage
#doc 1
pdf_path = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/jsr-17-task-002_aiforhealthandhealthcare12122017.pdf"
output_txt_path = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/aiforhealthandhealthcare.txt"

#doc 2
#pdf_path = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/061104_1.pdf"
#output_txt_path = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/061104_1.txt"

#doc 3
#pdf_path = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/SIMM-5305-F-GenAI-Risk-Assessment-2025_0131-final.pdf"
#output_txt_path = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/SIMM-5305.txt"

#doc 4
#pdf_path = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/Ethical_Considerations.pdf"
#output_txt_path = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/Ethical_Considerations.txt"

pdf_to_text_file(pdf_path, output_txt_path)

Text saved to: /Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/aiforhealthandhealthcare.txt


In [2]:
# text file pre-processing function

import re

def preprocess_pdf_text(text):
    # Step 1: Remove page numbers (assuming they appear on a line by themselves)
    text = re.sub(r'\n\d+\n', '\n', text)

    # Step 2: Remove common headers and footers (heuristic)
    lines = text.split('\n')
    cleaned_lines = []
    for line in lines:
        if len(line.strip()) < 3:  # Skip short lines (e.g., single letters or digits)
            continue
        if re.match(r'^(Page|PAGE)\s*\d+', line.strip()):  # Page indicators
            continue
        cleaned_lines.append(line)
    text = '\n'.join(cleaned_lines)

    # Step 3: Fix hyphenated line breaks (e.g., "inter-\nview" → "interview")
    text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)

    # Step 4: Merge lines that are artificially split (end of line not a sentence end)
    # First, normalize line endings
    text = re.sub(r'\r\n?', '\n', text)

    # Then merge lines that are not paragraph breaks
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)  # convert single line breaks to spaces

    # Step 5: Normalize multiple newlines (paragraph breaks)
    text = re.sub(r'\n{2,}', '\n\n', text)

    # Step 6: Normalize whitespace
    text = re.sub(r'[ \t]+', ' ', text)  # collapse tabs and spaces
    text = re.sub(r' +\n', '\n', text)   # trim trailing spaces on lines
    text = text.strip()

    return text


In [3]:
# run text file pre-processing

#for doc 1
with open("/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/aiforhealthandhealthcare.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

clean_text = preprocess_pdf_text(raw_text)

with open("/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/cleaned_aiforhealthandhealthcare.txt", "w", encoding="utf-8") as f:
    f.write(clean_text)


#for doc 2
#with open("/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/061104_1.txt", "r", encoding="utf-8") as f:
#    raw_text = f.read()

#clean_text = preprocess_pdf_text(raw_text)

#with open("/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/cleaned_061104_1.txt", "w", encoding="utf-8") as f:
#    f.write(clean_text)

#for doc 3
#with open("/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/SIMM-5305.txt", "r", encoding="utf-8") as f:
#    raw_text = f.read()

#clean_text = preprocess_pdf_text(raw_text)

#with open("/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/cleaned_SIMM-5305.txt", "w", encoding="utf-8") as f:
#    f.write(clean_text)

#for doc 4
#with open("/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/Ethical_Considerations.txt", "r", encoding="utf-8") as f:
#    raw_text = f.read()

#clean_text = preprocess_pdf_text(raw_text)

#with open("/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/cleaned_Ethical_Considerations.txt", "w", encoding="utf-8") as f:
#    f.write(clean_text)


In [4]:
# chunk the document/text file

import tiktoken

def chunk_text(text, chunk_size=100, overlap=15):
    encoding = tiktoken.encoding_for_model("text-embedding-3-large")
    tokens = encoding.encode(text)
    chunks = []
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk = tokens[i:i + chunk_size]
        decoded = encoding.decode(chunk)
        chunks.append(decoded)
    return chunks

# Run the chunking
chunks = chunk_text(clean_text)
print(f"Total Chunks: {len(chunks)}")


Total Chunks: 394


In [5]:
# category definitions

categories = {
    "Clinical Decision Support": (
        "Artificial intelligence systems that assist healthcare providers in clinical decision-making by analyzing patient data to generate diagnostic suggestions, treatment recommendations, and personalized care plans. These tools may use rules-based logic, machine learning, or probabilistic reasoning to help clinicians make timely and evidence-based decisions, especially in complex or high-risk scenarios."
    ),

    "Clinical Documentation AI": (
        "AI applications that streamline or automate the creation, management, or structuring of clinical documentation. This includes generating summary reports, transcribing and organizing clinical notes, auto-completing documentation during consultations, and extracting structured data from free-text clinical narratives. These systems reduce administrative burden, improve documentation quality, and integrate with electronic health records (EHRs)."
    ),

    "Medical Imaging AI": (
        "AI tools used to interpret, analyze, or enhance medical imaging data such as radiology scans, pathology slides, and ophthalmology images. These systems may perform tasks like detecting abnormalities (e.g., tumors, fractures), segmenting anatomical structures, quantifying lesions, or prioritizing imaging workflows. Techniques include convolutional neural networks (CNNs), image classification, and computer vision-based diagnostics."
    ),

    "Predictive Analytics": (
        "AI models that analyze historical and real-time clinical data to forecast future patient outcomes. These models are used for risk stratification, early warning systems, prediction of readmission or mortality, and estimating treatment effectiveness. Common data inputs include vitals, labs, medications, demographics, and medical history. Techniques may include regression, time-series analysis, and deep learning."
    ),

    "Operational and Administrative Automation": (
        "AI systems designed to optimize hospital operations and administrative workflows. This includes automating scheduling, resource allocation, billing, claims management, supply chain logistics, and staff workflow optimization. These systems improve efficiency, reduce human error, and enhance the management of hospital throughput and resources using algorithms and machine learning."
    ),

    "Patient-facing AI": (
        "AI applications that interact directly with patients to support health monitoring, self-care, education, or triage. These include chatbots and virtual assistants, symptom checkers, wearable device integrations, and remote monitoring platforms. They provide personalized guidance, collect patient-reported outcomes, and may flag concerning symptoms for provider review."
    ),

    "Robotics and Surgical AI": (
        "AI systems integrated into robotic platforms used in surgical or interventional procedures. These include robotic-assisted surgery systems for enhanced precision, AI-guided tools for minimally invasive techniques, and rehabilitation robotics used post-operatively. These systems combine real-time sensing, motion control, and decision support to improve surgical outcomes and reduce human error."
    ),

    "Education and Training AI": (
        "AI tools used in healthcare education and workforce development. This includes virtual patient simulations, intelligent tutoring systems, curriculum personalization tools, and performance analytics for medical trainees. These systems aim to enhance clinical reasoning skills, procedural knowledge, and continuing education using adaptive learning and natural language processing."
    ),

    "Research and Clinical Trial AI": (
        "AI platforms that support biomedical research and clinical trial operations. These tools assist with patient cohort identification, trial design optimization, natural language processing of scientific literature, data analysis, and drug discovery. They help accelerate evidence generation, hypothesis testing, and treatment development in both pre-clinical and clinical phases."
    ),

    "Public Health AI": (
        "AI systems used to analyze population-level health data to support public health policy and intervention. Applications include epidemiological modeling, outbreak detection, surveillance of disease trends, analysis of social determinants of health, and health equity analysis. These tools support proactive, data-driven public health strategies and real-time monitoring."
    )
}



In [6]:
# category examples

category_examples = {
    "Clinical Decision Support": [
        "Once validated, its use can be envisioned in a wide range of scenarios, including decision support in existing practice.",
        "Transparently disclose the use of GenAI and its role in decision making.",
        "AI suggests treatment plans for a patient based on medical history.",
        "Computerized clinical decision support systems (CCDSSs) provide patient-specific assessments or recommendations to clinicians to aid clinical decision making.",
        "Some systems suggest diagnostic tests, while others suggest treatments or preventative measures, but these should be carefully evaluated by the healthcare professional before usage."
    ],

    "Clinical Documentation AI": [
        "Electronic health record systems should integrate artificial intelligence to increase efficiency in simple documentation, allowing healthcare professionals to spend more time attending to patient needs.",
        "Generative AI usage for patient visit summaries should be approached with caution, due to risks of inaccurate information communication.",
        "GenAI-enabled clinical documentation tools assist in creating encounter summaries, generating draft notes, and extracting structured data from conversations."
    ],

    "Medical Imaging AI": [
        "Recently a transformational advance in automated retinal image analysis, using Deep Learning algorithms, has been demonstrated.",
        "Some CCDSSs include radiographic image interpretation tools that help clinicians identify abnormalities.",
        "Experts recommend using AI-assisted image analysis systems as a first consultation, deferring back to the radiologist for confirmation.",
        "Although traditional ML-based approaches dominate imaging, emerging GenAI models show promise in tasks such as image synthesis and labeling."
    ],

    "Predictive Analytics": [
        "A model trained to predict the likelihood of death from pneumonia assigned lower risk to patients with asthma, but only because such patients were treated as higher priority by the hospital.",
        "CCDSSs may calculate risk scores for conditions such as cardiovascular disease or osteoporosis, helping identify high-risk patients.",
        "Risk prediction tools integrate patient data to estimate the probability of future adverse health events."
    ],

    "Operational and Administrative Automation": [
        "AI software has been integrated into the organizational infrastructure of our hospital operations.",
        "Using AI, hospital workers' shift scheduling can be automated to reduce secretarial burdens.",
        "GenAI applications in administrative tasks include automating prior authorizations and summarizing payer policies."
    ],

    "Patient-facing AI": [
        "Kardia Mobile uses a finger pad and a smartphone app to record an EKG. The platform claims to use AI-enabled detection of atrial fibrillation.",
        "CloudUPDRS, an AI algorithm differentiates between actual tremors and bad data, enabling Parkinson's patients to perform in-home testing.",
        "There is a proliferation of companies developing apps that offer online doctors' appointments. Babylon claims to use an AI algorithm to automatically triage patients virtually.",
        "Chatbots powered by generative AI can respond to patient queries, schedule appointments, and assist in medication adherence.",
        "Concerns regarding the lack of verification in patient-facing AI tools suggest mitigation of such software in high-stakes situations."
    ],

    "Robotics and Surgical AI": [
        "AI-integrated surgical machines can help increase the precision in difficult medical procedures.",
        "In post-op physical therapy sessions, machines integrated with a patient's progress and data can help provide the appropriate training.",
        "If the AI pacemaker fails, the responsibility to which this falls upon remains ambiguous."
    ],

    "Education and Training AI": [
        "All work completed by medical students should be conducted without the usage of artifical intelligence.",
        "GenAI has potential in medical training, for example, simulating patient interactions or generating test questions tailored to learning goals.",
        "Hospitals should integrate AI into staff trainings because it provides a customized experience."
    ],

    "Research and Clinical Trial AI": [
        "We need your help to take personalized medicine to its full potential, and develop a Machine Learning algorithm that automatically classifies genetic variations.",
        "It may support clinical trial design by generating hypotheses, summarizing relevant literature, or analyzing eligibility criteria from EHR data.",
        "Using artificial intelligence in research raises concerns about fake data, which can have lasting implications on medical practices if not verified."
    ],

    "Public Health AI": [
        "Support the development of wearable devices for the sensing of environmental toxins and broad-based pathogen sensing for rural and urban environments. The collected data can inform policies for mitigating risks of pandemics and epidemics.",
        "Providing access to data captured by mHealth apps and devices could enhance the research community's ability to build more insights into public health through AI.",
        "GenAI could aid public health monitoring by extracting trends from social media, news sources, or reports, enhancing situational awareness."
    ]
}


In [7]:
from openai import OpenAI, AzureOpenAI
import json
import pandas as pd
import regex as re
from dotenv import load_dotenv
import os

load_dotenv()  # Load from .env file

AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT_embeddings")
AZURE_API_KEY = os.getenv("AZURE_API_KEY_embeddings")
AZURE_API_VERSION = "2025-04-01-preview"

openai = AzureOpenAI(
    azure_endpoint=AZURE_ENDPOINT,
    api_key=AZURE_API_KEY,
    api_version=AZURE_API_VERSION
)

In [8]:
# embedding function

def get_embedding(text: str, model="embedding3large"):
    response = openai.embeddings.create(
        model=model,
        input=text
    )
    return response.data[0].embedding


In [9]:
# create category def+ex embeddings (so output is just one embedding per category)
# just do one time

category_all_embeddings = {}

for label in categories:
    combined_text = categories[label] + " " + " ".join(category_examples.get(label, []))
    category_all_embeddings[label] = get_embedding(combined_text)

print(category_all_embeddings)

{'Clinical Decision Support': [-0.0021021636202931404, 0.01540796272456646, -0.023064512759447098, -0.008368000388145447, 0.030355175957083702, 0.00024858381948433816, 0.03279443085193634, -0.009865432046353817, 0.02244114875793457, 0.011423845775425434, -0.001696468098089099, -0.03748322278261185, -0.021167315542697906, 0.027943024411797523, 0.010183890350162983, -0.00633867597207427, -0.029542090371251106, 0.01735936664044857, 0.017725255340337753, -0.046427156776189804, -0.020015444606542587, 0.0117423040792346, 0.03084302693605423, 0.017210301011800766, 0.0348813496530056, 0.00358773791231215, 0.009208188392221928, -0.012236930429935455, -0.05369071662425995, 0.029731811955571175, 0.011268003843724728, -0.0033404245041310787, -0.0007626907317899168, -0.08027859777212143, -0.014405157417058945, -0.0027272228617221117, -0.017318712547421455, 0.006623255554586649, 0.01520469132810831, 0.026316853240132332, 0.03298415243625641, -0.023742083460092545, 0.0056509412825107574, -0.008462860

In [10]:
# create embeddings for each document chunk
chunk_embeddings = []

for idx, chunk in enumerate(chunks):
    embedding = get_embedding(chunk)
    chunk_embeddings.append((idx, chunk, embedding))  # store index, text, and vector

print(f"Created embeddings for {len(chunk_embeddings)} chunks.")

Created embeddings for 394 chunks.


In [11]:
# cosine similarity and comparison functions

import numpy as np
from collections import defaultdict

def cosine_similarity(a, b):
    a = np.array(a)
    b = np.array(b)
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def classify_with_combined_embeddings(chunk_embeddings, category_all_embeddings, threshold):
    classification = defaultdict(list)

    for idx, text, chunk_embedding in chunk_embeddings:
        for category, cat_embedding in category_all_embeddings.items():
            sim = cosine_similarity(chunk_embedding, cat_embedding)
            if sim >= threshold:
                classification[category].append((idx, sim, text))
    return classification


In [12]:
results = classify_with_combined_embeddings(chunk_embeddings, category_all_embeddings, threshold=0.6)

In [13]:
# visualize the cosine similarity by chunk and category

import pandas as pd

rows = []
for category, matches in results.items():
    for idx, sim, preview in matches:
        rows.append({
            "Category": category,
            "Chunk Index": idx,
            "Similarity": round(sim, 4),
            "Chunk Preview": preview
        })

df = pd.DataFrame(rows)
print(df.head())  # or display full table

                    Category  Chunk Index  Similarity  \
0  Clinical Decision Support            7      0.6493   
1         Medical Imaging AI            7      0.6066   
2         Medical Imaging AI           33      0.6588   
3         Medical Imaging AI           34      0.6437   
4         Medical Imaging AI           54      0.6258   

                                       Chunk Preview  
0   on how computer-based decision procedures, un...  
1   on how computer-based decision procedures, un...  
2   nearly every domain of science and engineerin...  
3   demonstrated that AI can perform clinical dia...  
4   expectations, which is that AI algorithms, in...  


In [14]:
# visualize the cosine similarity by chunk and category (show full text)
for label, matches in results.items():
    print(f"\n=== {label} ({len(matches)} matching chunks) ===")
    for idx, sim, full_text in matches:
        print(f"[Chunk {idx}] (similarity={sim:.2f}):\n{full_text}\n")


=== Clinical Decision Support (1 matching chunks) ===
[Chunk 7] (similarity=0.65):
 on how computer-based decision procedures, under the broad umbrella of artificial intelligence (AI), can assist in improving health and health care. Although advanced statistics and machine learning provide the foundation for AI, there are currently revolutionary advances underway in the sub-field of neural networks. This has created tremendous excitement in many fields of science, including in medicine and public health. First demonstrations have already emerged showing that deep neural networks can perform as well as the best human clinicians in well-defined diagnostic tasks. In addition, AI-based


=== Medical Imaging AI (6 matching chunks) ===
[Chunk 7] (similarity=0.61):
 on how computer-based decision procedures, under the broad umbrella of artificial intelligence (AI), can assist in improving health and health care. Although advanced statistics and machine learning provide the foundation for AI,

In [15]:
def compute_document_level_similarity(results, category_all_embeddings, threshold=0.6):
    document_scores = {}

    for category, matches in results.items():
        if matches:
            # Average cosine similarity of all matched chunks in this category
            sims = [sim for _, sim, _ in matches]
            avg_sim = sum(sims) / len(sims)
            document_scores[category] = avg_sim
        else:
            document_scores[category] = 0.0  # No matches → 0 similarity

    # Filter categories above threshold
    classified_categories = {
        category: score
        for category, score in document_scores.items()
        if score >= threshold
    }

    return document_scores, classified_categories


In [16]:
document_scores, classified_categories = compute_document_level_similarity(results, category_all_embeddings, threshold=0.6)

# Print raw scores
for category, score in document_scores.items():
    print(f"{category}: average similarity = {score:.3f}")

# Final classification
print("\nDocument classified under:")
for category in classified_categories:
    print(f"✓ {category}")


Clinical Decision Support: average similarity = 0.649
Medical Imaging AI: average similarity = 0.642
Operational and Administrative Automation: average similarity = 0.601
Education and Training AI: average similarity = 0.612
Research and Clinical Trial AI: average similarity = 0.618
Public Health AI: average similarity = 0.633
Patient-facing AI: average similarity = 0.656

Document classified under:
✓ Clinical Decision Support
✓ Medical Imaging AI
✓ Operational and Administrative Automation
✓ Education and Training AI
✓ Research and Clinical Trial AI
✓ Public Health AI
✓ Patient-facing AI
