In [1]:
# convert pdf to text file

import fitz  # PyMuPDF
import os

def pdf_to_text_file(pdf_path, output_txt_path):
    """Extract the text of every page of a PDF and write it to a text file.

    Args:
        pdf_path: Path of the PDF to read (passed to ``fitz.open``).
        output_txt_path: Path of the UTF-8 text file to create or overwrite.

    Returns:
        None. Prints a confirmation message once the text file is written.
    """
    doc = fitz.open(pdf_path)
    try:
        # Collect the page texts and join once instead of growing a string
        # with ``+=`` on every page.
        full_text = "".join(page.get_text() for page in doc)
    finally:
        # Release the document even when extraction fails part-way through.
        doc.close()

    # Save to a text file
    with open(output_txt_path, "w", encoding="utf-8") as f:
        f.write(full_text)

    print(f"Text saved to: {output_txt_path}")

# Example usage: convert one PDF and write the extracted text next to it.
pdf_path = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/061104_1.pdf"
output_txt_path = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/061104_1.txt"

pdf_to_text_file(pdf_path=pdf_path, output_txt_path=output_txt_path)

Text saved to: /Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/061104_1.txt


In [2]:
# text pre-processing

import re

def preprocess_pdf_text(text):
    """Clean raw text extracted from a PDF into paragraph-separated prose.

    The steps run in this order:

    1. Normalize ``\\r\\n`` / ``\\r`` line endings to ``\\n`` so every later
       step sees a single newline convention.
    2. Drop noise lines: lines made only of digits (page numbers), lines
       starting with ``Page``/``PAGE`` followed by a number, and lines with
       fewer than three non-blank characters. Blank lines are kept because
       they mark paragraph breaks.
    3. Re-join words hyphenated across a line break.
    4. Turn single line breaks into spaces and collapse runs of blank lines
       into exactly one paragraph break.
    5. Collapse runs of spaces/tabs and trim whitespace around line breaks.

    Args:
        text: Raw text as a single string.

    Returns:
        The cleaned text, with paragraphs separated by one empty line and no
        leading or trailing whitespace.
    """
    # Step 1: Normalize line endings first; the line-based steps below all
    # assume '\n' is the only line terminator.
    text = re.sub(r'\r\n?', '\n', text)

    # Step 2: Remove page numbers, headers and footers (heuristic)
    cleaned_lines = []
    for line in text.split('\n'):
        stripped = line.strip()
        if not stripped:
            # Keep blank lines: they are the paragraph breaks that steps 4
            # relies on. Whitespace-only lines are reduced to empty ones.
            cleaned_lines.append('')
            continue
        if len(stripped) < 3:  # Skip short lines (e.g., single letters or digits)
            continue
        if re.fullmatch(r'\d+', stripped):  # Page number on a line by itself
            continue
        if re.match(r'^(Page|PAGE)\s*\d+', stripped):  # Page indicators
            continue
        cleaned_lines.append(line)
    text = '\n'.join(cleaned_lines)

    # Step 3: Fix hyphenated line breaks (e.g., "inter-\nview" -> "interview")
    text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)

    # Step 4: Merge lines that are artificially split, then normalize
    # paragraph breaks to exactly one empty line.
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)  # single line breaks -> spaces
    text = re.sub(r'\n{2,}', '\n\n', text)

    # Step 5: Normalize whitespace
    text = re.sub(r'[ \t]+', ' ', text)  # collapse tabs and spaces
    text = re.sub(r' +\n', '\n', text)   # trim trailing spaces on lines
    text = re.sub(r'\n +', '\n', text)   # trim leading spaces on lines
    return text.strip()


In [3]:
# Load the raw text that was extracted from the PDF.
with open("/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/061104_1.txt", mode="r", encoding="utf-8") as raw_file:
    raw_text = raw_file.read()

# Clean it and keep the result in memory for the chunking step.
clean_text = preprocess_pdf_text(text=raw_text)

# Persist the cleaned text alongside the raw file.
with open("/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/cleaned_061104_1.txt", mode="w", encoding="utf-8") as cleaned_file:
    cleaned_file.write(clean_text)


In [4]:
import tiktoken

def chunk_text(text, chunk_size=200, overlap=50):
    """Split ``text`` into overlapping windows of tokens and decode each window.

    Tokenization uses the tiktoken encoding registered for
    ``text-embedding-3-large``. Consecutive windows start
    ``chunk_size - overlap`` tokens apart.

    Args:
        text: The string to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must be
            smaller than ``chunk_size`` so the window always advances.

    Returns:
        A list of decoded chunk strings (empty when ``text`` has no tokens).

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    # Validate before doing any work: a non-positive step would otherwise
    # surface as an opaque range() error or as a silently empty result.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    encoding = tiktoken.encoding_for_model("text-embedding-3-large")
    tokens = encoding.encode(text)
    step = chunk_size - overlap

    chunks = []
    for start in range(0, len(tokens), step):
        end = start + chunk_size
        chunks.append(encoding.decode(tokens[start:end]))
        if end >= len(tokens):
            # This window already reaches the last token; any further window
            # would only repeat a suffix of it.
            break
    return chunks

# Run the chunking on the cleaned document text, using chunk_text's default
# chunk_size and overlap. `chunks` is read by the embedding and
# classification cells further down.
chunks = chunk_text(clean_text)
print(f"Total Chunks: {len(chunks)}")


Total Chunks: 147


In [None]:
# Maps each AI-use-case category label to a short natural-language
# description. The descriptions (not the labels) are what gets embedded when
# the category embeddings are built from `categories.items()`.
categories = {
    "Clinical Decision Support": (
        "AI used by healthcare providers for their decision making. Topics include: Diagnostic Support, Treatment Recommendations, and Personalized Care Plans."
    ),
    "Clinical Documentation AI": (
        "AI used in the automation of documentation tasks. Topics include: Summary Reports, Clinical Notes, Auto-documentation, and Structured Data Extraction from Free Text."
    ),
    "Medical Imaging AI": (
        "AI used in the evaluation of medical imaging outputs. Topics include: Radiology, Pathology, and Ophthalmology."
    ),
    "Predictive Analytics": (
        "AI used in the prediction of medical outcomes. Topics include: Risk Stratification, Early Warning Systems, Readmission Prediction, Mortality Prediction, and Treatment Effectiveness Prediction."
    ),
    "Operational and Administrative Automation": (
        "AI used in administrative tasks. Topics include: Scheduling Optimization, Resource Allocation, Revenue Cycle Management, Supply Chain, and Workflow Optimization."
    ),
    "Patient-facing AI": (
        "AI used directly by patients for monitoring or education. Topics include: Chatbots/Virtual Assistants, Symptom Checkers, Patient Education Tools, and Wearable/Remote Monitoring Assistants."
    ),
    "Robotics and Surgical AI": (
        "AI used in robotic-assisted procedures. Topics include: Robotic-assisted surgeries, Precision tools, and Rehabilitation robotics."
    ),
    "Education and Training AI": (
        "AI used for medical education and staff training. Topics include: AI-based simulations, Virtual patients, and Curriculum personalization tools."
    ),
    "Research and Clinical Trial AI": (
        "AI used in medical research. Topics include: Patient Recruitment, Trial Design, Data Analysis, and Drug Discovery."
    ),
    "Public Health AI": (
        "AI used to monitor and analyze population health. Topics include: Epidemiological Modeling, Public Health Surveillance, and Health Equity Analysis."
    )
}


In [6]:
from openai import OpenAI, AzureOpenAI
import json
import pandas as pd
import regex as re
from dotenv import load_dotenv
import os

load_dotenv()  # Load from .env file

# Endpoint and key for the Azure embeddings deployment are read from the
# environment. os.getenv returns None when a variable is unset, so a missing
# .env entry is not detected on these lines.
AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT_embeddings")
AZURE_API_KEY = os.getenv("AZURE_API_KEY_embeddings")
AZURE_API_VERSION = "2025-04-01-preview"

# Client used for every embedding request; get_embedding looks it up through
# the module-level name `openai`.
# NOTE(review): the name `openai` reads like the package rather than a client
# instance - renaming would need get_embedding updated in the same change.
openai = AzureOpenAI(
    azure_endpoint=AZURE_ENDPOINT,
    api_key=AZURE_API_KEY,
    api_version=AZURE_API_VERSION
)

In [8]:
def get_embedding(text: str, model="embedding3large"):
    """Request an embedding for ``text`` and return its vector.

    Args:
        text: The string to embed.
        model: Model/deployment name forwarded to the embeddings endpoint.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    result = openai.embeddings.create(input=text, model=model)
    first_item = result.data[0]
    return first_item.embedding


# Create embeddings for each category: one get_embedding call per entry,
# embedding the description text and keying the result by the category label.
category_embeddings = {label: get_embedding(desc) for label, desc in categories.items()}


In [9]:
# First pass: report the vector length stored for every category.
for category in category_embeddings:
    print(f"{category}: {len(category_embeddings[category])} dimensions")

# Second pass: show the leading components of each category vector.
for category, embedding in category_embeddings.items():
    print(f"\n{category}:")
    print(embedding[:10])  # print only the first 10 numbers


Clinical Decision Support: 3072 dimensions
Clinical Documentation AI: 3072 dimensions
Medical Imaging AI: 3072 dimensions
Predictive Analytics: 3072 dimensions
Operational and Administrative Automation: 3072 dimensions
Patient-facing AI: 3072 dimensions
Robotics and Surgical AI: 3072 dimensions
Education and Training AI: 3072 dimensions
Research and Clinical Trial AI: 3072 dimensions
Public Health AI: 3072 dimensions

Clinical Decision Support:
[-0.009869870729744434, 0.01331905648112297, -0.021285202354192734, -0.03554558381438255, 0.013571950607001781, 0.007853747345507145, 0.015749644488096237, -0.004632868338376284, 0.02700340375304222, 0.00822606310248375]

Clinical Documentation AI:
[-0.0009404133888892829, -0.01200233306735754, -0.02751505747437477, -0.02618955634534359, 0.03326860070228577, 0.022650033235549927, -0.0018362185219302773, 0.003503108164295554, -0.002281390130519867, 0.021091477945446968]

Medical Imaging AI:
[-0.018428850919008255, 0.021242313086986542, -0.0284038

In [None]:
#combined text embedding with cosine similarity comparison
import numpy as np
from collections import defaultdict

def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector (any sequence accepted by ``np.asarray``).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. When either vector has zero norm the
        similarity is undefined; ``0.0`` is returned instead of ``nan`` so
        that threshold comparisons downstream stay well-behaved.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        # Avoid a 0/0 division (nan plus a RuntimeWarning) for zero vectors.
        return 0.0
    return np.dot(a, b) / denom

def classify_chunks(chunks, category_embeddings, threshold=0.75):
    """Embed each chunk and record the categories it is similar enough to.

    Args:
        chunks: Iterable of chunk strings; each one is embedded with
            ``get_embedding``.
        category_embeddings: Mapping of category label to embedding vector.
        threshold: Minimum cosine similarity for a chunk to count as a match.

    Returns:
        A ``defaultdict(list)`` keyed by category label. Each value is a list
        of ``(chunk_index, similarity, first_300_chars_of_chunk)`` tuples.
        Labels without any match do not appear as keys.
    """
    matches_by_label = defaultdict(list)

    for position, piece in enumerate(chunks):
        vector = get_embedding(piece)
        for name, reference in category_embeddings.items():
            score = cosine_similarity(vector, reference)
            if not score >= threshold:
                continue
            matches_by_label[name].append((position, score, piece[:300]))

    return matches_by_label

# Classify every chunk against every category. threshold=0.5 overrides the
# function default of 0.75. Each chunk is embedded inside classify_chunks.
results = classify_chunks(chunks, category_embeddings, threshold=0.5)

print("Classification complete.")
# len(results) counts only categories that received at least one match,
# because keys are created when the first match is appended.
print(f"Number of categories with matches: {len(results)}")



Classification complete.
Number of categories with matches: 10


In [None]:
# Print every match, grouped under a heading for its category.
for category in results:
    matches = results[category]
    print(f"\n=== {category} ({len(matches)} matching chunks) ===")
    for match in matches:
        idx, sim, preview = match
        print(f"[Chunk {idx}] (similarity={sim:.2f}): {preview}...\n")


In [None]:
# alternative way to visualize the cosine similarity by chunk and category

import pandas as pd

# Flatten the per-category matches into one record per (category, chunk) pair.
rows = [
    {
        "Category": category,
        "Chunk Index": idx,
        "Similarity": round(sim, 4),
        "Chunk Preview": preview,
    }
    for category, matches in results.items()
    for idx, sim, preview in matches
]

df = pd.DataFrame(rows)
print(df.head())  # or display full table


In [10]:
# Step 1: Generate embeddings for each chunk.
# Every entry keeps the chunk index, its text, and its embedding vector.
chunk_embeddings = [
    (idx, chunk, get_embedding(chunk))
    for idx, chunk in enumerate(chunks)
]

print(f"Created embeddings for {len(chunk_embeddings)} chunks.")


Created embeddings for 147 chunks.


In [24]:
import numpy as np
from collections import defaultdict

def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector (any sequence accepted by ``np.asarray``).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. When either vector has zero norm the
        similarity is undefined; ``0.0`` is returned instead of ``nan`` so
        that threshold comparisons downstream stay well-behaved.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        # Avoid a 0/0 division (nan plus a RuntimeWarning) for zero vectors.
        return 0.0
    return np.dot(a, b) / denom

def classify_from_embeddings(chunk_embeddings, category_embeddings, threshold=0.75):
    """Match precomputed chunk embeddings against the category embeddings.

    Args:
        chunk_embeddings: Iterable of ``(index, text, embedding)`` tuples.
        category_embeddings: Mapping of category label to embedding vector.
        threshold: Minimum cosine similarity for a chunk to count as a match.

    Returns:
        A ``defaultdict(list)`` keyed by category label. Each value is a list
        of ``(index, similarity, first_300_chars_of_text)`` tuples. Labels
        without any match do not appear as keys.
    """
    hits = defaultdict(list)

    for position, body, vector in chunk_embeddings:
        for name, reference in category_embeddings.items():
            score = cosine_similarity(vector, reference)
            if not score >= threshold:
                continue
            hits[name].append((position, score, body[:300]))  # preview

    return hits

# Run classification on the precomputed chunk embeddings, so no embedding
# requests are made here. threshold=0.60 overrides the function default of
# 0.75; this rebinds `results`, which the display cells below read.
results = classify_from_embeddings(chunk_embeddings, category_embeddings, threshold=0.60)


In [25]:
# alternative way to visualize the cosine similarity by chunk and category

import pandas as pd

# One table row per (category, matching chunk), with the similarity rounded
# to four decimals.
rows = [
    {
        "Category": category,
        "Chunk Index": idx,
        "Similarity": round(sim, 4),
        "Chunk Preview": preview,
    }
    for category, matches in results.items()
    for idx, sim, preview in matches
]

df = pd.DataFrame(rows)
print(df.head())  # or display full table

               Category  Chunk Index  Similarity  \
0    Medical Imaging AI            2      0.6020   
1    Medical Imaging AI            5      0.6668   
2  Predictive Analytics            5      0.6003   

                                       Chunk Preview  
0   in developing medical imaging-based machine l...  
1  AI/ML) in medical imaging provide important me...  
2  AI/ML) in medical imaging provide important me...  


In [26]:
# Print every match under its category heading, with a 200-character preview.
for label in results:
    matches = results[label]
    print(f"\n=== {label} ({len(matches)} matching chunks) ===")
    for match in matches:
        idx, sim, preview = match
        print(f"[Chunk {idx}] (sim={sim:.2f}): {preview[:200]}...\n")



=== Medical Imaging AI (2 matching chunks) ===
[Chunk 2] (sim=0.60):  in developing medical imaging-based machine learning methods, also known as medical imaging artificial intelligence (AI), for the detection, diagnosis, prognosis, and risk assessment of disease with ...

[Chunk 5] (sim=0.67): AI/ML) in medical imaging provide important methods for leveraging large amounts of data to build models to detect disease and provide diagnosis, prognosis, and risk assessment tools to support decisi...


=== Predictive Analytics (1 matching chunks) ===
[Chunk 5] (sim=0.60): AI/ML) in medical imaging provide important methods for leveraging large amounts of data to build models to detect disease and provide diagnosis, prognosis, and risk assessment tools to support decisi...

