First pass

In [1]:
import pandas as pd
import requests
import os
import re
from PyPDF2 import PdfReader

# === CONFIGURATION ===
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
CSV_PATH = "/Users/winnie/Documents/GitHub/MedAI/classify_by_state/sampled_20_rows.csv"  # Path to your CSV
OUTPUT_PATH = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/pdf_ai_categories.csv"  # Output file

# Define category keywords
# Maps each category label to the regex patterns that indicate it. The
# extracted PDF text is lowercased before matching, so every pattern must be
# written in lowercase: the former "AI agent" / "autonomous AI" /
# "AI companion" spellings could never match.
category_keywords = {
    "Predictive AI": [r"\bpredictive\b", r"\brisk prediction\b", r"\bforecasting\b", r"\bearly warning\b"],
    "Generative AI": [r"\bgenerative\b", r"\bchatbot\b", r"\btext generation\b", r"\bsynthetic data\b"],
    "Prescriptive AI": [r"\bprescriptive\b", r"\btreatment recommendation\b", r"\bdecision support\b"],
    "Descriptive AI": [r"\bdescriptive\b", r"\bpattern recognition\b", r"\bclustering\b", r"\bphenotype discovery\b"],
    "AI Agents": [r"\bai agent\b", r"\bautonomous ai\b", r"\bvirtual assistant\b", r"\bai companion\b"]
}

# === FUNCTION TO DOWNLOAD AND EXTRACT PDF TEXT ===
def extract_text_from_pdf_url(url, timeout=30):
    """Download a PDF and return the text of all its pages, lowercased.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds to wait on the server before giving up, so one
            unresponsive host cannot stall the whole run.

    Returns:
        The concatenated page text in lowercase, or "" when the download or
        the PDF parsing fails (the error is printed, not raised).
    """
    import io  # local import: the notebook's import cell does not provide io

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Parse straight from memory. The earlier temp.pdf round-trip left the
        # file behind whenever parsing raised, because os.remove was skipped.
        reader = PdfReader(io.BytesIO(response.content))
        # extract_text() may yield nothing for a page; treat that as empty.
        text = "".join(page.extract_text() or "" for page in reader.pages)
        return text.lower()
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return ""

# === PROCESSING SCRIPT ===
def classify_pdfs(csv_path):
    """Classify every PDF listed in a CSV by keyword and save the result.

    The PDF URL is read from the second column of each row. The text of each
    PDF is fetched with ``extract_text_from_pdf_url`` and tested against the
    patterns in ``category_keywords``; a category is assigned when any of its
    patterns is found. Results are written to ``OUTPUT_PATH`` with the columns
    ``pdf_url`` and ``categories`` (comma-separated labels, or "None").

    Args:
        csv_path: Path to the input CSV.
    """
    df = pd.read_csv(csv_path)
    results = []

    for _, row in df.iterrows():
        # Explicit positional access to the second column. Plain `row[1]` on a
        # label-indexed row depends on a deprecated integer-key fallback.
        url = row.iloc[1]
        print(f"Processing: {url}")
        text = extract_text_from_pdf_url(url)

        # The text arrives lowercased while some patterns contain uppercase
        # letters, so match case-insensitively.
        matched_categories = [
            category
            for category, patterns in category_keywords.items()
            if any(re.search(p, text, re.IGNORECASE) for p in patterns)
        ]

        results.append({"pdf_url": url, "categories": ", ".join(matched_categories) if matched_categories else "None"})

    result_df = pd.DataFrame(results)
    result_df.to_csv(OUTPUT_PATH, index=False)
    print(f"\nDone. Results saved to: {OUTPUT_PATH}")

# === RUN ===
# Entry point: classify every PDF listed in CSV_PATH.
if __name__ == "__main__":
    classify_pdfs(CSV_PATH)


Processing: https://www.cms.gov/Regulations-and-Guidance/Guidance/Manuals/downloads/som107ap_a_hospitals.pdf
Processing: https://nam.edu/wp-content/uploads/2021/07/4.3-AI-in-Health-Care-title-authors-summary.pdf
Processing: https://www.chfs.ky.gov/agencies/dph/oc/Documents/FY24-Clinical-Service-Guide.pdf
Error processing https://www.chfs.ky.gov/agencies/dph/oc/Documents/FY24-Clinical-Service-Guide.pdf: 403 Client Error: Forbidden for url: https://www.chfs.ky.gov/agencies/dph/oc/Documents/FY24-Clinical-Service-Guide.pdf
Processing: https://dcr.hawaii.gov/wp-content/uploads/2024/08/2024-Community-Resource-Guide-Part-1.pdf
Processing: https://www.maine.gov/dhhs/mecdc/environmental-health/plumb/documents/rules/CMR%20241%2007-1995.pdf
Error processing https://www.maine.gov/dhhs/mecdc/environmental-health/plumb/documents/rules/CMR%20241%2007-1995.pdf: 404 Client Error: Not Found for url: https://www.maine.gov/dhhs/mecdc/environmental-health/plumb/documents/rules/CMR%20241%2007-1995.pdf
Proce

Second pass

In [None]:
import pandas as pd
import requests
import os
import re
from PyPDF2 import PdfReader

# === CONFIGURATION ===
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
CSV_PATH = "/Users/winnie/Documents/GitHub/MedAI/classify_by_state/sampled_20_rows.csv"  # Path to your CSV
OUTPUT_PATH = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/pdf_ai_categories2.csv"  # Classification results CSV
TEXT_DIR = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/pdf_texts"  # Extracted doc_N.txt files

# === Ensure output directory exists ===
os.makedirs(TEXT_DIR, exist_ok=True)

# === Category Keywords ===
# Maps each category label to regex patterns. A document is tagged with a
# category when any one of its patterns is found in the document text.
# Patterns are written in lowercase because the classifier lowercases the
# text before searching.
category_keywords = {
    "Predictive AI": [
        r"\bpredictive model(s)?\b",
        r"\bpredictive analytics\b",
        r"\bpredictive algorithm(s)?\b",
        r"\brisk prediction model(s)?\b",
        r"\bmachine learning (for|based on) prediction\b",
        r"\bpredicting (outcomes|disease|risk)\b"
    ],
    "Generative AI": [
        r"\bgenerative ai\b",
        r"\b(ai[- ]generated|ai[- ]powered generation)\b",
        r"\blarge language model(s)?\b",
        r"\blanguage generation\b",
        r"\bsynthetic data generation\b",
        r"\bgpt[- ]?[\d]+\b",  # "gpt" followed by digits, with an optional hyphen or space between
        r"\bchatbot(s)?\b",
        r"\btext synthesis\b"
    ],
    "Prescriptive AI": [
        r"\bprescriptive analytics\b",
        r"\btreatment recommendation system(s)?\b",
        r"\bai[- ]based decision support\b",
        r"\bclinical decision support system(s)?\b",
        r"\btherapy recommendation\b"
    ],
    "Descriptive AI": [
        r"\bdescriptive analytics\b",
        r"\bun\-?supervised learning\b",  # matches "unsupervised" and "un-supervised"
        r"\bpattern recognition\b",
        r"\bphenotype (clustering|discovery)\b",
        r"\banomaly detection\b",
        r"\bclinical data exploration\b"
    ],
    "AI Agents": [
        r"\b(ai|intelligent) agent(s)?\b",
        r"\bautonomous ai system(s)?\b",
        r"\bvirtual (assistant|agent|companion)\b",
        r"\bdigital health agent(s)?\b",
        r"\binteractive ai system(s)?\b",
        r"\bconversational ai\b"
    ]
}


# === Download and Save PDF Text ===
def save_pdf_text_from_url(url, filename, timeout=30):
    """Download a PDF, extract its text, and save it as a file in TEXT_DIR.

    Args:
        url: Address of the PDF to fetch.
        filename: Name of the text file to create inside ``TEXT_DIR``.
        timeout: Seconds to wait on the server before giving up, so one
            unresponsive host cannot stall the whole run.

    Returns:
        The path of the written text file, or None when the download, the PDF
        parsing, or the write fails (the error is printed, not raised).
    """
    import io  # local import: the notebook's import cell does not provide io

    # Browser-like headers sent with every request.
    # NOTE(review): presumably added to avoid 403 responses from hosts that
    # reject the default client; the fixed Referer is unrelated to most of
    # the URLs -- confirm it is still wanted.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept": "application/pdf",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.education.nh.gov/"
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        # Parse straight from memory. The earlier temp.pdf round-trip left the
        # file behind whenever parsing raised, because os.remove was skipped.
        reader = PdfReader(io.BytesIO(response.content))
        extracted = (page.extract_text() for page in reader.pages)
        # Pages with no extractable text are dropped; the rest end in "\n".
        text = "".join(page_text + "\n" for page_text in extracted if page_text)

        text_path = os.path.join(TEXT_DIR, filename)
        with open(text_path, "w", encoding="utf-8") as f:
            f.write(text)

        return text_path

    except Exception as e:
        print(f"Error downloading or reading {url}: {e}")
        return None


# === Classify a Text File ===
def classify_text_file(text_path, keywords=None):
    """Return the AI categories whose keyword patterns occur in a text file.

    The text is lowercased and every run of whitespace is collapsed to one
    space before matching. Extracted PDF text contains line breaks inside
    phrases, and the multi-word patterns use literal spaces, so without this
    step a phrase split across two lines would be missed.

    Args:
        text_path: Path to a UTF-8 text file.
        keywords: Optional mapping of category label to a list of regex
            patterns. Defaults to the module-level ``category_keywords``.

    Returns:
        The matched category labels joined by ", " in mapping order, "None"
        when nothing matches, or "Error" when the file cannot be processed
        (the error is printed, not raised).
    """
    try:
        active_keywords = category_keywords if keywords is None else keywords
        with open(text_path, "r", encoding="utf-8") as f:
            raw_text = f.read()
        text = re.sub(r"\s+", " ", raw_text).lower()
        matched = [
            category
            for category, patterns in active_keywords.items()
            if any(re.search(p, text) for p in patterns)
        ]
        return ", ".join(matched) if matched else "None"
    except Exception as e:
        print(f"Error reading text file {text_path}: {e}")
        return "Error"

# === Main Processing Function ===
def process_documents(csv_path):
    """Download, extract and classify every PDF listed in a CSV.

    Each row's "PDF Link" is saved as ``doc_<row index + 1>.txt`` through
    ``save_pdf_text_from_url`` and then labelled by ``classify_text_file``.
    Rows whose download or extraction fails are recorded as "Download Error".
    The collected results are written to ``OUTPUT_PATH``.

    Args:
        csv_path: Path to a CSV file with a "PDF Link" column.
    """
    documents = pd.read_csv(csv_path)
    total = len(documents)
    records = []

    for idx, record in documents.iterrows():
        pdf_url = record["PDF Link"]
        position = idx + 1
        print(f"Processing ({position}/{total}): {pdf_url}")

        saved_path = save_pdf_text_from_url(pdf_url, f"doc_{position}.txt")
        # A falsy path means the download or extraction step failed.
        label = classify_text_file(saved_path) if saved_path else "Download Error"

        records.append({"pdf_url": pdf_url, "categories": label})

    pd.DataFrame(records).to_csv(OUTPUT_PATH, index=False)
    print(f"\nClassification complete. Results saved to {OUTPUT_PATH}")

# === Run Script ===
# Entry point: download, extract and classify every PDF listed in CSV_PATH.
if __name__ == "__main__":
    process_documents(CSV_PATH)


Processing (1/20): https://www.cms.gov/Regulations-and-Guidance/Guidance/Manuals/downloads/som107ap_a_hospitals.pdf
Processing (2/20): https://nam.edu/wp-content/uploads/2021/07/4.3-AI-in-Health-Care-title-authors-summary.pdf
Processing (3/20): https://www.chfs.ky.gov/agencies/dph/oc/Documents/FY24-Clinical-Service-Guide.pdf
Processing (4/20): https://dcr.hawaii.gov/wp-content/uploads/2024/08/2024-Community-Resource-Guide-Part-1.pdf
Processing (5/20): https://www.maine.gov/dhhs/mecdc/environmental-health/plumb/documents/rules/CMR%20241%2007-1995.pdf
Error downloading or reading https://www.maine.gov/dhhs/mecdc/environmental-health/plumb/documents/rules/CMR%20241%2007-1995.pdf: 404 Client Error: Not Found for url: https://www.maine.gov/dhhs/mecdc/environmental-health/plumb/documents/rules/CMR%20241%2007-1995.pdf
Processing (6/20): https://stacks.cdc.gov/view/cdc/103606/cdc_103606_DS1.pdf
Processing (7/20): https://www.ahima.org/media/gq5jeclv/recertification_guide_2021.pdf
Processing (8

Third pass

In [None]:
import pandas as pd
import requests
import os
import re
from PyPDF2 import PdfReader

# === CONFIGURATION ===
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
CSV_PATH = "/Users/winnie/Documents/GitHub/MedAI/classify_by_state/sampled_20_rows.csv"  # Input CSV with a "PDF Link" column
OUTPUT_PATH = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/pdf_ai_categories3.csv"  # Classification results CSV
TEXT_DIR = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/pdf_texts"  # Extracted doc_N.txt files

# === Ensure output directory exists ===
os.makedirs(TEXT_DIR, exist_ok=True)

# === Category Keywords ===
# Maps each category label to regex patterns. A document is tagged with a
# category when any one of its patterns is found in the document text.
# Patterns are written in lowercase because the classifier lowercases the
# text before searching.
category_keywords = {
    "Predictive AI": [
        r"\bpredictive model(s)?\b",
        r"\bpredictive analytics\b",
        r"\bpredictive algorithm(s)?\b",
        r"\brisk prediction model(s)?\b",
        r"\bmachine learning (for|based on) prediction\b",
        r"\bpredicting (outcomes|disease|risk)\b"
    ],
    "Generative AI": [
        r"\bgenerative ai\b",
        r"\b(ai[- ]generated|ai[- ]powered generation)\b",
        r"\blarge language model(s)?\b",
        r"\blanguage generation\b",
        r"\bsynthetic data generation\b",
        r"\bgpt[- ]?[\d]+\b",  # "gpt" followed by digits, with an optional hyphen or space between
        r"\bchatbot(s)?\b",
        r"\btext synthesis\b",
        r"\bgenai\b"
    ],
    "Prescriptive AI": [
        r"\bprescriptive analytics\b",
        r"\btreatment recommendation system(s)?\b",
        r"\bai[- ]based decision support\b",
        r"\bclinical decision support system(s)?\b",
        r"\btherapy recommendation\b"
    ],
    "Descriptive AI": [
        r"\bdescriptive analytics\b",
        r"\bun\-?supervised learning\b",  # matches "unsupervised" and "un-supervised"
        r"\bpattern recognition\b",
        r"\bphenotype (clustering|discovery)\b",
        r"\banomaly detection\b",
        r"\bclinical data exploration\b"
    ],
    "AI Agents": [
        r"\b(ai|intelligent) agent(s)?\b",
        r"\bautonomous ai system(s)?\b",
        r"\bvirtual (assistant|agent|companion)\b",
        r"\bdigital health agent(s)?\b",
        r"\binteractive ai system(s)?\b",
        r"\bconversational ai\b"
    ]
}


In [8]:
# Extracting pdf & Converting pdf to text file
def extract_pdf_texts(csv_path, text_dir, timeout=30):
    """Download every PDF listed in a CSV and cache its extracted text on disk.

    For each row, the URL in the "PDF Link" column is fetched and the text of
    all pages is written to ``<text_dir>/doc_<row index + 1>.txt``. Rows whose
    text file already exists are skipped, so re-running the function only
    retries the documents that are still missing.

    Args:
        csv_path: Path to a CSV file with a "PDF Link" column.
        text_dir: Directory that receives the ``doc_N.txt`` files; created if
            it does not exist.
        timeout: Seconds to wait on the server before giving up on a URL, so
            one unresponsive host cannot stall the whole run.

    Returns:
        None. A failed download or parse is printed and leaves that row
        without a text file.
    """
    import io  # local import: the notebook's import cell does not provide io

    os.makedirs(text_dir, exist_ok=True)
    df = pd.read_csv(csv_path)

    # Browser-like headers sent with every request.
    # NOTE(review): presumably added to avoid 403 responses from hosts that
    # reject the default client; the fixed Referer is unrelated to most of
    # the URLs -- confirm it is still wanted.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept": "application/pdf",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.education.nh.gov/"
    }

    for index, row in df.iterrows():
        url = row["PDF Link"]
        filename = f"doc_{index + 1}.txt"
        text_path = os.path.join(text_dir, filename)

        if os.path.exists(text_path):
            print(f"Skipping (already exists): {filename}")
            continue

        print(f"Downloading and extracting: {url}")
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()

            # Parse straight from memory. The earlier temp.pdf round-trip left
            # the file behind whenever parsing raised (e.g. a truncated PDF),
            # because os.remove was skipped.
            reader = PdfReader(io.BytesIO(response.content))
            extracted = (page.extract_text() for page in reader.pages)
            # Pages with no extractable text are dropped; the rest end in "\n".
            text = "".join(page_text + "\n" for page_text in extracted if page_text)

            # Written only after extraction succeeded, so a failure never
            # leaves a partial file that a later run would skip.
            with open(text_path, "w", encoding="utf-8") as f:
                f.write(text)

        except Exception as e:
            print(f"Error downloading or reading {url}: {e}")

# Classifying text files by keywords
def classify_text_files(csv_path, text_dir, output_path, keywords=None):
    """Classify the cached text of each document in a CSV and save the result.

    For every row, ``<text_dir>/doc_<row index + 1>.txt`` is read, lowercased,
    and tested against the keyword patterns. Every run of whitespace is
    collapsed to one space first: extracted PDF text contains line breaks
    inside phrases, and the multi-word patterns use literal spaces, so a
    phrase split across two lines would otherwise be missed.

    Args:
        csv_path: Path to a CSV file with a "PDF Link" column.
        text_dir: Directory holding the ``doc_N.txt`` files.
        output_path: Destination CSV with columns ``pdf_url`` and
            ``categories``.
        keywords: Optional mapping of category label to a list of regex
            patterns. Defaults to the module-level ``category_keywords``.

    Returns:
        None. ``categories`` holds the matched labels joined by ", ", "None"
        when nothing matched, "Missing Text" when the text file is absent, or
        "Read Error" when the file could not be processed.
    """
    df = pd.read_csv(csv_path)
    results = []

    for index, row in df.iterrows():
        url = row["PDF Link"]
        filename = f"doc_{index + 1}.txt"
        text_path = os.path.join(text_dir, filename)

        if not os.path.exists(text_path):
            # The row is still recorded so the output keeps one line per input.
            print(f"Missing text file: {filename} — Skipping.")
            categories = "Missing Text"
        else:
            try:
                active_keywords = category_keywords if keywords is None else keywords
                with open(text_path, "r", encoding="utf-8") as f:
                    raw_text = f.read()
                text = re.sub(r"\s+", " ", raw_text).lower()
                matched = [
                    category
                    for category, patterns in active_keywords.items()
                    if any(re.search(p, text) for p in patterns)
                ]
                categories = ", ".join(matched) if matched else "None"
            except Exception as e:
                print(f"Error reading text file {filename}: {e}")
                categories = "Read Error"

        results.append({"pdf_url": url, "categories": categories})

    result_df = pd.DataFrame(results)
    result_df.to_csv(output_path, index=False)
    print(f"Classification complete. Results saved to {output_path}")

In [9]:
# Run for extracting pdfs and converting them to text files
# Rows whose doc_N.txt already exists in TEXT_DIR are skipped, so re-running
# this cell only retries the documents that are still missing.

extract_pdf_texts(CSV_PATH, TEXT_DIR)

Skipping (already exists): doc_1.txt
Skipping (already exists): doc_2.txt
Skipping (already exists): doc_3.txt
Skipping (already exists): doc_4.txt
Downloading and extracting: https://www.maine.gov/dhhs/mecdc/environmental-health/plumb/documents/rules/CMR%20241%2007-1995.pdf
Error downloading or reading https://www.maine.gov/dhhs/mecdc/environmental-health/plumb/documents/rules/CMR%20241%2007-1995.pdf: 404 Client Error: Not Found for url: https://www.maine.gov/dhhs/mecdc/environmental-health/plumb/documents/rules/CMR%20241%2007-1995.pdf
Skipping (already exists): doc_6.txt
Skipping (already exists): doc_7.txt
Skipping (already exists): doc_8.txt
Skipping (already exists): doc_9.txt
Skipping (already exists): doc_10.txt
Skipping (already exists): doc_11.txt
Skipping (already exists): doc_12.txt
Skipping (already exists): doc_13.txt
Skipping (already exists): doc_14.txt
Skipping (already exists): doc_15.txt
Skipping (already exists): doc_16.txt
Skipping (already exists): doc_17.txt
Skipp

In [10]:
# Run for classifying the text files by keyword
# Rows without a text file in TEXT_DIR are recorded as "Missing Text" in the
# output CSV rather than dropped.

classify_text_files(CSV_PATH, TEXT_DIR, OUTPUT_PATH)

Missing text file: doc_5.txt — Skipping.
Classification complete. Results saved to /Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/pdf_ai_categories3.csv


Fourth pass with good_docs

In [None]:
import pandas as pd
import requests
import os
import re
from PyPDF2 import PdfReader

# === CONFIGURATION ===
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
CSV_PATH = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/good_docs.csv"  # Input CSV with a "PDF Link" column
OUTPUT_PATH = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/pdf_ai_categories4.csv"  # Classification results CSV
TEXT_DIR = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/good_docs_texts"  # Extracted doc_N.txt files

# === Ensure output directory exists ===
os.makedirs(TEXT_DIR, exist_ok=True)

# === Category Keywords ===
# Maps each category label to regex patterns. A document is tagged with a
# category when any one of its patterns is found in the document text.
# Patterns are written in lowercase because the classifier lowercases the
# text before searching.
category_keywords = {
    "Predictive AI": [
        r"\bpredictive model(s)?\b",
        r"\bpredictive analytics\b",
        r"\bpredictive algorithm(s)?\b",
        r"\brisk prediction model(s)?\b",
        r"\bmachine learning (for|based on) prediction\b",
        # NOTE(review): this bare "machine learning" pattern makes the narrower
        # pattern above redundant and tags any document that mentions machine
        # learning as Predictive AI -- confirm that breadth is intended.
        r"\bmachine learning\b",
        r"\bpredicting (outcomes|disease|risk)\b"
    ],
    "Generative AI": [
        r"\bgenerative ai\b",
        r"\b(ai[- ]generated|ai[- ]powered generation)\b",
        r"\blarge language model(s)?\b",
        r"\blanguage generation\b",
        r"\bsynthetic data generation\b",
        r"\bgpt[- ]?[\d]+\b",  # "gpt" followed by digits, with an optional hyphen or space between
        r"\bchatbot(s)?\b",
        r"\btext synthesis\b",
        r"\bgenai\b"
    ],
    "Prescriptive AI": [
        r"\bprescriptive analytics\b",
        r"\btreatment recommendation system(s)?\b",
        r"\bai[- ]based decision support\b",
        r"\bclinical decision support system(s)?\b",
        r"\btherapy recommendation\b"
    ],
    "Descriptive AI": [
        r"\bdescriptive analytics\b",
        r"\bun\-?supervised learning\b",  # matches "unsupervised" and "un-supervised"
        r"\bpattern recognition\b",
        r"\bphenotype (clustering|discovery)\b",
        r"\banomaly detection\b",
        r"\bclinical data exploration\b"
    ],
    "AI Agents": [
        r"\b(ai|intelligent) agent(s)?\b",
        r"\bautonomous ai system(s)?\b",
        r"\bvirtual (assistant|agent|companion)\b",
        r"\bdigital health agent(s)?\b",
        r"\binteractive ai system(s)?\b",
        r"\bconversational ai\b"
    ]
}


In [12]:
# Extracting pdf & Converting pdf to text file
def extract_pdf_texts(csv_path, text_dir, timeout=30):
    """Download every PDF listed in a CSV and cache its extracted text on disk.

    For each row, the URL in the "PDF Link" column is fetched and the text of
    all pages is written to ``<text_dir>/doc_<row index + 1>.txt``. Rows whose
    text file already exists are skipped, so re-running the function only
    retries the documents that are still missing.

    Args:
        csv_path: Path to a CSV file with a "PDF Link" column.
        text_dir: Directory that receives the ``doc_N.txt`` files; created if
            it does not exist.
        timeout: Seconds to wait on the server before giving up on a URL, so
            one unresponsive host cannot stall the whole run.

    Returns:
        None. A failed download or parse is printed and leaves that row
        without a text file.
    """
    import io  # local import: the notebook's import cell does not provide io

    os.makedirs(text_dir, exist_ok=True)
    df = pd.read_csv(csv_path)

    # Browser-like headers sent with every request.
    # NOTE(review): presumably added to avoid 403 responses from hosts that
    # reject the default client; the fixed Referer is unrelated to most of
    # the URLs -- confirm it is still wanted.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept": "application/pdf",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.education.nh.gov/"
    }

    for index, row in df.iterrows():
        url = row["PDF Link"]
        filename = f"doc_{index + 1}.txt"
        text_path = os.path.join(text_dir, filename)

        if os.path.exists(text_path):
            print(f"Skipping (already exists): {filename}")
            continue

        print(f"Downloading and extracting: {url}")
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()

            # Parse straight from memory. The earlier temp.pdf round-trip left
            # the file behind whenever parsing raised (e.g. the "EOF marker
            # not found" failure), because os.remove was skipped.
            reader = PdfReader(io.BytesIO(response.content))
            extracted = (page.extract_text() for page in reader.pages)
            # Pages with no extractable text are dropped; the rest end in "\n".
            text = "".join(page_text + "\n" for page_text in extracted if page_text)

            # Written only after extraction succeeded, so a failure never
            # leaves a partial file that a later run would skip.
            with open(text_path, "w", encoding="utf-8") as f:
                f.write(text)

        except Exception as e:
            print(f"Error downloading or reading {url}: {e}")

# Classifying text files by keywords
def classify_text_files(csv_path, text_dir, output_path, keywords=None):
    """Classify the cached text of each document in a CSV and save the result.

    For every row, ``<text_dir>/doc_<row index + 1>.txt`` is read, lowercased,
    and tested against the keyword patterns. Every run of whitespace is
    collapsed to one space first: extracted PDF text contains line breaks
    inside phrases, and the multi-word patterns use literal spaces, so a
    phrase split across two lines would otherwise be missed.

    Args:
        csv_path: Path to a CSV file with a "PDF Link" column.
        text_dir: Directory holding the ``doc_N.txt`` files.
        output_path: Destination CSV with columns ``pdf_url`` and
            ``categories``.
        keywords: Optional mapping of category label to a list of regex
            patterns. Defaults to the module-level ``category_keywords``.

    Returns:
        None. ``categories`` holds the matched labels joined by ", ", "None"
        when nothing matched, "Missing Text" when the text file is absent, or
        "Read Error" when the file could not be processed.
    """
    df = pd.read_csv(csv_path)
    results = []

    for index, row in df.iterrows():
        url = row["PDF Link"]
        filename = f"doc_{index + 1}.txt"
        text_path = os.path.join(text_dir, filename)

        if not os.path.exists(text_path):
            # The row is still recorded so the output keeps one line per input.
            print(f"Missing text file: {filename} — Skipping.")
            categories = "Missing Text"
        else:
            try:
                active_keywords = category_keywords if keywords is None else keywords
                with open(text_path, "r", encoding="utf-8") as f:
                    raw_text = f.read()
                text = re.sub(r"\s+", " ", raw_text).lower()
                matched = [
                    category
                    for category, patterns in active_keywords.items()
                    if any(re.search(p, text) for p in patterns)
                ]
                categories = ", ".join(matched) if matched else "None"
            except Exception as e:
                print(f"Error reading text file {filename}: {e}")
                categories = "Read Error"

        results.append({"pdf_url": url, "categories": categories})

    result_df = pd.DataFrame(results)
    result_df.to_csv(output_path, index=False)
    print(f"Classification complete. Results saved to {output_path}")

In [13]:
# Run for extracting pdfs and converting them to text files
# Rows whose doc_N.txt already exists in TEXT_DIR are skipped, so re-running
# this cell only retries the documents that are still missing.

extract_pdf_texts(CSV_PATH, TEXT_DIR)

Downloading and extracting: https://www.ama-assn.org/system/files/clrpd-report-generative-ai.pdf
Downloading and extracting: https://watech.wa.gov/sites/default/files/2024-07/Ethical%20Considerations%20in%20the%20use%20of%20Artificial%20Intelligence%20in%20Healthcare%2C%20and%20Washington%27s%20approach%20to%20Generative%20AI.pdf
Downloading and extracting: https://www.healthit.gov/sites/default/files/jsr-17-task-002_aiforhealthandhealthcare12122017.pdf
Downloading and extracting: https://www.spiedigitallibrary.org/journals/journal-of-medical-imaging/volume-10/issue-6/061104/Toward-fairness-in-artificial-intelligence-for-medical-image-analysis/10.1117/1.JMI.10.6.061104.pdf
Error downloading or reading https://www.spiedigitallibrary.org/journals/journal-of-medical-imaging/volume-10/issue-6/061104/Toward-fairness-in-artificial-intelligence-for-medical-image-analysis/10.1117/1.JMI.10.6.061104.pdf: EOF marker not found
Downloading and extracting: https://cdt.ca.gov/wp-content/uploads/2025/

In [14]:
# Run for classifying the text files by keyword
# Rows without a text file in TEXT_DIR are recorded as "Missing Text" in the
# output CSV rather than dropped.

classify_text_files(CSV_PATH, TEXT_DIR, OUTPUT_PATH)

Missing text file: doc_4.txt — Skipping.
Classification complete. Results saved to /Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_AI_type/pdf_ai_categories4.csv
