In [6]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import requests
import fitz  
import re
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import concurrent.futures
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
import time
import os
from dotenv import load_dotenv

csv_path = r"C:\Users\joeva\Downloads\ai_gov\data_collection\data\cleaned_raw_pdfs.csv"

In [3]:
from openai import OpenAI, AzureOpenAI
import json
import regex as re

load_dotenv()  # Load from .env file

AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT_embeddings")
AZURE_API_KEY = os.getenv("AZURE_API_KEY_embeddings")
AZURE_API_VERSION = "2025-04-01-preview"

client = AzureOpenAI(
    azure_endpoint=AZURE_ENDPOINT,
    api_key=AZURE_API_KEY,
    api_version=AZURE_API_VERSION
)

In [None]:
def download_pdf(row, save_dir):
    url = row['PDF Link']
    filename = os.path.join(save_dir, os.path.basename(url))
    if os.path.exists(filename):
        return f"Already exists: {filename}"
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept": "application/pdf",
        "Referer": "https://www.hhs.gov/"
    }

    try:
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code == 200:
            with open(filename, 'wb') as f:
                f.write(response.content)
            return f"Downloaded: {filename}"
        else:
            return f"Failed ({response.status_code}): {url}"
    except Exception as e:
        return f"Error: {url} -> {e}"

def batch_download(csv_path, save_dir, max_threads=10):
    os.makedirs(save_dir, exist_ok=True)
    df = pd.read_csv(csv_path)
    with concurrent.futures.ThreadPoolExecutor(max_threads) as executor:
        futures = [executor.submit(download_pdf, row, save_dir) for _, row in df.iterrows()]
        for f in concurrent.futures.as_completed(futures):
            print(f.result())

if __name__ == "__main__":
    save_dir = r"C:\Users\joeva\Downloads\ai_gov\data\raw"
    batch_download(csv_path, save_dir, max_threads=15)

Already exists: C:\Users\joeva\Downloads\ai_gov\data\raw\115PM-OSC-Concept-Clearance-PRIMED-AI-Chiang-Tromberg-508.pdf
Already exists: C:\Users\joeva\Downloads\ai_gov\data\raw\quality-motion-cms-national-quality-strategy.pdf
Already exists: C:\Users\joeva\Downloads\ai_gov\data\raw\AR410.pdf
Already exists: C:\Users\joeva\Downloads\ai_gov\data\raw\cms-2023-omh-z-code-resource.pdf
Already exists: C:\Users\joeva\Downloads\ai_gov\data\raw\Oahu-2018.pdf
Failed (404): https://pmc.ncbi.nlm.nih.gov/articles/PMC9815490/pdf/nihms-1860115.pdf
Already exists: C:\Users\joeva\Downloads\ai_gov\data\raw\ONC-LEAP-in-Health-IT-SEN-FY2021.pdf
Already exists: C:\Users\joeva\Downloads\ai_gov\data\raw\2024-kohs-report.pdf
Already exists: C:\Users\joeva\Downloads\ai_gov\data\raw\NIST.IR.8084.pdf
Already exists: C:\Users\joeva\Downloads\ai_gov\data\raw\QCA-Report.pdf
Already exists: C:\Users\joeva\Downloads\ai_gov\data\raw\AI-Supplement-Dr.-Alaa-Youssef-Presentation-508.pdf
Already exists: C:\Users\joeva\Down

In [None]:
def extract_text_from_pdf(path):
    doc = fitz.open(path)
    return " ".join([page.get_text() for page in doc])

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def process_pdfs_from_csv(csv_path, input_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    df = pd.read_csv(csv_path)

    for _, row in df.iterrows():
        pdf_name = os.path.basename(row["PDF Link"])
        pdf_path = os.path.join(input_dir, pdf_name)
        out_path = os.path.join(output_dir, pdf_name.replace(".pdf", ".txt"))

        if os.path.exists(pdf_path):
            try:
                raw_text = extract_text_from_pdf(pdf_path)
                cleaned = clean_text(raw_text)
                with open(out_path, "w", encoding="utf-8", errors="replace") as f:
                    f.write(cleaned)
            except Exception as e:
                print(f"Failed to process {pdf_name}: {e}")

if __name__ == "__main__":
    process_pdfs_from_csv(r"C:\Users\joeva\Downloads\ai_gov\data_collection\data\cleaned_raw_pdfs.csv", r"C:\Users\joeva\Downloads\ai_gov\data\raw", r"C:\Users\joeva\Downloads\ai_gov\data\processed")

Failed to process perm-ry-2025_tcm1053-650717.pdf: Failed to open file 'C:\\Users\\joeva\\Downloads\\ai_gov\\data\\raw\\perm-ry-2025_tcm1053-650717.pdf'.
Failed to process 2023-atc-annual-report_tcm1045-614285.pdf: Failed to open file 'C:\\Users\\joeva\\Downloads\\ai_gov\\data\\raw\\2023-atc-annual-report_tcm1045-614285.pdf'.
Failed to process human-services.pdf: Failed to open file 'C:\\Users\\joeva\\Downloads\\ai_gov\\data\\raw\\human-services.pdf'.
Failed to process 77-MAN-STR.pdf: Failed to open file 'C:\\Users\\joeva\\Downloads\\ai_gov\\data\\raw\\77-MAN-STR.pdf'.
MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error

In [7]:
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    words = text.lower().split()
    words = [lemmatizer.lemmatize(w) for w in words if w.isalpha()]
    return " ".join(words)

def load_documents_from_folder(folder_path, filenames):
    docs = []
    for filename in filenames:
        txt_path = os.path.join(folder_path, filename.replace(".pdf", ".txt"))
        if os.path.exists(txt_path):
            with open(txt_path, "r", encoding="utf-8", errors="replace") as f:
                text = f.read().strip()
                if text:
                    docs.append(preprocess(text))
                else:
                    docs.append("") 
        else:
            docs.append("") 
    return docs

In [10]:
raw_df = pd.read_csv(csv_path)
raw_df = raw_df.iloc[:200].copy()
raw_df['filename'] = raw_df['PDF Link'].apply(lambda url: os.path.basename(url))
docs = load_documents_from_folder(r"C:\Users\joeva\Downloads\ai_gov\data\processed", raw_df['filename'].tolist())
raw_df['text'] = docs
for i, row in raw_df.iterrows():
    print(f"Row {i} - URL: {row['PDF Link']} - Filename: {row['filename']} - Text length: {len(row['text'])}")
print(f"Loaded and preprocessed {len(docs)} documents")

Row 0 - URL: https://www.fda.gov/files/medical%20devices/published/US-FDA-Artificial-Intelligence-and-Machine-Learning-Discussion-Paper.pdf - Filename: US-FDA-Artificial-Intelligence-and-Machine-Learning-Discussion-Paper.pdf - Text length: 41626
Row 1 - URL: https://www.accessdata.fda.gov/cdrh_docs/pdf22/K221624.pdf - Filename: K221624.pdf - Text length: 22905
Row 2 - URL: https://www.accessdata.fda.gov/cdrh_docs/pdf19/K193271.pdf - Filename: K193271.pdf - Text length: 18075
Row 3 - URL: https://www.accessdata.fda.gov/cdrh_docs/pdf23/K232672.pdf - Filename: K232672.pdf - Text length: 57247
Row 4 - URL: https://www.accessdata.fda.gov/cdrh_docs/pdf2/P020011c.pdf - Filename: P020011c.pdf - Text length: 57549
Row 5 - URL: https://www.fda.gov/files/food/published/GRAS-Notice-000581.pdf - Filename: GRAS-Notice-000581.pdf - Text length: 126697
Row 6 - URL: https://www.accessdata.fda.gov/cdrh_docs/pdf4/P040052c.pdf - Filename: P040052c.pdf - Text length: 43244
Row 7 - URL: https://www.hhs.gov/

In [15]:
import time
import pandas as pd
import numpy as np
import tiktoken
from nltk.corpus import stopwords

def chunk_text(text, max_tokens=500):
    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)
    chunks = []
    for i in range(0, len(tokens), max_tokens):
        chunk_tokens = tokens[i:i+max_tokens]
        chunk_text = enc.decode(chunk_tokens)
        chunks.append(chunk_text)
    return chunks

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered = [w for w in words if w.lower() not in stop_words]
    return " ".join(filtered)

def get_openai_embeddings(texts, model="embedding3large", batch_size=200, max_retries=5):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        for attempt in range(max_retries):
            try:
                response = client.embeddings.create(
                    input=batch,
                    model=model
                )
                batch_embeddings = [data.embedding for data in response.data]
                embeddings.extend(batch_embeddings)
                time.sleep(1.1)  # delay between batches to avoid rate limits
                break  # success, exit retry loop
            except Exception as e:
                if "RateLimitError" in str(e) or "429" in str(e):
                    wait = 2 ** attempt
                    time.sleep(wait)
                else:
                    raise
        else:
            pass
    return embeddings

def get_avg_embedding_for_doc(text, model="embedding3large", batch_size=200, max_tokens=500, emb_size=3072):
    filtered_text = remove_stopwords(text)
    chunks = chunk_text(filtered_text, max_tokens=max_tokens)
    chunk_embeddings = get_openai_embeddings(chunks, model=model, batch_size=batch_size)
    if chunk_embeddings and isinstance(chunk_embeddings, list) and len(chunk_embeddings) > 0:
        avg_embedding = np.mean(chunk_embeddings, axis=0)
        if isinstance(avg_embedding, np.ndarray) and avg_embedding.size == emb_size:
            return avg_embedding
    return np.zeros(emb_size)

if __name__ == "__main__":
    import os
    import nltk
    nltk.download('stopwords')

    csv_path = r"C:\Users\joeva\Downloads\ai_gov\data_collection\data\cleaned_raw_pdfs.csv"
    folder = r"C:\Users\joeva\Downloads\ai_gov\data\processed"

    df = pd.read_csv(csv_path)
    filenames = [os.path.basename(url) for url in df["PDF Link"][:200]]
    
    docs = load_documents_from_folder(folder, filenames)
    print(f"Loaded and preprocessed {len(docs)} documents")

    # Generate averaged embeddings for each document
    embeddings = []
    for doc in docs:
        avg_emb = get_avg_embedding_for_doc(doc)
        embeddings.append(avg_emb)
    embeddings = np.array(embeddings)
    print(f"Generated embeddings shape: {embeddings.shape}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joeva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loaded and preprocessed 200 documents
Generated embeddings shape: (200, 3072)


In [11]:
hand_df = pd.read_csv(r"C:\Users\joeva\Downloads\ai_gov\Hand Labeling.csv")
hand_df = hand_df.iloc[:200].copy()
hand_df['filename'] = raw_df['filename'].values[:200]

In [12]:
def parse_themes(theme_str):
    if pd.isna(theme_str):
        return []
    themes = [t.strip() for t in theme_str.split(',')]
    if 'Unclassified' in themes:
        return [] 
    return themes

hand_df['theme_list'] = hand_df['Focus Area'].apply(parse_themes)

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer

THEMES = ['Bias and Fairness', 'Reliability and Performance', 'Privacy and Security', 'Transparency and Explainability', 'Ethics and Consent', 'Safety and Risk Management', 'Governance Committees']

mlb = MultiLabelBinarizer(classes=THEMES)
y = mlb.fit_transform(hand_df['theme_list'])

ints = pd.DataFrame(y, columns=mlb.classes_)



In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
import numpy as np

X = embeddings

labels = hand_df['Focus Area'].fillna('').apply(lambda x: x.split(',') if x else [])

X_train = X[:60]              
y_train = y[:60]
X_test = X[60:116]
y_test = y[60:116]            
df_test = hand_df.iloc[60:116]

param_grid = {
    'estimator__C': [0.01, 0.1, 1, 10],
    'estimator__penalty': ['l2'],
    'estimator__solver': ['lbfgs'],
    'estimator__max_iter': [500]
}

base_lr = LogisticRegression(random_state=42)
clf = OneVsRestClassifier(base_lr)

grid = GridSearchCV(clf, param_grid, cv=3, scoring='f1_micro', verbose=1)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
best_clf = grid.best_estimator_

y_proba = best_clf.predict_proba(X_test)

threshold = 0.3
y_pred = (y_proba >= threshold).astype(int)

predicted_labels = mlb.inverse_transform(y_pred)

hand_df.loc[60:116, 'predicted_labels'] = pd.Series(predicted_labels, index=hand_df.index[60:116])

hand_df.to_csv("updated_with_predictions.csv", index=False)

print(classification_report(y_test, y_pred, target_names=mlb.classes_))

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best parameters: {'estimator__C': 10, 'estimator__max_iter': 500, 'estimator__penalty': 'l2', 'estimator__solver': 'lbfgs'}
                                 precision    recall  f1-score   support

              Bias and Fairness       0.81      0.87      0.84        15
    Reliability and Performance       0.53      0.53      0.53        15
           Privacy and Security       0.61      0.55      0.58        20
Transparency and Explainability       0.50      0.90      0.64        10
             Ethics and Consent       0.67      0.63      0.65        19
     Safety and Risk Management       0.47      0.58      0.52        12
          Governance Committees       0.50      0.50      0.50        10

                      micro avg       0.59      0.64      0.62       101
                      macro avg       0.58      0.65      0.61       101
                   weighted avg       0.60      0.64      0.62       101
           

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
