In [442]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import requests
import fitz  
import re
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import concurrent.futures
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
import time
import os
from dotenv import load_dotenv

# Dataset CSV read by the download cell and by the document-loading cells.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
csv_path = r"C:\Users\joeva\Downloads\ai_gov\FINAL_dataset.csv"

In [2]:
from openai import OpenAI, AzureOpenAI
import json
import regex as re

load_dotenv()  # Load from .env file

# Azure OpenAI connection settings. The endpoint and key are read from the
# environment; os.getenv returns None when a variable is not set.
AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT_embeddings")
AZURE_API_KEY = os.getenv("AZURE_API_KEY_embeddings")
AZURE_API_VERSION = "2025-04-01-preview"

# Client bound to the settings above.
# NOTE(review): `client` is not referenced by any cell visible in this part of
# the notebook — confirm it is still needed.
client = AzureOpenAI(
    azure_endpoint=AZURE_ENDPOINT,
    api_key=AZURE_API_KEY,
    api_version=AZURE_API_VERSION
)

In [27]:
def download_pdf(row, save_dir):
    """Download the PDF referenced by ``row['PDF Link']`` into ``save_dir``.

    The local file name is the last path component of the URL, exactly as
    returned by ``os.path.basename`` (it is not URL-decoded), so other cells
    that derive the name the same way keep finding the file.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'PDF Link' entry.
        save_dir: Existing directory the PDF is written to.

    Returns:
        A human-readable status string. This function never raises for a bad
        row or a failed request; the outcome is reported in the string instead:
        "Skipped ...", "Already exists: ...", "Downloaded: ...",
        "Failed (<status>): ..." or "Error: ...".
    """
    url = row['PDF Link']

    # A blank CSV cell arrives as NaN (a float). os.path.basename would raise
    # TypeError on it, and because that happened outside the try block the
    # exception escaped the worker and aborted the whole batch.
    if not isinstance(url, str) or not url.strip():
        return f"Skipped (missing PDF link): {url}"

    base_name = os.path.basename(url)
    # A URL ending in "/" has no file name; joining "" onto save_dir yields the
    # directory itself, which always "exists" and was misreported as a hit.
    if not base_name:
        return f"Skipped (no file name in URL): {url}"

    filename = os.path.join(save_dir, base_name)
    if os.path.exists(filename):
        return f"Already exists: {filename}"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept": "application/pdf",
        "Referer": "https://www.hhs.gov/"
    }

    try:
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            return f"Failed ({response.status_code}): {url}"
        with open(filename, 'wb') as f:
            f.write(response.content)
        return f"Downloaded: {filename}"
    except Exception as e:
        # Best effort by design: one bad link must not stop the other downloads.
        return f"Error: {url} -> {e}"

def batch_download(csv_path, save_dir, max_threads=10):
    """Download every PDF listed in the CSV at ``csv_path`` using a thread pool.

    Creates ``save_dir`` if needed, submits one ``download_pdf`` task per CSV
    row, and prints each task's status message as soon as it completes.

    Args:
        csv_path: Path of the CSV file to read.
        save_dir: Directory the PDFs are saved into.
        max_threads: Maximum number of worker threads.
    """
    os.makedirs(save_dir, exist_ok=True)
    links_df = pd.read_csv(csv_path)
    with concurrent.futures.ThreadPoolExecutor(max_threads) as pool:
        pending = []
        for _, record in links_df.iterrows():
            pending.append(pool.submit(download_pdf, record, save_dir))
        # Report results in completion order, not submission order.
        for finished in concurrent.futures.as_completed(pending):
            print(finished.result())

if __name__ == "__main__":
    # Fetch every PDF listed in the dataset CSV into the raw-data folder.
    save_dir = r"C:\Users\joeva\Downloads\ai_gov\data\raw"
    batch_download(csv_path=csv_path, save_dir=save_dir, max_threads=15)

Downloaded: C:\Users\joeva\Downloads\ai_gov\data\raw\P020011c.pdf
Downloaded: C:\Users\joeva\Downloads\ai_gov\data\raw\ONC-LEAP-in-Health-IT-SEN-FY2021.pdf
Downloaded: C:\Users\joeva\Downloads\ai_gov\data\raw\2024-04-11_John_Brownstein_Presentation_508.pdf
Downloaded: C:\Users\joeva\Downloads\ai_gov\data\raw\aspe-brief-human-services-health-data-linking.pdf
Downloaded: C:\Users\joeva\Downloads\ai_gov\data\raw\P040052c.pdf
Downloaded: C:\Users\joeva\Downloads\ai_gov\data\raw\Natalie%20Torentinos.pdf
Downloaded: C:\Users\joeva\Downloads\ai_gov\data\raw\K193271.pdf
Downloaded: C:\Users\joeva\Downloads\ai_gov\data\raw\GettingerModeratorSlidesAIPanelsforONCAnnualMeeting12720Final.pdf
Downloaded: C:\Users\joeva\Downloads\ai_gov\data\raw\ONC_AI_in_Health_IT_Showcase.pdf
Downloaded: C:\Users\joeva\Downloads\ai_gov\data\raw\HITAC_Annual_Report_for_FY24_508.pdf
Downloaded: C:\Users\joeva\Downloads\ai_gov\data\raw\405d-post-volxxv-july-2024.pdf
Downloaded: C:\Users\joeva\Downloads\ai_gov\data\raw

In [28]:
def extract_text_from_pdf(path):
    """Return the text of every page of the PDF at ``path``, joined by spaces.

    The document is opened in a ``with`` block so it is closed as soon as
    extraction finishes, including when a page raises. Previously the document
    was never closed, which kept one file handle open per processed PDF.

    Args:
        path: Path of the PDF file to read.

    Returns:
        A single string holding the text of all pages in page order.
    """
    with fitz.open(path) as doc:
        return " ".join(page.get_text() for page in doc)

def clean_text(text):
    """Collapse each run of whitespace in ``text`` to one space and trim the ends."""
    return re.sub(r'\s+', ' ', text).strip()

def process_pdfs_from_csv(csv_path, input_dir, output_dir):
    """Extract and clean the text of each PDF listed in a CSV.

    For every usable value in the 'PDF Link' column, the PDF named after the
    link's last path component is looked up in ``input_dir``. Its text is
    extracted, whitespace-normalised, and written to ``output_dir`` under the
    same name with ".pdf" replaced by ".txt". Existing outputs are left alone,
    so the function can be re-run safely.

    Rows whose link is missing (NaN or another non-string) or has no file name
    are skipped with a message. Previously a NaN link raised TypeError and
    stopped the loop, and a link ending in "/" resolved to the output directory
    itself and was reported as already processed.

    Args:
        csv_path: CSV file with a 'PDF Link' column.
        input_dir: Directory holding the downloaded PDFs.
        output_dir: Directory for the .txt files (created if missing).
    """
    os.makedirs(output_dir, exist_ok=True)
    df = pd.read_csv(csv_path)

    for link in df["PDF Link"]:
        pdf_name = os.path.basename(link) if isinstance(link, str) else ""
        if not pdf_name:
            print(f"Skipping row without a usable PDF link: {link}")
            continue

        pdf_path = os.path.join(input_dir, pdf_name)
        # The naming rule must stay in sync with the loader that reads these files.
        out_path = os.path.join(output_dir, pdf_name.replace(".pdf", ".txt"))

        if os.path.exists(out_path):
            print(f"Already processed: {out_path}")
            continue

        if not os.path.exists(pdf_path):
            print(f"Missing PDF file: {pdf_name}")
            continue

        try:
            raw_text = extract_text_from_pdf(pdf_path)
            cleaned = clean_text(raw_text)
            with open(out_path, "w", encoding="utf-8", errors="replace") as f:
                f.write(cleaned)
            print(f"Processed: {pdf_name}")
        except Exception as e:
            # Best effort: one unreadable PDF must not stop the rest.
            print(f"Failed to process {pdf_name}: {e}")

if __name__ == "__main__":
    # Convert the downloaded PDFs listed in the filtered CSV into cleaned .txt files.
    process_pdfs_from_csv(
        csv_path=r"C:\Users\joeva\Downloads\ai_gov\ai_filtered.csv",
        input_dir=r"C:\Users\joeva\Downloads\ai_gov\data\raw",
        output_dir=r"C:\Users\joeva\Downloads\ai_gov\data\processed",
    )

Already processed: C:\Users\joeva\Downloads\ai_gov\data\processed\US-FDA-Artificial-Intelligence-and-Machine-Learning-Discussion-Paper.txt
Already processed: C:\Users\joeva\Downloads\ai_gov\data\processed\K221624.txt
Already processed: C:\Users\joeva\Downloads\ai_gov\data\processed\K193271.txt
Already processed: C:\Users\joeva\Downloads\ai_gov\data\processed\K232672.txt
Already processed: C:\Users\joeva\Downloads\ai_gov\data\processed\P020011c.txt
Already processed: C:\Users\joeva\Downloads\ai_gov\data\processed\P040052c.txt
Already processed: C:\Users\joeva\Downloads\ai_gov\data\processed\405d-post-volxxv-july-2024.txt
Already processed: C:\Users\joeva\Downloads\ai_gov\data\processed\trustworthy-ai.txt
Already processed: C:\Users\joeva\Downloads\ai_gov\data\processed\fy2024-ospcortf-annual-report.txt
Already processed: C:\Users\joeva\Downloads\ai_gov\data\processed\aspe-brief-human-services-health-data-linking.txt
Already processed: C:\Users\joeva\Downloads\ai_gov\data\processed\prepa

In [443]:
# Module-level lemmatizer instance shared by every preprocess() call.
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Lowercase ``text``, keep purely alphabetic tokens, and lemmatize them.

    Tokens are split on whitespace. Any token containing a non-letter (digits,
    attached punctuation, hyphens) is dropped entirely.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    alphabetic = (token for token in text.lower().split() if token.isalpha())
    return " ".join(lemmatizer.lemmatize(token) for token in alphabetic)

def load_documents_from_folder(folder_path, filenames):
    """Load and preprocess the .txt counterpart of each name in ``filenames``.

    Each name has ".pdf" replaced by ".txt" and is looked up in
    ``folder_path``. The result always has one entry per input name, in the
    same order. A file that is missing or empty after stripping yields "".

    Args:
        folder_path: Directory containing the extracted .txt files.
        filenames: Iterable of file names (typically ending in ".pdf").

    Returns:
        List of preprocessed document strings aligned with ``filenames``.
    """
    docs = []
    for name in filenames:
        txt_path = os.path.join(folder_path, name.replace(".pdf", ".txt"))
        content = ""  # placeholder that keeps positions aligned with the input
        if os.path.exists(txt_path):
            with open(txt_path, "r", encoding="utf-8", errors="replace") as handle:
                stripped = handle.read().strip()
            if stripped:
                content = preprocess(stripped)
        docs.append(content)
    return docs

In [446]:
# Read the dataset, derive each document's file name from its URL, and load the
# matching preprocessed texts (one entry per row, "" when the text is missing).
raw_df = pd.read_csv(csv_path).copy()
raw_df['filename'] = raw_df['URL'].apply(os.path.basename)
docs = load_documents_from_folder(
    r"C:\Users\joeva\Downloads\ai_gov\data\processed",
    raw_df['filename'].tolist(),
)

In [447]:
import os
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer

def chunk_text(text, max_words=100):
    """Split ``text`` into consecutive chunks of at most ``max_words`` words.

    Words are whitespace-separated. Each chunk is re-joined with single
    spaces, and the final chunk may be shorter. Empty text gives an empty list.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_words):
        pieces.append(" ".join(tokens[start:start + max_words]))
    return pieces

def remove_stopwords(text):
    """Drop English stopwords from ``text``.

    Tokens are whitespace-separated and compared case-insensitively against
    NLTK's English stopword list. Surviving tokens keep their original casing
    and are joined by single spaces.
    """
    english_stops = set(stopwords.words('english'))
    survivors = (token for token in text.split() if token.lower() not in english_stops)
    return " ".join(survivors)

def get_sentence_transformer_embeddings(texts, model, batch_size=32):
    """Encode ``texts`` with ``model`` in consecutive batches.

    Args:
        texts: Sequence of strings to embed.
        model: Object exposing ``encode(batch, show_progress_bar=...,
            convert_to_numpy=...)``.
        batch_size: Number of texts passed to each ``encode`` call.

    Returns:
        A flat list with one embedding per input text, in input order.
    """
    return [
        vector
        for start in range(0, len(texts), batch_size)
        for vector in model.encode(
            texts[start:start + batch_size],
            show_progress_bar=False,
            convert_to_numpy=True,
        )
    ]

def get_avg_embedding_for_doc(text, model, max_words=100, emb_size=384):
    """Return one embedding for a document: the mean of its chunk embeddings.

    Stopwords are removed and the remaining text is split into chunks of at
    most ``max_words`` words. Each chunk is embedded with ``model`` and the
    chunk vectors are averaged element-wise.

    Args:
        text: Document text.
        model: Embedding model forwarded to the batch encoder.
        max_words: Maximum number of words per chunk.
        emb_size: Length of the zero vector returned when there is nothing to
            embed.

    Returns:
        The averaged embedding, or ``np.zeros(emb_size)`` when the document
        yields no chunks or no embeddings.
    """
    chunks = chunk_text(remove_stopwords(text), max_words=max_words)
    if not chunks:
        return np.zeros(emb_size)

    chunk_embeddings = get_sentence_transformer_embeddings(chunks, model)
    if not chunk_embeddings:
        return np.zeros(emb_size)
    return np.mean(chunk_embeddings, axis=0)

# Build one averaged sentence-transformer embedding per document and keep the
# result in the module-level `embeddings` array.
if __name__ == "__main__":
    import nltk
    # Fetch the stopword corpus that remove_stopwords() reads.
    nltk.download('stopwords')

    csv_path = r"C:\Users\joeva\Downloads\ai_gov\FINAL_dataset.csv"
    folder = r"C:\Users\joeva\Downloads\ai_gov\data\processed"

    df = pd.read_csv(csv_path)
    # NOTE(review): dropna() removes rows with a missing URL, so `docs` and
    # `embeddings` only line up position-by-position with `df` when no URL is
    # missing — confirm this before slicing embeddings by row position.
    filenames = [os.path.basename(url) for url in df["URL"].dropna()]
    docs = load_documents_from_folder(folder, filenames)

    print(f"Loaded and preprocessed {len(docs)} documents")

    model = SentenceTransformer('all-MiniLM-L6-v2')  # sentence-embedding model
    # Ask the model for its vector length so empty documents get a zero vector
    # of matching size.
    emb_size = model.get_sentence_embedding_dimension()

    batch_size = 10
    embeddings = []

    # The outer loop only groups documents so tqdm reports progress per slice
    # of `batch_size` documents; each document is still embedded on its own.
    for i in tqdm(range(0, len(docs), batch_size), desc="Embedding batches", unit="batch"):
        batch_docs = docs[i:i+batch_size]
        for doc in batch_docs:
            avg_emb = get_avg_embedding_for_doc(doc, model=model, emb_size=emb_size)
            embeddings.append(avg_emb)

    # Stack the per-document vectors into a single 2-D array.
    embeddings = np.array(embeddings)
    print(f"Generated embeddings shape: {embeddings.shape}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joeva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loaded and preprocessed 1207 documents


Embedding batches: 100%|██████████| 121/121 [18:40<00:00,  9.26s/batch]

Generated embeddings shape: (1207, 384)





In [448]:
# Cache the embeddings on disk; the captured run above took about 19 minutes to compute them.
np.save("embeddings_FINAL.npy", embeddings)

In [None]:
# Reload the cached embeddings so later cells can run without recomputing them.
embeddings = np.load("embeddings_FINAL.npy")

In [459]:
def parse_themes(theme_str):
    """Turn a comma-separated theme string into a list of trimmed theme names.

    Returns an empty list when the value is missing (NaN/None) or when any of
    the listed themes is exactly 'Unclassified'.
    """
    if pd.isna(theme_str):
        return []
    themes = [part.strip() for part in theme_str.split(',')]
    return [] if 'Unclassified' in themes else themes

# Parse each row's 'Focus Area' string into a list of theme names.
# NOTE(review): `hand_df` is not defined in any cell visible here; it must be
# created earlier for this cell to run on a fresh kernel — confirm.
hand_df['theme_list'] = hand_df['Focus Area'].apply(parse_themes)

In [460]:
from sklearn.preprocessing import MultiLabelBinarizer

# Target classes for the multi-label model; this order also fixes the column
# order of the binarized label matrix built with MultiLabelBinarizer(classes=THEMES).
THEMES = [
    'Bias and Fairness', 
    'Reliability and Performance', 
    'Privacy and Security', 
    'Transparency and Explainability', 
    'Ethics and Consent', 
    'Safety and Risk Management', 
    'Governance Committees',
]

# Maps label spellings found in the data to their canonical theme name.
# Includes the misspelling 'Transparency and Explanability' and the word-order
# variant 'Safety Risk and Management'; lookups are exact-match (case-sensitive).
canonical_labels = {
    'Transparency and Explainability': 'Transparency and Explainability',
    'Transparency and Explanability': 'Transparency and Explainability',
    'Privacy and Security': 'Privacy and Security',
    'Safety and Risk Management': 'Safety and Risk Management',
    'Safety Risk and Management': 'Safety and Risk Management',
    'Bias and Fairness': 'Bias and Fairness',
    'Ethics and Consent': 'Ethics and Consent',
    'Governance Committees': 'Governance Committees',
    'Reliability and Performance': 'Reliability and Performance',
}

def normalize_labels(label_list):
    """Clean a list of raw labels into unique canonical label names.

    Each label is stripped. Blank entries and any casing of 'unclassified' are
    dropped. The remaining labels are mapped through ``canonical_labels``, and
    labels without a mapping pass through unchanged.

    Returns:
        A list of the distinct cleaned labels (order not guaranteed), or
        ``['Unclassified']`` when nothing remains.
    """
    stripped = (raw.strip() for raw in label_list)
    cleaned = {
        canonical_labels.get(name, name)
        for name in stripped
        if name and name.lower() != 'unclassified'
    }
    return list(cleaned) if cleaned else ['Unclassified']

# Split each 'Focus Area' cell on commas and normalize the pieces; missing
# cells become '' first, which normalize_labels turns into ['Unclassified'].
labels = hand_df['Focus Area'].fillna('').apply(lambda x: normalize_labels(x.split(',')))

# Binarize against the fixed THEMES list: y has one column per theme.
# NOTE(review): 'Unclassified' (and any unmapped label) is not in THEMES, so it
# is presumably ignored with a scikit-learn warning, leaving those rows all
# zeros — confirm.
mlb = MultiLabelBinarizer(classes=THEMES)
y = mlb.fit_transform(labels)



In [484]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
import numpy as np

X = embeddings

# Positional split boundaries: rows [0, TRAIN_END) train the model and rows
# [TRAIN_END, TEST_END) are held out for evaluation.
# NOTE(review): X is sliced by position alongside y (built from hand_df); this
# assumes the first rows of `embeddings` correspond to hand_df's rows — confirm.
TRAIN_END = 55
TEST_END = 73

X_train = X[:TRAIN_END]
y_train = y[:TRAIN_END]
X_test = X[TRAIN_END:TEST_END]
y_test = y[TRAIN_END:TEST_END]
df_test = hand_df.iloc[TRAIN_END:TEST_END]

# Hyper-parameter grid for the per-label logistic regressions; the
# 'estimator__' prefix addresses the estimator wrapped by OneVsRestClassifier.
param_grid = {
    'estimator__C': [0.1, 1.0, 10.0],
    'estimator__penalty': ['l2'],
    'estimator__solver': ['lbfgs']
}

# One balanced-class-weight logistic regression per theme.
base_lr = LogisticRegression(max_iter=1000, class_weight='balanced')
clf = OneVsRestClassifier(base_lr)

grid = GridSearchCV(clf, param_grid, cv=3, scoring='f1_micro', verbose=1)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
best_clf = grid.best_estimator_

# A label is assigned when its predicted probability reaches the threshold.
y_proba = best_clf.predict_proba(X_test)
threshold = 0.35
y_pred = (y_proba >= threshold).astype(int)

predicted_labels = mlb.inverse_transform(y_pred)
# Write predictions back using the held-out rows' own index labels. The earlier
# `.loc[55:73]` was a label-based, end-inclusive slice (one row more than the
# positional `[55:73]` test slice) and only matched on a default RangeIndex.
hand_df.loc[df_test.index, 'predicted_labels'] = pd.Series(predicted_labels, index=df_test.index)
hand_df.to_csv("updated_with_predictions.csv", index=False)

print(classification_report(y_test, y_pred, target_names=mlb.classes_, zero_division=0))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best parameters: {'estimator__C': 10.0, 'estimator__penalty': 'l2', 'estimator__solver': 'lbfgs'}
                                 precision    recall  f1-score   support

              Bias and Fairness       0.80      0.89      0.84         9
    Reliability and Performance       0.69      1.00      0.81        11
           Privacy and Security       0.56      1.00      0.71        10
Transparency and Explainability       0.88      0.70      0.78        10
             Ethics and Consent       0.56      1.00      0.72         9
     Safety and Risk Management       0.61      1.00      0.76        11
          Governance Committees       0.38      0.86      0.52         7

                      micro avg       0.61      0.93      0.73        67
                      macro avg       0.64      0.92      0.74        67
                   weighted avg       0.65      0.93      0.75        67
                    samples avg       