In [3]:
import concurrent.futures
import os
import re
import time

import fitz
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import requests
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm

# Location of the filtered CSV; later cells pass this name to batch_download and pd.read_csv.
# NOTE(review): absolute, machine-specific Windows path — adjust before running on another machine.
csv_path = r"C:\Users\joeva\Downloads\ai_gov\ai_filtered.csv"

In [None]:
def download_pdf(row, save_dir):
    """Download the PDF referenced by ``row['PDF Link']`` into ``save_dir``.

    Parameters
    ----------
    row : mapping (for example a pandas Series)
        Must provide a ``'PDF Link'`` entry holding the URL of the PDF.
    save_dir : str
        Directory that receives the file. The file is named after the last
        path component of the URL (``os.path.basename(url)``).

    Returns
    -------
    str
        A status line starting with ``Already exists``, ``Downloaded``,
        ``Failed (<status code>)``, ``Skipped`` or ``Error``. Bad rows and
        failed requests are reported through this string rather than raised,
        so one bad link cannot abort a batch.
    """
    url = row['PDF Link']

    # Empty CSV cells are read by pandas as NaN (a float); os.path.basename
    # would raise TypeError on that, so report the row and move on.
    if not isinstance(url, str) or not url.strip():
        return f"Skipped (no PDF link): {url!r}"

    # A URL ending in "/" has an empty basename; joining it would yield the
    # directory itself, which always "exists" and would be misreported.
    pdf_name = os.path.basename(url)
    if not pdf_name:
        return f"Skipped (no file name in URL): {url}"

    filename = os.path.join(save_dir, pdf_name)
    if os.path.exists(filename):
        return f"Already exists: {filename}"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept": "application/pdf",
        "Referer": "https://www.hhs.gov/"
    }

    try:
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            return f"Failed ({response.status_code}): {url}"
        try:
            with open(filename, 'wb') as f:
                f.write(response.content)
        except OSError:
            # Do not leave a truncated file behind: the "Already exists"
            # shortcut above would treat it as a finished download.
            if os.path.exists(filename):
                os.remove(filename)
            raise
        return f"Downloaded: {filename}"
    except Exception as e:
        # Deliberately broad: this is a best-effort batch job and every
        # failure is surfaced in the returned status line.
        return f"Error: {url} -> {e}"

def batch_download(csv_path, save_dir, max_threads=10):
    """Download the PDF of every row in a CSV using a thread pool.

    Parameters
    ----------
    csv_path : str
        CSV file read with ``pd.read_csv``; each row is handed to
        ``download_pdf``.
    save_dir : str
        Target directory; created (including parents) when missing.
    max_threads : int, default 10
        Number of worker threads in the pool.

    Returns
    -------
    None
        One status line per row is printed as the downloads complete
        (completion order, not CSV order).
    """
    os.makedirs(save_dir, exist_ok=True)
    df = pd.read_csv(csv_path)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures = [executor.submit(download_pdf, row, save_dir) for _, row in df.iterrows()]
        for future in concurrent.futures.as_completed(futures):
            try:
                print(future.result())
            except Exception as exc:
                # result() re-raises whatever the worker raised; report it so
                # a single bad row does not hide the outcome of the others.
                print(f"Error: download worker raised {exc!r}")

# Cell driver: download the PDFs listed in csv_path (assigned in the first cell).
# NOTE(review): in a notebook kernel __name__ is normally "__main__", so this guard is expected to pass — confirm.
if __name__ == "__main__":
    # Destination folder for the raw PDFs; batch_download creates it when missing.
    save_dir = r"C:\Users\joeva\Downloads\ai_gov\data\raw"
    # Uses 15 worker threads instead of the function default of 10.
    batch_download(csv_path, save_dir, max_threads=15)

In [None]:
def extract_text_from_pdf(path):
    """Return the text of every page of the PDF at ``path``.

    The per-page text returned by ``page.get_text()`` is joined with a single
    space. The document is opened as a context manager so its file handle is
    released even when text extraction raises.

    Parameters
    ----------
    path : str
        Path of the PDF file, passed to ``fitz.open``.

    Returns
    -------
    str
        Concatenated page text (empty string for a document without pages).
    """
    with fitz.open(path) as doc:
        return " ".join(page.get_text() for page in doc)

def clean_text(text):
    """Normalize whitespace in ``text``.

    Every run of whitespace characters is replaced by one space and the
    result is stripped of leading and trailing whitespace.
    """
    return re.sub(r'\s+', ' ', text).strip()

def process_pdfs_from_csv(csv_path, input_dir, output_dir):
    """Extract and clean the text of every PDF listed in a CSV.

    For each row the PDF file name is the basename of ``row["PDF Link"]``.
    The PDF is looked up in ``input_dir`` and its cleaned text is written to
    ``output_dir`` under the same name with ``".pdf"`` replaced by ``".txt"``.
    Rows whose output already exists, whose PDF is missing, or whose link is
    empty are reported with ``print`` and skipped.

    Parameters
    ----------
    csv_path : str
        CSV file containing a ``"PDF Link"`` column.
    input_dir : str
        Folder holding the downloaded PDFs.
    output_dir : str
        Folder receiving the ``.txt`` files; created when missing.

    Returns
    -------
    None
    """
    os.makedirs(output_dir, exist_ok=True)
    df = pd.read_csv(csv_path)

    for _, row in df.iterrows():
        link = row["PDF Link"]
        # Empty CSV cells are NaN floats; os.path.basename would raise
        # TypeError on them and abort the whole run.
        if not isinstance(link, str) or not os.path.basename(link):
            print(f"Skipping row without a usable PDF link: {link!r}")
            continue

        pdf_name = os.path.basename(link)
        pdf_path = os.path.join(input_dir, pdf_name)
        # Same ".pdf" -> ".txt" mapping that load_documents_from_folder uses
        # to find these files again.
        out_path = os.path.join(output_dir, pdf_name.replace(".pdf", ".txt"))

        if os.path.exists(out_path):
            print(f"Already processed: {out_path}")
            continue

        if not os.path.exists(pdf_path):
            print(f"Missing PDF file: {pdf_name}")
            continue

        try:
            raw_text = extract_text_from_pdf(pdf_path)
            cleaned = clean_text(raw_text)
            with open(out_path, "w", encoding="utf-8", errors="replace") as f:
                f.write(cleaned)
            print(f"Processed: {pdf_name}")
        except Exception as e:
            # out_path did not exist before this attempt, so anything there
            # now is a partial write; remove it so the next run retries
            # instead of reporting "Already processed".
            try:
                if os.path.exists(out_path):
                    os.remove(out_path)
            except OSError:
                pass
            print(f"Failed to process {pdf_name}: {e}")

# Cell driver: convert the downloaded PDFs into cleaned .txt files.
# Arguments, in order: CSV with the "PDF Link" column, folder of raw PDFs, folder for the text output.
# NOTE(review): the CSV path repeats the literal assigned to csv_path in the first cell.
if __name__ == "__main__":
    process_pdfs_from_csv(r"C:\Users\joeva\Downloads\ai_gov\ai_filtered.csv", r"C:\Users\joeva\Downloads\ai_gov\data\raw", r"C:\Users\joeva\Downloads\ai_gov\data\processed")

In [None]:
def load_documents_from_folder(folder_path, filenames):
    """Load one preprocessed document per entry of ``filenames``.

    Each name has ``".pdf"`` replaced by ``".txt"`` and is looked up inside
    ``folder_path``. The result has exactly one item per input name, in the
    same order: the output of ``preprocess`` on the stripped file content, or
    ``""`` when the file is missing or holds only whitespace.

    NOTE(review): ``preprocess`` is not defined in the visible cells; it must
    be provided by another cell before this function is called.
    """
    loaded = []
    for name in filenames:
        candidate = os.path.join(folder_path, name.replace(".pdf", ".txt"))
        content = ""
        if os.path.exists(candidate):
            with open(candidate, "r", encoding="utf-8", errors="replace") as handle:
                content = handle.read().strip()
        # Blank and missing files keep their slot as an empty string.
        loaded.append(preprocess(content) if content else "")
    return loaded

# Read the CSV and derive the file name of each document from its 'URL' column.
# NOTE(review): the download/extraction cells name files after 'PDF Link', not 'URL' —
# confirm both columns end in the same basename, otherwise no text file will be found.
raw_df = pd.read_csv(csv_path).copy()
raw_df['filename'] = raw_df['URL'].map(os.path.basename)
docs = load_documents_from_folder(
    r"C:\Users\joeva\Downloads\ai_gov\data\processed",
    list(raw_df['filename']),
)

In [None]:
def chunk_text(text, max_words=100):
    """Split ``text`` into consecutive chunks of at most ``max_words`` words.

    Words are the whitespace-separated tokens of ``text``; each chunk is those
    words re-joined with single spaces. Returns a list of strings (empty when
    ``text`` contains no words).
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_words):
        pieces.append(" ".join(tokens[start:start + max_words]))
    return pieces

def remove_stopwords(text):
    """Drop English stopwords from ``text``.

    The text is split on whitespace; a word is removed when its lowercase
    form is in NLTK's English stopword list. Surviving words keep their
    original casing and are joined with single spaces.
    """
    blocked = set(stopwords.words('english'))
    return " ".join(token for token in text.split() if token.lower() not in blocked)

def get_sentence_transformer_embeddings(texts, model, batch_size=32):
    """Encode ``texts`` with ``model`` in slices of ``batch_size``.

    ``model.encode`` is called once per slice with the progress bar disabled
    and numpy output requested. Returns a flat list with one item per input
    text, in input order.
    """
    return [
        vector
        for start in range(0, len(texts), batch_size)
        for vector in model.encode(
            texts[start:start + batch_size],
            show_progress_bar=False,
            convert_to_numpy=True,
        )
    ]

def get_avg_embedding_for_doc(text, model, max_words=100, emb_size=384):
    """Return one embedding vector for a whole document.

    Stopwords are removed, the remainder is split into chunks of at most
    ``max_words`` words, each chunk is embedded with ``model`` and the chunk
    vectors are averaged element-wise. When there is nothing to embed, a
    zero vector of length ``emb_size`` is returned instead.
    """
    pieces = chunk_text(remove_stopwords(text), max_words=max_words)
    if pieces:
        vectors = get_sentence_transformer_embeddings(pieces, model)
        if vectors:
            return np.mean(vectors, axis=0)
    # No chunks (or no vectors): fall back to an all-zero placeholder.
    return np.zeros(emb_size)

# Cell driver: build one averaged sentence-transformer embedding per document.
# NOTE(review): in a notebook kernel __name__ is normally "__main__", so this guard is expected to pass — confirm.
if __name__ == "__main__":
    # nltk is already imported in the first cell; the repeated import is harmless.
    import nltk
    # Fetch the stopword corpus that remove_stopwords reads.
    nltk.download('stopwords')

    # Re-assigns csv_path to the same literal used in the first cell.
    csv_path = r"C:\Users\joeva\Downloads\ai_gov\ai_filtered.csv"
    folder = r"C:\Users\joeva\Downloads\ai_gov\data\processed"

    df = pd.read_csv(csv_path)
    # NOTE(review): dropna() removes rows without a URL, so docs/embeddings can end up
    # shorter than the CSV; later cells index `embeddings` with a boolean mask built
    # from hand_df — confirm the row counts and order still line up.
    # NOTE(review): files were named after 'PDF Link' when downloaded; confirm 'URL'
    # yields the same basenames.
    filenames = [os.path.basename(url) for url in df["URL"].dropna()]
    # Overwrites the `docs` list produced by the previous cell.
    docs = load_documents_from_folder(folder, filenames)

    print(f"Loaded and preprocessed {len(docs)} documents")

    model = SentenceTransformer('all-MiniLM-L6-v2')  
    # Ask the model for its vector size instead of relying on the function default of 384.
    emb_size = model.get_sentence_embedding_dimension()

    batch_size = 10
    embeddings = []

    # The outer loop only advances the progress bar in steps of batch_size;
    # every document is still embedded individually in the inner loop.
    for i in tqdm(range(0, len(docs), batch_size), desc="Embedding batches", unit="batch"):
        batch_docs = docs[i:i+batch_size]
        for doc in batch_docs:
            avg_emb = get_avg_embedding_for_doc(doc, model=model, emb_size=emb_size)
            embeddings.append(avg_emb)

    # Stack the per-document vectors into one array (one row per document, in docs order).
    embeddings = np.array(embeddings)
    print(f"Generated embeddings shape: {embeddings.shape}")

In [None]:
# NOTE(review): hand_df is not created in any visible cell; it must come from a cell that is not shown here.
# A row counts as labeled when 'Focus Area' is non-null and not blank after stripping whitespace.
hand_df['is_labeled'] = hand_df['Focus Area'].notna() & (hand_df['Focus Area'].str.strip() != '')

# 1 for rows that are labeled but whose 'theme_list' is falsy (e.g. empty), 0 for every other row.
# NOTE(review): 'theme_list' is also produced outside the visible cells — confirm what it holds.
hand_df['is_unclassified'] = hand_df.apply(
    lambda row: 1 if row['is_labeled'] and not row['theme_list'] else 0,
    axis=1
)

In [None]:
# Train a logistic-regression filter on the hand-labeled rows and apply it to the unlabeled ones.
# NOTE(review): X is not used again in this cell.
X = embeddings 

# Boolean selection of the labeled rows.
# NOTE(review): assumes row i of `embeddings` belongs to row i of hand_df and that both
# have the same length — confirm, since the two are built from different inputs.
labeled_mask = hand_df['is_labeled'].values
X_labeled = embeddings[labeled_mask]
df_labeled = hand_df[labeled_mask].copy()
y_labeled = df_labeled['is_unclassified'].values

# Only the first 246 labeled rows are used for fitting.
# NOTE(review): 246 is an unexplained constant; labeled rows beyond it are not used in this cell.
X_train = X_labeled[:246]
y_train = y_labeled[:246]

# Everything that is not labeled is the prediction target.
unlabeled_mask = ~hand_df['is_labeled'].values
X_unlabeled = embeddings[unlabeled_mask]
df_unlabeled = hand_df[unlabeled_mask].copy()

# Search over the regularization strength C only; penalty and solver are fixed.
param_grid = {
    'C': [0.1, 1.0, 10.0],
    'penalty': ['l2'],
    'solver': ['lbfgs']
}
lr = LogisticRegression(max_iter=1000, class_weight='balanced')
# 3-fold cross-validation, selecting the candidate with the best F1 score.
grid = GridSearchCV(lr, param_grid, cv=3, scoring='f1', verbose=1)
grid.fit(X_train, y_train)
best_clf = grid.best_estimator_

# Second probability column; presumably P(is_unclassified == 1) given 0/1 labels — confirm via best_clf.classes_.
y_proba_unlabeled = best_clf.predict_proba(X_unlabeled)[:, 1]
threshold = 0.5
y_pred_unlabeled = (y_proba_unlabeled >= threshold).astype(int)

df_unlabeled['predicted_is_unclassified'] = y_pred_unlabeled
df_unlabeled['predicted_label'] = ['Unclassified' if pred == 1 else 'Classified' for pred in y_pred_unlabeled]

# Copy the predictions back onto hand_df by index label; only the unlabeled rows are assigned.
hand_df.loc[df_unlabeled.index, 'predicted_is_unclassified'] = df_unlabeled['predicted_is_unclassified']
hand_df.loc[df_unlabeled.index, 'predicted_label'] = df_unlabeled['predicted_label']

# Written to the current working directory; the next cell reads this file back.
hand_df.to_csv("updated_with_predictions.csv", index=False)

In [None]:
# Reload the frame that the prediction cell saved.
df = pd.read_csv("updated_with_predictions.csv")

# Hand-labeled rows, except those whose Focus Area is literally 'Unclassified'.
labeled_df = df.loc[df['is_labeled'].eq(True) & df['Focus Area'].ne('Unclassified')]

# Unlabeled rows the model did not predict as 'Unclassified' are kept ...
retained_unlabeled_df = df.loc[df['is_labeled'].eq(False) & df['predicted_label'].ne('Unclassified')]

# ... and unlabeled rows predicted 'Unclassified' are set aside.
eliminated_df = df.loc[df['is_labeled'].eq(False) & df['predicted_label'].eq('Unclassified')]

final_df = pd.concat((labeled_df, retained_unlabeled_df), ignore_index=True)

# Bookkeeping columns removed from both exports; names absent from the frame are ignored.
cols_to_drop = [
    'Name', 'Notes/Questions', 'hand_order', 'filename', 'theme_list',
    'is_labeled', 'is_unclassified', 'predicted_is_unclassified', 'predicted_label',
]

final_df_clean = final_df.drop(columns=cols_to_drop, errors='ignore')
eliminated_df_clean = eliminated_df.drop(columns=cols_to_drop, errors='ignore')

# Both files are written to the current working directory.
final_df_clean.to_csv("FINAL_dataset.csv", index=False)
eliminated_df_clean.to_csv("eliminated_docs.csv", index=False)