In [49]:
import fitz  
import os
import re
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import matplotlib.pyplot as plt
import time
import requests
import umap
import hdbscan
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

In [23]:
CSV_PATH = r"C:\Users\joeva\Downloads\ai_gov\Data Collection\Data\ai_governance_categorized_01.csv"             
DOWNLOAD_DIR = r"C:\Users\joeva\Downloads\ai_gov\data\raw"           
BATCH_SIZE = 1000                         
TIMEOUT = 30                           

os.makedirs(DOWNLOAD_DIR, exist_ok=True)

df = pd.read_csv(CSV_PATH)

if "Status" not in df.columns:
    df["Status"] = "pending"
if "Local Filename" not in df.columns:
    df["Local Filename"] = ""

pending_df = df[df["Status"] == "pending"].head(BATCH_SIZE)

print(f"Starting batch download of {len(pending_df)} PDFs...")

for idx, row in pending_df.iterrows():
    url = row["PDF Link"]
    local_filename = os.path.join(DOWNLOAD_DIR, f"doc_{idx}.pdf")
    
    try:
        response = requests.get(url, timeout=TIMEOUT)
        response.raise_for_status()
        
        with open(local_filename, "wb") as f:
            f.write(response.content)

        df.at[idx, "status"] = "done"
        df.at[idx, "local filename"] = local_filename
        print(f"downloaded: {url}")

    except Exception as e:
        df.at[idx, "Status"] = "error"
        print(f"failed to download {url} — {e}")

    time.sleep(1) 

df.to_csv(CSV_PATH, index=False)

Starting batch download of 1000 PDFs...
downloaded: https://www.hhs.texas.gov/sites/default/files/documents/eqro-annual-tech-report-contract-yr-2022.pdf
downloaded: https://portal.ct.gov/-/media/OPM/Fin-General/Policies/CT-Responsible-AI-Policy-Framework-Final-02012024.pdf
failed to download https://www.cga.ct.gov/gl/tfs/20230720_Task%20Force%20to%20study%20A.I.,%20and%20develop%20an%20A.I.%20bill%20of%20rights/20240201/CT%20AI%20Working%20Group%20Report.pdf — HTTPSConnectionPool(host='www.cga.ct.gov', port=443): Max retries exceeded with url: /gl/tfs/20230720_Task%20Force%20to%20study%20A.I.,%20and%20develop%20an%20A.I.%20bill%20of%20rights/20240201/CT%20AI%20Working%20Group%20Report.pdf (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1017)')))
downloaded: https://portal.ct.gov/-/media/opm/fin-cfo/the-state-of-connecticut-information-and-telecommunications-strategic-plan-fy25---

In [31]:
def extract_text_from_pdf(path):
    doc = fitz.open(path)
    return " ".join([page.get_text() for page in doc])

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def process_pdfs(input_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    
    for fname in os.listdir(input_dir):
        if fname.endswith(".pdf"):
            pdf_path = os.path.join(input_dir, fname)
            try:
                raw_text = extract_text_from_pdf(pdf_path)
                cleaned = clean_text(raw_text)
                out_path = os.path.join(output_dir, fname.replace(".pdf", ".txt"))
                with open(out_path, "w", encoding="utf-8", errors="replace") as f:
                    f.write(cleaned)
            except Exception as e:
                print(f"Failed to process {fname}: {e}")

if __name__ == "__main__":
    process_pdfs(r"C:\Users\joeva\Downloads\ai_gov\data\raw", r"C:\Users\joeva\Downloads\ai_gov\data\processed")

Failed to process doc_447.pdf: Failed to open file 'C:\\Users\\joeva\\Downloads\\ai_gov\\data\\raw\\doc_447.pdf'.
Failed to process doc_448.pdf: Failed to open file 'C:\\Users\\joeva\\Downloads\\ai_gov\\data\\raw\\doc_448.pdf'.
Failed to process doc_450.pdf: Failed to open file 'C:\\Users\\joeva\\Downloads\\ai_gov\\data\\raw\\doc_450.pdf'.
Failed to process doc_775.pdf: Failed to open file 'C:\\Users\\joeva\\Downloads\\ai_gov\\data\\raw\\doc_775.pdf'.
Failed to process doc_777.pdf: Failed to open file 'C:\\Users\\joeva\\Downloads\\ai_gov\\data\\raw\\doc_777.pdf'.
Failed to process doc_779.pdf: Failed to open file 'C:\\Users\\joeva\\Downloads\\ai_gov\\data\\raw\\doc_779.pdf'.


In [45]:
def load_documents_from_folder(folder_path):
    docs = []
    filenames = []
    for filename in os.listdir(folder_path):
        if filename.lower().endswith(".txt"):
            full_path = os.path.join(folder_path, filename)
            with open(full_path, "r", encoding="utf-8", errors="replace") as f:
                text = f.read().strip()
                if text:
                    docs.append(text)
                    filenames.append(filename)
    return docs, filenames

def get_top_terms_per_cluster(docs, labels, num_terms=10):
    df = pd.DataFrame({"doc": docs, "cluster": labels})
    cluster_terms = {}
    vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
    for cluster_id in set(labels):
        if cluster_id == -1:  
            continue
        cluster_docs = df[df["cluster"] == cluster_id]["doc"].tolist()
        X = vectorizer.fit_transform(cluster_docs)
        terms = vectorizer.get_feature_names_out()
        top_indices = X.sum(axis=0).A1.argsort()[::-1][:num_terms]
        top_words = [terms[i] for i in top_indices]
        cluster_terms[cluster_id] = top_words
    return cluster_terms

In [53]:
nltk.download('stopwords')
nltk.download('wordnet')

# Custom stopwords
custom_stopwords = set(stopwords.words('english') + ["ai", "use", "data", "state", "including", "systems"])
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    words = text.lower().split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in custom_stopwords and w.isalpha()]
    return " ".join(words)

def load_documents_from_folder(folder_path):
    docs, filenames = [], []
    for filename in os.listdir(folder_path):
        if filename.lower().endswith(".txt"):
            full_path = os.path.join(folder_path, filename)
            with open(full_path, "r", encoding="utf-8", errors="replace") as f:
                text = f.read().strip()
                if text:
                    docs.append(preprocess(text))
                    filenames.append(filename)
    return docs, filenames

def get_top_terms_per_cluster(docs, labels, num_terms=7):
    df = pd.DataFrame({"doc": docs, "cluster": labels})
    cluster_terms = {}
    vectorizer = TfidfVectorizer(max_features=1000)
    for cluster_id in set(labels):
        if cluster_id == -1:
            continue
        cluster_docs = df[df["cluster"] == cluster_id]["doc"].tolist()
        X = vectorizer.fit_transform(cluster_docs)
        terms = vectorizer.get_feature_names_out()
        top_indices = X.sum(axis=0).A1.argsort()[::-1][:num_terms]
        top_words = [terms[i] for i in top_indices]
        cluster_terms[cluster_id] = top_words
    return cluster_terms

def show_sample_documents(docs, filenames, labels):
    df = pd.DataFrame({"Filename": filenames, "Text": docs, "Cluster": labels})
    for cluster_id in sorted(df["Cluster"].unique()):
        if cluster_id == -1:
            continue
        print(f"\nCluster {cluster_id} sample documents:")
        sample = df[df["Cluster"] == cluster_id].sample(n=min(3, len(df[df["Cluster"] == cluster_id])), random_state=42)
        for _, row in sample.iterrows():
            print(f"- {row['Filename']} | {row['Text'][:200]}...")

def compute_cluster_similarities(embeddings, labels):
    df = pd.DataFrame({"label": labels})
    cluster_means = {}
    for label in df["label"].unique():
        if label == -1:
            continue
        cluster_means[label] = np.mean(embeddings[labels == label], axis=0)
    keys = sorted(cluster_means.keys())
    matrix = cosine_similarity([cluster_means[k] for k in keys])
    sim_df = pd.DataFrame(matrix, index=keys, columns=keys)
    print("\nInter-cluster cosine similarity matrix:")
    print(sim_df.round(2))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joeva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joeva\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
if __name__ == "__main__":
    folder = r"C:\Users\joeva\Downloads\ai_gov\data\processed"
    docs, filenames = load_documents_from_folder(folder)

    if not docs:
        print(f"No documents found in {folder}")
        exit(1)

    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(docs, show_progress_bar=True)

    reducer = umap.UMAP(n_neighbors=7, n_components=2, min_dist=0.1, metric='cosine', random_state=42)
    reduced_embeddings = reducer.fit_transform(embeddings)

    clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric='euclidean', prediction_data=True)
    cluster_labels = clusterer.fit_predict(reduced_embeddings)

    cluster_terms = get_top_terms_per_cluster(docs, cluster_labels)
    for cid, words in cluster_terms.items():
        print(f"Cluster {cid}: {', '.join(words)}")

    show_sample_documents(docs, filenames, cluster_labels)

    compute_cluster_similarities(embeddings, cluster_labels)

    df = pd.DataFrame({
        "Filename": filenames,
        "Cluster": cluster_labels
    })
    df.to_csv("huggingface_cluster_assignments.csv", index=False)

    plt.figure(figsize=(10, 6))
    scatter = plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=cluster_labels, cmap='tab10')
    plt.colorbar(scatter)
    plt.title("UMAP Projection of Document Clusters")
    plt.tight_layout()
    plt.savefig("huggingface_umap_plot.png")
    plt.show()

In [50]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_similarities(embeddings, filenames, threshold=0.95):
    sim_matrix = cosine_similarity(embeddings)
    similar_pairs = []

    for i in range(len(filenames)):
        for j in range(i + 1, len(filenames)):
            if sim_matrix[i, j] > threshold:
                similar_pairs.append((filenames[i], filenames[j], sim_matrix[i, j]))

    return similar_pairs

def get_outlier_documents(docs, filenames, cluster_labels, max_preview_chars=300):
    outliers = []
    for doc, fname, label in zip(docs, filenames, cluster_labels):
        if label == -1:  
            outliers.append({
                "Filename": fname,
                "Length": len(doc),
                "Preview": doc[:max_preview_chars].replace("\n", " ") + ("..." if len(doc) > max_preview_chars else "")
            })
    return outliers

def analyze_cluster_consistency(embeddings, labels, filenames):
    df = pd.DataFrame({'embedding': list(embeddings), 'label': labels, 'filename': filenames})
    consistency_scores = {}

    for cluster_id in set(labels):
        if cluster_id == -1:
            continue
        cluster_embeds = np.array(df[df['label'] == cluster_id]['embedding'].tolist())
        sims = cosine_similarity(cluster_embeds)
        avg_sim = np.mean(sims[np.triu_indices_from(sims, k=1)])
        consistency_scores[cluster_id] = avg_sim

    return consistency_scores

def summarize_clusters(labels):
    cluster_counts = pd.Series(labels).value_counts()
    print("Cluster sizes:")
    print(cluster_counts[cluster_counts.index != -1])
    print(f"Outliers (cluster -1): {cluster_counts.get(-1, 0)}")

In [51]:
similar_pairs = find_similarities(embeddings, filenames, threshold=0.95)
print(f"Found {len(similar_pairs)} duplicate/similar pairs")

consistency_scores = analyze_cluster_consistency(embeddings, cluster_labels, filenames)
print("\nCluster Consistency Scores:")
for cid, score in consistency_scores.items():
    print(f"Cluster {cid}: Avg Cosine Similarity = {score:.3f}")


Found 281 duplicate/similar pairs

Cluster Consistency Scores:
Cluster 0: Avg Cosine Similarity = 0.606
Cluster 1: Avg Cosine Similarity = 0.753
Cluster 2: Avg Cosine Similarity = 0.762
Cluster 3: Avg Cosine Similarity = 0.675
Cluster 4: Avg Cosine Similarity = 0.609
Cluster 5: Avg Cosine Similarity = 0.573
Cluster 6: Avg Cosine Similarity = 0.616
Cluster 7: Avg Cosine Similarity = 0.546
Cluster 8: Avg Cosine Similarity = 0.519
Cluster 9: Avg Cosine Similarity = 0.582
Cluster 10: Avg Cosine Similarity = 0.702
Cluster 11: Avg Cosine Similarity = 0.680
Cluster 12: Avg Cosine Similarity = 0.777
Cluster 13: Avg Cosine Similarity = 0.564
Cluster 14: Avg Cosine Similarity = 0.665
Cluster 15: Avg Cosine Similarity = 0.522
Cluster 16: Avg Cosine Similarity = 0.637
Cluster 17: Avg Cosine Similarity = 0.530
Cluster 18: Avg Cosine Similarity = 0.619
Cluster 19: Avg Cosine Similarity = 0.624
Cluster 20: Avg Cosine Similarity = 0.550
Cluster 21: Avg Cosine Similarity = 0.726
Cluster 22: Avg Cosine 

In [52]:
outliers = get_outlier_documents(docs, filenames, cluster_labels)

print(f"Found {len(outliers)} outlier documents\n")

for i, out in enumerate(outliers, 1):
    print(f"Outlier {i}:")
    print(f"  Filename: {out['Filename']}")
    print(f"  Length: {out['Length']} characters")
    print(f"  Preview: {out['Preview']}\n")

Found 201 outlier documents

Outlier 1:
  Filename: doc_1.txt
  Length: 67885 characters
  Preview: prepared office assistant secretary planning evaluation department health human service norc university chicago september much trustworthy artificial intelligence centered outcome research r e p r september final report office assistant secretary planning evaluation assistant secretary planning eval...

Outlier 2:
  Filename: doc_1001.txt
  Length: 30423 characters
  Preview: standard artificial intelligence health trustworthiness february notice consumer technology association bulletin technical publication designed serve public interest eliminating misunderstanding manufacturer facilitating interchangeability improvement assisting purchaser selecting obtaining minimum ...

Outlier 3:
  Filename: doc_1013.txt
  Length: 4799 characters
  Preview: artificial intelligence bootcamp june microsoft team registration announcement register artificial intelligence bootcamp ninr artificial intell