#### Training the model

Set paths and labels

In [17]:
import os

# Folder containing PDFs
PDF_FOLDER = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/focus_area_classification/test_pdf_folder"
LABEL_CSV = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/focus_area_classification/labeled_data.csv"
TEXT_FOLDER = "/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/focus_area_classification/clean_texts"

# Ensure output text folder exists
os.makedirs(TEXT_FOLDER, exist_ok=True)

# Your 5 AI governance focus areas (labels)
LABELS = [
    "bias_and_fairness",
    "reliability_and_monitoring",
    "privacy_and_security",
    "transparency_and_explainability",
    "responsible_implementation"
]

Extract text from PDFs

In [23]:
import os
import fitz  # PyMuPDF
import pandas as pd
import unicodedata
import re

# Step 1: Extract and clean text from PDFs, save as .txt
def clean_text(text):
    # Normalize Unicode to remove ambiguous characters
    text = unicodedata.normalize('NFKC', text)
    # Remove pipes and carriage returns/line feeds
    text = re.sub(r'[|]', '', text)
    text = re.sub(r'[\r\n]+', ' ', text)
    return text.strip()

def extract_and_save_clean_txts(pdf_folder, text_folder):
    for filename in os.listdir(pdf_folder):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            doc = fitz.open(pdf_path)
            raw_text = "\n".join(page.get_text() for page in doc)
            cleaned = clean_text(raw_text)

            base_name = os.path.splitext(filename)[0]
            txt_path = os.path.join(text_folder, base_name + ".txt")

            with open(txt_path, "w", encoding="utf-8") as f:
                f.write(cleaned)

# Step 2: Generate labeled CSV using .txt files
def create_labeled_csv_from_txts(text_folder, label_csv, labels):
    data = []
    for filename in os.listdir(text_folder):
        if filename.endswith(".txt"):
            txt_path = os.path.join(text_folder, filename)
            with open(txt_path, "r", encoding="utf-8") as f:
                text = f.read()
            entry = {"filename": filename, "text": text}
            for label in labels:
                entry[label] = 0
            data.append(entry)
    df = pd.DataFrame(data)
    df.to_csv(label_csv, index=False, encoding="utf-8", sep='|')  # Use pipe as delimiter
    print(f"Labeled CSV saved to {label_csv} with '|' as the delimiter.")

# ---- Run the full workflow ---- #
LABELS = [
    "bias_and_fairness",
    "reliability_and_monitoring",
    "privacy_and_security",
    "transparency_and_explainability",
    "responsible_implementation"
]

if not os.path.exists(LABEL_CSV):
    extract_and_save_clean_txts(PDF_FOLDER, TEXT_FOLDER)
    create_labeled_csv_from_txts(TEXT_FOLDER, LABEL_CSV, LABELS)
else:
    print(f"{LABEL_CSV} already exists. Proceeding without changes.")


/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/focus_area_classification/labeled_data.csv already exists. Proceeding without changes.


Load and prepare the labeled data

In [None]:
import csv
import sys
import pandas as pd

# Increase allowable field size
csv.field_size_limit(sys.maxsize)

# Load pipe-delimited CSV with large text fields
df = pd.read_csv("labeled_data.csv", sep='|', encoding='utf-8', engine='python')

# Fill NaNs in text column
df['text'] = df['text'].fillna("")

# Separate features and multi-label targets
X_text = df['text']
Y = df[LABELS].values

### somewhere in here, add code to automate tagging of 1s and 0s (one-hot encoding?)

Vectorize using TF-IDF

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(X_text)


Split into training and testing groups

In [38]:
from sklearn.model_selection import train_test_split

# Split both features and labels
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

View class balance/distribution

In [39]:
for i, label in enumerate(LABELS):
    counts = pd.Series(Y_train[:, i]).value_counts()
    print(f"Label '{label}' class distribution:\n{counts}\n")


Label 'bias_and_fairness' class distribution:
1    3
0    2
dtype: int64

Label 'reliability_and_monitoring' class distribution:
1    4
0    1
dtype: int64

Label 'privacy_and_security' class distribution:
1    3
0    2
dtype: int64

Label 'transparency_and_explainability' class distribution:
1    4
0    1
dtype: int64

Label 'responsible_implementation' class distribution:
1    4
0    1
dtype: int64



Train XGBoost classifers

In [40]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
import numpy as np

# Split data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

models = []
Y_pred = []

for i, label in enumerate(LABELS):
    model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train, Y_train[:, i])
    preds = model.predict(X_test)
    Y_pred.append(preds)
    models.append(model)

Y_pred = np.array(Y_pred).T  # Shape: (samples, labels)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluate the model

In [41]:
from sklearn.metrics import classification_report

print("Classification Report:\n")
print(classification_report(Y_test, Y_pred, target_names=LABELS))

Classification Report:

                                 precision    recall  f1-score   support

              bias_and_fairness       1.00      1.00      1.00         2
     reliability_and_monitoring       0.50      1.00      0.67         1
           privacy_and_security       0.00      0.00      0.00         0
transparency_and_explainability       0.50      1.00      0.67         1
     responsible_implementation       1.00      1.00      1.00         2

                      micro avg       0.60      1.00      0.75         6
                      macro avg       0.60      0.80      0.67         6
                   weighted avg       0.83      1.00      0.89         6
                    samples avg       0.60      1.00      0.75         6



  _warn_prf(average, modifier, msg_start, len(result))


#### Using the trained model to predict focus area classification for unlabeled documents

Clean the unlabeled documents

In [42]:
import os
import fitz  # PyMuPDF
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Load your saved TF-IDF vectorizer and trained models
# (Assumes you've stored them in variables `vectorizer` and `models`)
# If saved to disk, you'd need to use `joblib.load(...)`

# 2. Define the same cleaning function
def clean_text(text):
    import unicodedata
    import re
    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r'[|]', '', text)
    text = re.sub(r'[\r\n]+', ' ', text)
    return text.strip()


Extract and clean text from unlabeled documents

In [None]:
# 3. Extract and clean text from new PDFs
def extract_texts_from_folder(folder_path):
    docs = []
    filenames = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            path = os.path.join(folder_path, filename)
            doc = fitz.open(path)
            raw_text = "\n".join(page.get_text() for page in doc)
            cleaned = clean_text(raw_text)
            docs.append(cleaned)
            filenames.append(filename)
    return filenames, docs

unlabeled_folder = "unlabeled_pdfs"
filenames, texts = extract_texts_from_folder(unlabeled_folder)



Vectorize the text from unlabeled documents

In [None]:
# 4. Vectorize new text using the *trained* vectorizer
X_unlabeled = vectorizer.transform(texts)


Predict using the trained models

In [None]:
# 5. Predict using your trained models (one per label)
predictions = []
for model in models:
    preds = model.predict(X_unlabeled)
    predictions.append(preds)



Display predictions

In [46]:
# 6. Combine into a labeled DataFrame
LABELS = [
    "bias_and_fairness",
    "reliability_and_monitoring",
    "privacy_and_security",
    "transparency_and_explainability",
    "responsible_implementation"
]

import numpy as np
predictions = np.array(predictions).T  # shape: (n_samples, n_labels)

df_preds = pd.DataFrame(predictions, columns=LABELS)
df_preds.insert(0, 'filename', filenames)
df_preds.insert(1, 'text', texts)

# 7. Save to CSV for review
df_preds.to_csv("predicted_labels.csv", index=False, encoding='utf-8', sep='|')
print("Predictions saved to 'predicted_labels.csv'")

Predictions saved to 'predicted_labels.csv'
