In [None]:
# !pip install presidio-analyzer presidio-anonymizer "spacy>=3.0.0,<4.0.0" python-docx
# !python -m spacy download en_core_web_lg
import os
import re
import zipfile
import pandas as pd
from tqdm import tqdm
from lxml import etree
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

# Presidio engines, created once at import time and shared by redact_text().
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Entity types requested from analyzer.analyze() in redact_text().
ENTITIES_TO_REDACT = [
    "PERSON", "ORGANIZATION", "EMAIL_ADDRESS", "PHONE_NUMBER",
    "DATE_TIME", "URL", "NRP"
]

# Regexes that pre_clean_orgs() replaces with "[REDACTED_ORG]" (matched
# case-insensitively) before the Presidio pass.
# NOTE(review): entries without \b anchors (e.g. 'orgs', 'vendor') also match
# inside longer words such as "vendors" -- confirm that is intended.
ORG_PATTERNS = [
    r'\binc\b',
    r'\bcorporation\b',
    r'\bllc\b',
    r'\bcompany\b',
    r'tietoevry',
    r'evry usa',
    r'investors bank',
    r'thoughtspot',
    r'old republic general',
    r'orgs',
    r'vendor',
    r'paypal',
    r'quantum health'
]

# Prefix map used by the lxml findall()/xpath() calls on WordprocessingML parts.
NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

# Category labels accepted by parse_category_and_flag(); the comparison there
# is case-insensitive.
VALID_CATEGORIES = {
    "CONFIDENTIALITY OBLIGATIONS",
    "REMEDIES",
    "PRIVACY",
    "LIMITATION OF LIABILITY",
    "NON-COMPETITION",
    "NON-SOLICITATION",
    "INDEMNIFICATION",
    "GOVERNING LAW",
    "SIGNATURES",
}

def read_comments_xml(zipf):
    """Read the review comments stored in an opened .docx archive.

    Args:
        zipf: An open ``zipfile.ZipFile`` for the .docx package.

    Returns:
        A dict mapping each comment's ``w:id`` attribute to its text. The
        text is built from the comment's ``w:t`` runs, each stripped, with
        blank runs dropped and the rest joined by single spaces. An empty
        dict is returned when the archive has no ``*.xml`` member whose name
        contains "comments".
    """
    id_attr = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id"

    xml_parts = [
        name for name in zipf.namelist()
        if name.endswith(".xml") and "comments" in name
    ]
    if not xml_parts:
        return {}

    # Prefer a member whose name ends in "comments.xml"; otherwise fall back
    # to the first candidate found.
    part_name = xml_parts[0]
    for name in xml_parts:
        if name.endswith("comments.xml"):
            part_name = name
            break

    root = etree.fromstring(zipf.read(part_name))
    result = {}
    for comment_el in root.findall(".//w:comment", namespaces=NS):
        pieces = []
        for raw in comment_el.xpath(".//w:t/text()", namespaces=NS):
            trimmed = raw.strip()
            if trimmed:
                pieces.append(trimmed)
        result[comment_el.get(id_attr)] = " ".join(pieces)
    return result

def extract_doc_sentences_and_map(doc_xml):
    """Map comment ids to the text they annotate and to their paragraph text.

    The document is walked once in document order. Open comment ranges are
    tracked across paragraph boundaries, so a range that starts in one
    paragraph and ends in a later one yields all of the text in between
    (previously the open ranges were reset for every paragraph, which
    truncated such ranges at the end of their first paragraph).

    Run texts are concatenated exactly as stored and whitespace is
    normalised at the end. Joining stripped runs with a space, as before,
    inserted spaces inside words that the editor had split across runs.
    Paragraph starts, tabs and breaks are treated as whitespace so words on
    either side are not glued together.

    Args:
        doc_xml: Bytes/str content of ``word/document.xml``.

    Returns:
        A ``(merged, para_texts)`` tuple where

        * ``merged`` maps a comment id to the single-spaced text found
          between its ``commentRangeStart`` and ``commentRangeEnd``
          (``""`` when the range contains no text), and
        * ``para_texts`` maps a comment id to the set of stripped texts of
          every ``w:p`` element that encloses one of its
          ``commentReference`` elements.
    """
    w_ns = NS["w"]
    id_attr = f"{{{w_ns}}}id"
    para_tag = f"{{{w_ns}}}p"

    tree = etree.fromstring(doc_xml)
    map_id_to_text = {}
    map_id_to_para_texts_for_ref = {}
    active_ids = []  # ids of ranges that are currently open, in opening order

    for node in tree.iter():
        if not isinstance(node.tag, str):
            # XML comments / processing instructions have no qualified name.
            continue
        tag = etree.QName(node).localname

        if tag == "commentRangeStart":
            cid = node.get(id_attr)
            if cid and cid not in active_ids:
                active_ids.append(cid)
                map_id_to_text.setdefault(cid, [])
        elif tag == "commentRangeEnd":
            cid = node.get(id_attr)
            if cid and cid in active_ids:
                active_ids.remove(cid)
        elif tag == "commentReference":
            cid = node.get(id_attr)
            if cid:
                # Every enclosing paragraph contributes its text, which keeps
                # the result the same as before for nested paragraphs.
                for para in node.iterancestors(para_tag):
                    para_text = "".join(
                        para.xpath(".//w:t/text()", namespaces=NS)
                    ).strip()
                    map_id_to_para_texts_for_ref.setdefault(cid, set()).add(para_text)
        elif tag == "t":
            if node.text:
                for cid in active_ids:
                    map_id_to_text.setdefault(cid, []).append(node.text)
        elif tag in ("p", "tab", "br", "cr"):
            # Structural whitespace; surplus blanks are collapsed below.
            for cid in active_ids:
                map_id_to_text.setdefault(cid, []).append(" ")

    merged = {
        cid: " ".join("".join(parts).split())
        for cid, parts in map_id_to_text.items()
    }
    return merged, map_id_to_para_texts_for_ref

def parse_category_and_flag(comment_text):
    """Extract a ``(category, flag)`` pair from a reviewer comment.

    Only the last non-blank line of the comment is examined. It must start
    with a category, followed by ``-`` or ``:``, followed by ``Flag`` or
    ``No Flag`` (case-insensitive), and the category must equal one of
    ``VALID_CATEGORIES`` ignoring case.

    Args:
        comment_text: Full text of the comment (may be empty or ``None``).

    Returns:
        ``(category, "Flag" | "No Flag")`` with the category spelled as the
        reviewer wrote it, or ``(None, None)`` when the comment does not
        follow the expected format or names an unknown category.
    """
    if not comment_text:
        return None, None

    stripped_lines = (ln.strip() for ln in comment_text.strip().splitlines())
    non_blank = [ln for ln in stripped_lines if ln]
    candidate = non_blank[-1] if non_blank else comment_text.strip()

    found = re.match(
        r"^(?P<cat>.*?)\s*[-:]\s*(?P<flag>Flag|No\s*Flag)\b",
        candidate,
        re.IGNORECASE,
    )
    if found is None:
        return None, None

    category = found.group("cat").strip()
    known = {valid.lower() for valid in VALID_CATEGORIES}
    if category.lower() not in known:
        return None, None

    flag_word = found.group("flag").strip().lower()
    if flag_word.startswith("flag"):
        return category, "Flag"
    return category, "No Flag"

def pre_clean_orgs(text):
    """Replace every match of the ``ORG_PATTERNS`` regexes in *text*.

    The patterns are applied one after another, case-insensitively, and each
    match is replaced with the literal ``[REDACTED_ORG]``.

    Args:
        text: Text to scrub.

    Returns:
        The text with all pattern matches replaced.
    """
    scrubbed = text
    for org_regex in ORG_PATTERNS:
        matcher = re.compile(org_regex, re.IGNORECASE)
        scrubbed = matcher.sub("[REDACTED_ORG]", scrubbed)
    return scrubbed

def redact_text(text):
    """Redact organisation names and Presidio-detected entities in *text*.

    The text first goes through ``pre_clean_orgs``; the result is analysed
    for ``ENTITIES_TO_REDACT`` (language ``"en"``) and every detected span is
    replaced with ``[REDACTED]``.

    Args:
        text: Text to redact.

    Returns:
        The redacted text, or ``""`` when *text* is empty or whitespace only.
    """
    if text.strip() == "":
        return ""

    scrubbed = pre_clean_orgs(text)
    findings = analyzer.analyze(
        text=scrubbed, language="en", entities=ENTITIES_TO_REDACT
    )
    replace_all = OperatorConfig("replace", {"new_value": "[REDACTED]"})
    outcome = anonymizer.anonymize(
        text=scrubbed,
        analyzer_results=findings,
        operators={"DEFAULT": replace_all},
    )
    return outcome.text

def clean_sentence(text):
    """Normalise whitespace, redact sensitive spans and lowercase the rest.

    Redaction runs on the text in its original casing and lowercasing is
    applied afterwards. Lowercasing first (the previous order) removes the
    capitalisation that named-entity recognition relies on to spot names,
    which weakens PII detection. The output format is unchanged: lowercase
    text with upper-case ``[REDACTED]`` / ``[REDACTED_ORG]`` placeholders.

    Args:
        text: Raw sentence or paragraph text.

    Returns:
        The cleaned text, or ``""`` when *text* is blank.
    """
    text = re.sub(r'\s+', ' ', text.strip())
    redacted = redact_text(text)
    # re.split with a capturing group puts the placeholders at the odd
    # indices; only the pieces between them are lowercased.
    pieces = re.split(r'(\[REDACTED(?:_ORG)?\])', redacted)
    return "".join(
        piece if idx % 2 else piece.lower()
        for idx, piece in enumerate(pieces)
    )

def process_docx_file(path):
    """Turn the category comments of one .docx file into dataset rows.

    Each comment that ``parse_category_and_flag`` accepts produces one row.
    The row's sentence is the text covered by the comment range; when that
    is missing or blank, the first non-blank paragraph text recorded for the
    comment's reference is used instead.

    Args:
        path: Path to the .docx file.

    Returns:
        A list of dicts with the keys ``source_file``, ``original_sentence``,
        ``clean_sentence``, ``clean_paragraph``, ``category`` and ``Is_Flag``.
        The list is empty when the archive has no ``word/document.xml``.
    """
    records = []
    with zipfile.ZipFile(path, "r") as archive:
        comments_by_id = read_comments_xml(archive)
        if "word/document.xml" not in archive.namelist():
            return records

        range_text_by_id, para_texts_by_id = extract_doc_sentences_and_map(
            archive.read("word/document.xml")
        )

        for comment_id, comment_body in comments_by_id.items():
            category, flag = parse_category_and_flag(comment_body)
            if not category:
                continue

            # First non-blank paragraph text recorded for this comment.
            paragraph = ""
            for candidate in para_texts_by_id.get(comment_id, set()):
                if candidate.strip():
                    paragraph = candidate
                    break

            # Prefer the annotated range; fall back to the paragraph.
            sentence = range_text_by_id.get(comment_id, "").strip() or paragraph

            records.append({
                "source_file": os.path.basename(path),
                "original_sentence": sentence,
                "clean_sentence": clean_sentence(sentence),
                "clean_paragraph": clean_sentence(paragraph),
                "category": category,
                "Is_Flag": flag,
            })
    return records

def process_folder(folder):
    """Process every .docx file in *folder* and collect the rows in a DataFrame.

    Files are visited in sorted name order so the row order is reproducible
    (``os.listdir`` returns names in arbitrary order). A file that fails to
    process is reported on stdout and skipped.

    Args:
        folder: Directory containing the annotated .docx files.

    Returns:
        A ``pandas.DataFrame`` with one row per accepted comment. When no
        rows were produced, the frame is empty but still has the expected
        columns, so callers can select ``Is_Flag`` / ``clean_sentence``
        without hitting a ``KeyError``.
    """
    all_rows = []
    for fname in tqdm(sorted(os.listdir(folder)), desc="Processing files"):
        if not fname.lower().endswith(".docx"):
            continue
        fpath = os.path.join(folder, fname)
        try:
            all_rows.extend(process_docx_file(fpath))
        except Exception as e:
            # Best effort: one unreadable document must not abort the batch.
            print(f"Error processing {fname}: {e}")

    if not all_rows:
        return pd.DataFrame(columns=[
            "source_file",
            "original_sentence",
            "clean_sentence",
            "clean_paragraph",
            "category",
            "Is_Flag",
        ])
    return pd.DataFrame(all_rows)

if __name__ == "__main__":
    # Build the dataset from the annotated documents and write it to CSV.
    folder = "Annotated Documents"
    out_csv = "annotated_nda_para.csv"
    df = process_folder(folder)

    # Drop rows that are both "No Flag" and shorter than three words after
    # cleaning.
    is_no_flag = df["Is_Flag"].str.lower() == "no flag"
    is_too_short = df["clean_sentence"].str.split().str.len() < 3
    df = df[~(is_no_flag & is_too_short)]

    df.to_csv(out_csv, index=False)
    print(f"Saved {len(df)} rows to {out_csv}")


Processing files: 100%|██████████| 8/8 [00:04<00:00,  1.77it/s]

Saved 230 rows to annotated_nda_para.csv





In [None]:
import os
import re
import zipfile
import pandas as pd
from tqdm import tqdm
from lxml import etree
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

# Presidio engines, created once at import time and shared by redact_text().
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Entity types requested from analyzer.analyze() in redact_text().
ENTITIES_TO_REDACT = [
    "PERSON", "LOCATION", "ORGANIZATION", "EMAIL_ADDRESS", "PHONE_NUMBER",
    "DATE_TIME", "URL", "NRP"
]

# Regexes that pre_clean_orgs() replaces with "[REDACTED_ORG]" (matched
# case-insensitively) before the Presidio pass.
# NOTE(review): entries without \b anchors (e.g. 'orgs', 'vendor') also match
# inside longer words such as "vendors" -- confirm that is intended.
ORG_PATTERNS = [
    r'\binc\b',
    r'\bcorporation\b',
    r'\bllc\b',
    r'\bcompany\b',
    r'tietoevry',
    r'evry usa',
    r'investors bank',
    r'thoughtspot',
    r'old republic general',
    r'orgs',
    r'vendor',
    r'paypal',
    r'quantum health'
]

# Prefix map used by the lxml findall()/xpath() calls on WordprocessingML parts.
NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

# Category labels accepted by parse_category_and_flag(); the comparison there
# is case-insensitive.
VALID_CATEGORIES = {
    "CONFIDENTIALITY OBLIGATIONS",
    "REMEDIES",
    "PRIVACY",
    "LIMITATION OF LIABILITY",
    "NON-COMPETITION",
    "NON-SOLICITATION",
    "INDEMNIFICATION",
    "GOVERNING LAW",
    "SIGNATURES",
}

def read_comments_xml(zipf):
    """Read the review comments stored in an opened .docx archive.

    Args:
        zipf: An open ``zipfile.ZipFile`` for the .docx package.

    Returns:
        A dict mapping each comment's ``w:id`` attribute to its text. The
        text is built from the comment's ``w:t`` runs, each stripped, with
        blank runs dropped and the rest joined by single spaces. An empty
        dict is returned when the archive has no ``*.xml`` member whose name
        contains "comments".
    """
    id_attr = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id"

    xml_parts = [
        name for name in zipf.namelist()
        if name.endswith(".xml") and "comments" in name
    ]
    if not xml_parts:
        return {}

    # Prefer a member whose name ends in "comments.xml"; otherwise fall back
    # to the first candidate found.
    part_name = xml_parts[0]
    for name in xml_parts:
        if name.endswith("comments.xml"):
            part_name = name
            break

    root = etree.fromstring(zipf.read(part_name))
    result = {}
    for comment_el in root.findall(".//w:comment", namespaces=NS):
        pieces = []
        for raw in comment_el.xpath(".//w:t/text()", namespaces=NS):
            trimmed = raw.strip()
            if trimmed:
                pieces.append(trimmed)
        result[comment_el.get(id_attr)] = " ".join(pieces)
    return result

def read_numbering(zipf):
    """Read the list-numbering level templates from ``word/numbering.xml``.

    Args:
        zipf: An open ``zipfile.ZipFile`` for the .docx package.

    Returns:
        A dict mapping each ``w:num`` id to a dict of
        ``{ilvl: lvlText value}`` taken from the abstract numbering
        definition it points to (an empty dict when that definition is
        unknown). Returns an empty dict when the archive has no
        ``word/numbering.xml``.
    """
    w_prefix = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

    if "word/numbering.xml" not in zipf.namelist():
        return {}

    root = etree.fromstring(zipf.read("word/numbering.xml"))

    # Pass 1: level templates per abstract numbering definition.
    templates_by_abstract = {}
    for abstract_el in root.findall("w:abstractNum", namespaces=NS):
        level_templates = {}
        for level_el in abstract_el.findall("w:lvl", namespaces=NS):
            lvl_text_el = level_el.find("w:lvlText", namespaces=NS)
            if lvl_text_el is None:
                continue
            level_templates[level_el.get(w_prefix + "ilvl")] = lvl_text_el.get(w_prefix + "val")
        templates_by_abstract[abstract_el.get(w_prefix + "abstractNumId")] = level_templates

    # Pass 2: resolve each concrete numbering instance to its definition.
    templates_by_num = {}
    for num_el in root.findall("w:num", namespaces=NS):
        abstract_ref = num_el.find("w:abstractNumId", namespaces=NS)
        if abstract_ref is None:
            continue
        templates_by_num[num_el.get(w_prefix + "numId")] = templates_by_abstract.get(
            abstract_ref.get(w_prefix + "val"), {}
        )
    return templates_by_num

def extract_doc_sentences_and_map(doc_xml, numbering_map):
    """Map comment ids to the text they annotate and to their paragraph text.

    The document is walked once in document order. Open comment ranges are
    tracked across paragraph boundaries, so a range that starts in one
    paragraph and ends in a later one yields all of the text in between
    (previously the open ranges were reset for every paragraph, which
    truncated such ranges at the end of their first paragraph).

    Run texts are concatenated exactly as stored and whitespace is
    normalised at the end. Joining stripped runs with a space, as before,
    inserted spaces inside words that the editor had split across runs.
    Paragraph starts, tabs and breaks are treated as whitespace so words on
    either side are not glued together.

    Args:
        doc_xml: Bytes/str content of ``word/document.xml``.
        numbering_map: ``{numId: {ilvl: lvlText}}`` as returned by
            ``read_numbering``; used to prefix numbered paragraphs.

    Returns:
        A ``(merged, para_texts)`` tuple where

        * ``merged`` maps a comment id to the single-spaced text found
          between its ``commentRangeStart`` and ``commentRangeEnd``
          (``""`` when the range contains no text), and
        * ``para_texts`` maps a comment id to the set of texts (with the
          numbering prefix, if any) of every ``w:p`` element that encloses
          one of its ``commentReference`` elements.
    """
    w_ns = NS["w"]
    id_attr = f"{{{w_ns}}}id"
    val_attr = f"{{{w_ns}}}val"
    para_tag = f"{{{w_ns}}}p"

    def paragraph_text(para):
        # Paragraph text, prefixed with its list-number template when the
        # paragraph is numbered and the template is known.
        prefix = ""
        num_pr = para.find(".//w:numPr", namespaces=NS)
        if num_pr is not None:
            num_id_el = num_pr.find("w:numId", namespaces=NS)
            ilvl_el = num_pr.find("w:ilvl", namespaces=NS)
            if num_id_el is not None and ilvl_el is not None:
                lvl_texts = numbering_map.get(num_id_el.get(val_attr), {})
                template = lvl_texts.get(ilvl_el.get(val_attr))
                if template:
                    # Simplification: the "%1" placeholder is always rendered
                    # as 1; the real list position is not computed and other
                    # placeholders are left untouched.
                    prefix = template.replace("%1", str(1))
        body = "".join(para.xpath(".//w:t/text()", namespaces=NS)).strip()
        if prefix and body:
            return prefix + " " + body
        return body

    tree = etree.fromstring(doc_xml)
    map_id_to_text = {}
    map_id_to_para_texts_for_ref = {}
    active_ids = []  # ids of ranges that are currently open, in opening order

    for node in tree.iter():
        if not isinstance(node.tag, str):
            # XML comments / processing instructions have no qualified name.
            continue
        tag = etree.QName(node).localname

        if tag == "commentRangeStart":
            cid = node.get(id_attr)
            if cid and cid not in active_ids:
                active_ids.append(cid)
                map_id_to_text.setdefault(cid, [])
        elif tag == "commentRangeEnd":
            cid = node.get(id_attr)
            if cid and cid in active_ids:
                active_ids.remove(cid)
        elif tag == "commentReference":
            cid = node.get(id_attr)
            if cid:
                # Every enclosing paragraph contributes its text, which keeps
                # the result the same as before for nested paragraphs.
                for para in node.iterancestors(para_tag):
                    map_id_to_para_texts_for_ref.setdefault(cid, set()).add(
                        paragraph_text(para)
                    )
        elif tag == "t":
            if node.text:
                for cid in active_ids:
                    map_id_to_text.setdefault(cid, []).append(node.text)
        elif tag in ("p", "tab", "br", "cr"):
            # Structural whitespace; surplus blanks are collapsed below.
            for cid in active_ids:
                map_id_to_text.setdefault(cid, []).append(" ")

    merged = {
        cid: " ".join("".join(parts).split())
        for cid, parts in map_id_to_text.items()
    }
    return merged, map_id_to_para_texts_for_ref

def parse_category_and_flag(comment_text):
    """Extract a ``(category, flag)`` pair from a reviewer comment.

    Only the last non-blank line of the comment is examined. It must start
    with a category, followed by ``-`` or ``:``, followed by ``Flag`` or
    ``No Flag`` (case-insensitive), and the category must equal one of
    ``VALID_CATEGORIES`` ignoring case.

    Args:
        comment_text: Full text of the comment (may be empty or ``None``).

    Returns:
        ``(category, "Flag" | "No Flag")`` with the category spelled as the
        reviewer wrote it, or ``(None, None)`` when the comment does not
        follow the expected format or names an unknown category.
    """
    if not comment_text:
        return None, None

    stripped_lines = (ln.strip() for ln in comment_text.strip().splitlines())
    non_blank = [ln for ln in stripped_lines if ln]
    candidate = non_blank[-1] if non_blank else comment_text.strip()

    found = re.match(
        r"^(?P<cat>.*?)\s*[-:]\s*(?P<flag>Flag|No\s*Flag)\b",
        candidate,
        re.IGNORECASE,
    )
    if found is None:
        return None, None

    category = found.group("cat").strip()
    known = {valid.lower() for valid in VALID_CATEGORIES}
    if category.lower() not in known:
        return None, None

    flag_word = found.group("flag").strip().lower()
    if flag_word.startswith("flag"):
        return category, "Flag"
    return category, "No Flag"

def pre_clean_orgs(text):
    """Replace every match of the ``ORG_PATTERNS`` regexes in *text*.

    The patterns are applied one after another, case-insensitively, and each
    match is replaced with the literal ``[REDACTED_ORG]``.

    Args:
        text: Text to scrub.

    Returns:
        The text with all pattern matches replaced.
    """
    scrubbed = text
    for org_regex in ORG_PATTERNS:
        matcher = re.compile(org_regex, re.IGNORECASE)
        scrubbed = matcher.sub("[REDACTED_ORG]", scrubbed)
    return scrubbed

def redact_text(text):
    """Redact organisation names and Presidio-detected entities in *text*.

    The text first goes through ``pre_clean_orgs``; the result is analysed
    for ``ENTITIES_TO_REDACT`` (language ``"en"``) and every detected span is
    replaced with ``[REDACTED]``.

    Args:
        text: Text to redact.

    Returns:
        The redacted text, or ``""`` when *text* is empty or whitespace only.
    """
    if text.strip() == "":
        return ""

    scrubbed = pre_clean_orgs(text)
    findings = analyzer.analyze(
        text=scrubbed, language="en", entities=ENTITIES_TO_REDACT
    )
    replace_all = OperatorConfig("replace", {"new_value": "[REDACTED]"})
    outcome = anonymizer.anonymize(
        text=scrubbed,
        analyzer_results=findings,
        operators={"DEFAULT": replace_all},
    )
    return outcome.text

def clean_sentence(text):
    """Normalise whitespace, redact sensitive spans and lowercase the rest.

    Redaction runs on the text in its original casing and lowercasing is
    applied afterwards. Lowercasing first (the previous order) removes the
    capitalisation that named-entity recognition relies on to spot names,
    which weakens PII detection. The output format is unchanged: lowercase
    text with upper-case ``[REDACTED]`` / ``[REDACTED_ORG]`` placeholders.

    Args:
        text: Raw sentence or paragraph text.

    Returns:
        The cleaned text, or ``""`` when *text* is blank.
    """
    text = re.sub(r'\s+', ' ', text.strip())
    redacted = redact_text(text)
    # re.split with a capturing group puts the placeholders at the odd
    # indices; only the pieces between them are lowercased.
    pieces = re.split(r'(\[REDACTED(?:_ORG)?\])', redacted)
    return "".join(
        piece if idx % 2 else piece.lower()
        for idx, piece in enumerate(pieces)
    )

def process_docx_file(path):
    """Turn the category comments of one .docx file into dataset rows.

    Each comment that ``parse_category_and_flag`` accepts produces one row.
    The row's sentence is the text covered by the comment range; when that
    is missing or blank, the first non-blank paragraph text recorded for the
    comment's reference is used instead.

    Args:
        path: Path to the .docx file.

    Returns:
        A list of dicts with the keys ``source_file``, ``original_sentence``,
        ``clean_sentence``, ``clean_paragraph``, ``category`` and ``Is_Flag``.
        The list is empty when the archive has no ``word/document.xml``.
    """
    records = []
    with zipfile.ZipFile(path, "r") as archive:
        comments_by_id = read_comments_xml(archive)
        list_templates = read_numbering(archive)
        if "word/document.xml" not in archive.namelist():
            return records

        range_text_by_id, para_texts_by_id = extract_doc_sentences_and_map(
            archive.read("word/document.xml"), list_templates
        )

        for comment_id, comment_body in comments_by_id.items():
            category, flag = parse_category_and_flag(comment_body)
            if not category:
                continue

            # First non-blank paragraph text recorded for this comment.
            paragraph = ""
            for candidate in para_texts_by_id.get(comment_id, set()):
                if candidate.strip():
                    paragraph = candidate
                    break

            # Prefer the annotated range; fall back to the paragraph.
            sentence = range_text_by_id.get(comment_id, "").strip() or paragraph

            records.append({
                "source_file": os.path.basename(path),
                "original_sentence": sentence,
                "clean_sentence": clean_sentence(sentence),
                "clean_paragraph": clean_sentence(paragraph),
                "category": category,
                "Is_Flag": flag,
            })
    return records

def process_folder(folder):
    """Process every .docx file in *folder* and collect the rows in a DataFrame.

    Files are visited in sorted name order so the row order is reproducible
    (``os.listdir`` returns names in arbitrary order). A file that fails to
    process is reported on stdout and skipped.

    Args:
        folder: Directory containing the annotated .docx files.

    Returns:
        A ``pandas.DataFrame`` with one row per accepted comment. When no
        rows were produced, the frame is empty but still has the expected
        columns, so callers can select ``Is_Flag`` / ``clean_sentence``
        without hitting a ``KeyError``.
    """
    all_rows = []
    for fname in tqdm(sorted(os.listdir(folder)), desc="Processing files"):
        if not fname.lower().endswith(".docx"):
            continue
        fpath = os.path.join(folder, fname)
        try:
            all_rows.extend(process_docx_file(fpath))
        except Exception as e:
            # Best effort: one unreadable document must not abort the batch.
            print(f"Error processing {fname}: {e}")

    if not all_rows:
        return pd.DataFrame(columns=[
            "source_file",
            "original_sentence",
            "clean_sentence",
            "clean_paragraph",
            "category",
            "Is_Flag",
        ])
    return pd.DataFrame(all_rows)

if __name__ == "__main__":
    # Build the dataset from the annotated documents and write it to CSV.
    folder = "Annotated Documents"
    out_csv = "annotated_nda_para.csv"
    df = process_folder(folder)

    # Drop rows that are both "No Flag" and shorter than three words after
    # cleaning.
    is_no_flag = df["Is_Flag"].str.lower() == "no flag"
    is_too_short = df["clean_sentence"].str.split().str.len() < 3
    df = df[~(is_no_flag & is_too_short)]

    df.to_csv(out_csv, index=False)
    print(f"Saved {len(df)} rows to {out_csv}")
