In [5]:
import re
import os
import pandas as pd
from docx import Document
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
from tqdm import tqdm
import shutil
from pathlib import Path

In [6]:
# Sub-folders that are skipped unless the caller passes its own `skip_folders`.
_DEFAULT_SKIP_FOLDERS = (
    "Tietievry",
    "Investors Bank",
    "Old Republic",
    "ThoughSpot Partnerships",
    "PayPal",
    "Quantum Health",
)


def _mtime_and_size(path: Path):
    """Sort key for "latest file": modification time first, size as tie-breaker."""
    info = path.stat()  # single stat() call per file
    return (info.st_mtime, info.st_size)


def collect_latest_docx_files(source_dir: Path, dest_dir: Path, skip_folders=None) -> list:
    """Copy the newest Word file of every sub-folder of ``source_dir`` into ``dest_dir``.

    For each immediate sub-directory, the ``*.docx`` / ``*.doc`` file with the
    greatest ``(mtime, size)`` is copied to ``dest_dir`` as
    ``"<folder name>_<file name>"``.

    Args:
        source_dir: Directory whose immediate sub-directories are scanned.
        dest_dir: Directory receiving the copies; created (with parents) if missing.
        skip_folders: Optional collection of folder names to ignore. Defaults to
            the built-in exclusion list.

    Returns:
        Names of the folders from which a file was actually copied, in sorted
        folder order.
    """
    if skip_folders is None:
        skip_folders = _DEFAULT_SKIP_FOLDERS

    dest_dir.mkdir(parents=True, exist_ok=True)
    folder_names = []

    # Sorted so the result does not depend on filesystem enumeration order.
    for item in sorted(source_dir.iterdir()):
        if not item.is_dir():
            continue
        folder_name = item.name
        if folder_name in skip_folders:
            continue

        # "~$..." files are the lock files Word writes next to an open document;
        # they match the glob and are usually the most recently modified entry.
        docx_files = [
            candidate
            for pattern in ("*.docx", "*.doc")
            for candidate in item.glob(pattern)
            if not candidate.name.startswith("~$")
        ]
        if not docx_files:
            print(f"Skip (no docx/doc found) → {folder_name}")
            continue

        try:
            latest_file = max(docx_files, key=_mtime_and_size)
        except OSError as e:
            print(f"Skip (error getting latest file) → {folder_name}: {e}")
            continue

        destination_path = dest_dir / f"{folder_name}_{latest_file.name}"
        shutil.copy2(latest_file, destination_path)
        # Record the folder only once its file has really been copied, so the
        # caller's "Copied N files" count matches what is on disk.
        folder_names.append(folder_name)

    return folder_names


if __name__ == "__main__":
    # Stage 1: gather one NDA per client folder, then derive organisation
    # regexes from the folder names.
    # NOTE(review): a later cell assigns ORG_PATTERNS again from a literal list,
    # so the folder-derived patterns built here are discarded before any
    # redaction runs — confirm which list is meant to be used.
    source_folder = Path("NDA DATASET 2023 2024 DG")
    destination_folder = Path("input_ndas")

    # Generic company-suffix words, each wrapped in word boundaries.
    ORG_PATTERNS = [rf'\b{suffix}\b' for suffix in ("inc", "corporation", "llc", "company")]

    processed_folders = collect_latest_docx_files(source_folder, destination_folder)
    print(f"Copied {len(processed_folders)} files to '{destination_folder}'")

    # One whole-word, regex-escaped pattern per processed folder name.
    ORG_PATTERNS.extend(
        rf'\b{re.escape(name.lower())}\b' for name in processed_folders
    )
    
        

Skip (no docx/doc found) → Starry
Skip (no docx/doc found) → Atheon
Skip (no docx/doc found) → Slickdeals
Skip (no docx/doc found) → Procure
Skip (no docx/doc found) → General Mills
Skip (no docx/doc found) → Preqin
Skip (no docx/doc found) → Ocean Spray
Skip (no docx/doc found) → Tipalti
Skip (no docx/doc found) → Workato
Skip (no docx/doc found) → Digitalizar
Skip (no docx/doc found) → FDA
Skip (no docx/doc found) → Coffee and Bagel
Skip (no docx/doc found) → Risk Strategies
Skip (no docx/doc found) → HDI
Skip (no docx/doc found) → HDSupply
Skip (no docx/doc found) → Boxed
Skip (no docx/doc found) → CIC
Skip (no docx/doc found) → Trumid
Skip (no docx/doc found) → FunPlus
Skip (no docx/doc found) → QCommissions
Skip (no docx/doc found) → Next Games
Skip (no docx/doc found) → Disney Ad Sales
Skip (no docx/doc found) → Performio
Skip (no docx/doc found) → Crocs
Skip (no docx/doc found) → Tessera
Skip (no docx/doc found) → SurveyMonkey
Skip (no docx/doc found) → JD Power
Skip (no docx/do

In [None]:
import os
import re
import pandas as pd
from tqdm import tqdm
from docx import Document
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

# Words masked as organisations before the Presidio pass; pre_clean_orgs()
# applies each pattern case-insensitively.
# NOTE(review): this assignment replaces any ORG_PATTERNS list built by an
# earlier cell (e.g. patterns derived from folder names) — confirm that
# discarding those is intended.
_ORG_TERMS = (
    "paypal",
    "bayer",
    "pricenow",
    "thoughtspot",
    "inc",
    "corporation",
    "llc",
    "company",
)
ORG_PATTERNS = [rf"\b{term}\b" for term in _ORG_TERMS]

# Entity types requested from the Presidio analyzer in redact_text().
ENTITIES_TO_REDACT = [
    "PERSON",
    "ORGANIZATION",
    "EMAIL_ADDRESS",
    "PHONE_NUMBER",
    "NRP",
]

def pre_clean_orgs(text):
    """Return ``text`` with every ORG_PATTERNS match replaced by ``[REDACTED_ORG]``.

    Patterns are applied one after another, in list order, ignoring case.
    """
    masked = text
    for org_regex in ORG_PATTERNS:
        masked = re.compile(org_regex, re.IGNORECASE).sub("[REDACTED_ORG]", masked)
    return masked

def redact_text(text):
    """Mask organisation keywords and analyzer-detected entities in ``text``.

    Whitespace-only input yields ``""``. Otherwise the text is first passed
    through pre_clean_orgs(), then every span the module-level ``analyzer``
    reports for ENTITIES_TO_REDACT is replaced with ``[REDACTED]`` by the
    module-level ``anonymizer``.
    """
    if text.strip() == "":
        return ""

    scrubbed = pre_clean_orgs(text)
    findings = analyzer.analyze(text=scrubbed, language='en', entities=ENTITIES_TO_REDACT)

    # A single DEFAULT operator: every entity type gets the same placeholder.
    replace_everything = {
        "DEFAULT": OperatorConfig("replace", {"new_value": "[REDACTED]"})
    }
    outcome = anonymizer.anonymize(
        text=scrubbed,
        analyzer_results=findings,
        operators=replace_everything,
    )
    return outcome.text



# Phrases that introduce the signature block; body text is collected up to and
# including the first paragraph containing one of them.
_END_PATTERNS = [
    r'in\s+witness\s+whereof',
    r'in\s+witness\s+thereof',
    r'the\s+signatures\s+follow',
    r'signature\s+page\s+follows',
    r'signed\s+on\s+behalf\s+of',
    r'signed\s+for\s+and\s+on\s+behalf\s+of',
    r'each\s+acting\s+under\s+due\s+and\s+proper\s+authority'
]
# Compiled once at definition time instead of on every parse_nda() call.
_END_PATTERN = re.compile('(' + '|'.join(_END_PATTERNS) + ')', re.IGNORECASE)
# Split after ., ? or ! when the next token starts with a capital letter or "(".
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.?!])\s+(?=[A-Z(])')
# Leading list markers such as "a)", "(a)", "1." or "1)".
_LIST_MARKER = re.compile(r'^[a-z]\)|^\([a-z]\)|^\d+[\.\)]')
_WHITESPACE = re.compile(r'\s+')


def _normalize_text(text):
    """Lower-case, collapse whitespace and drop one leading list marker."""
    text = _WHITESPACE.sub(' ', text.lower())
    text = _LIST_MARKER.sub('', text)
    text = _WHITESPACE.sub(' ', text)
    return text.strip()


def _tables_contain_end_marker(doc):
    """Return True if any table cell of ``doc`` contains an end-of-body phrase."""
    return any(
        _END_PATTERN.search(cell.text)
        for table in doc.tables
        for row in table.rows
        for cell in row.cells
    )


def _split_sentences(paragraphs):
    """Split paragraphs into ``(sentence, source_paragraph)`` pairs."""
    pairs = []
    for para_text in paragraphs:
        for piece in _SENTENCE_BOUNDARY.split(para_text):
            piece = piece.strip()
            if piece:
                pairs.append((piece, para_text))
    return pairs


def _merge_lead_in_clauses(pairs):
    """Drop short sentences and prefix list items with their ":" lead-in clause."""
    merged = []
    main_clause = None
    for sent, para in pairs:
        # Short full-stop sentences are discarded.
        if len(sent.split()) < 10 and sent.endswith("."):
            continue
        if sent.endswith(":"):
            # Remember the lead-in; it is prepended to the items that follow.
            main_clause = sent
        elif main_clause:
            merged.append((f"{main_clause} {sent}", para))
            # A full stop closes the enumeration started by the lead-in.
            if sent.endswith("."):
                main_clause = None
        else:
            merged.append((sent, para))
    return merged


def parse_nda(doc_path, cant_open_doc_num, redact=True):
    """Parse one Word NDA into a sentence-level DataFrame.

    Args:
        doc_path: Path of the Word file to open with ``Document``.
        cant_open_doc_num: Kept only for backward compatibility and ignored.
            Rebinding an ``int`` parameter never changes the caller's variable,
            so this function cannot report open failures through it; callers
            must do their own counting when ``None`` is returned.
        redact: When true, sentences and paragraphs go through redact_text().

    Returns:
        ``None`` if the file cannot be opened, or if no end-of-body phrase is
        found in either the paragraphs or the tables. Otherwise a DataFrame
        with columns ``original_sentence``, ``clean_sentence`` and
        ``clean_paragraph``; rows whose original and clean sentence both have
        five words or fewer are left out.
    """
    try:
        doc = Document(doc_path)
    except Exception:
        # Best-effort: an unreadable file is reported to the caller as None.
        return None

    # Collect non-empty paragraphs up to and including the end marker.
    body_paragraphs = []
    found_in_body = False
    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue
        body_paragraphs.append(text)
        if _END_PATTERN.search(text):
            found_in_body = True
            break

    # Fall back to the tables; when the marker only appears there, every
    # non-empty paragraph has been collected above.
    if not found_in_body and not _tables_contain_end_marker(doc):
        return None

    merged = _merge_lead_in_clauses(_split_sentences(body_paragraphs))
    parsed_sentences = [sent for sent, _ in merged]
    parsed_para_map = [para for _, para in merged]

    # NOTE(review): text is lower-cased before redact_text() runs; entity
    # recognisers often depend on capitalisation, so detection quality on
    # lower-cased input should be verified.
    normalized_sentences = [_normalize_text(s) for s in parsed_sentences]
    if redact:
        clean_sentences = [redact_text(s) for s in normalized_sentences]
    else:
        clean_sentences = normalized_sentences

    # Every sentence of a paragraph maps to the same paragraph text, so each
    # distinct paragraph is cleaned once and reused rather than re-processed
    # for every sentence it contains.
    cleaned_by_paragraph = {}
    clean_paragraphs = []
    for para in parsed_para_map:
        if para not in cleaned_by_paragraph:
            normalized_para = _normalize_text(para)
            cleaned_by_paragraph[para] = (
                redact_text(normalized_para) if redact else normalized_para
            )
        clean_paragraphs.append(cleaned_by_paragraph[para])

    # Keep a row unless BOTH its original and clean sentence are <= 5 words.
    rows = [
        (original, clean, paragraph)
        for original, clean, paragraph in zip(
            parsed_sentences, clean_sentences, clean_paragraphs
        )
        if len(original.split()) > 5 or len(clean.split()) > 5
    ]
    return pd.DataFrame(
        rows,
        columns=["original_sentence", "clean_sentence", "clean_paragraph"],
    )


if __name__ == "__main__":
    # Stage 2: parse and redact every NDA in ./input_ndas and write one CSV.
    folder_path = "./input_ndas"
    all_results = []

    # redact_text() reads these two engines as module-level globals.
    analyzer = AnalyzerEngine()
    anonymizer = AnonymizerEngine()

    # Sorted so row order in the CSV is reproducible across runs and machines.
    docx_files = sorted(
        f for f in os.listdir(folder_path) if f.lower().endswith((".docx", ".doc"))
    )
    cant_open_doc_num = 0
    no_ending_num = 0
    for filename in tqdm(docx_files, desc="Parsing NDA files", ncols=100):
        file_path = os.path.join(folder_path, filename)
        parsed_df = parse_nda(file_path, cant_open_doc_num, redact=True)

        if parsed_df is not None and not parsed_df.empty:
            parsed_df["source_file"] = filename
            all_results.append(parsed_df)
            continue

        # parse_nda() returns None both for unreadable files and for files
        # without an ending phrase, and it cannot update an int counter passed
        # by the caller. Re-open the file here to tell the two cases apart.
        if parsed_df is None:
            try:
                Document(file_path)
            except Exception:
                cant_open_doc_num += 1
                continue
        no_ending_num += 1

    print()
    print(f"skip {cant_open_doc_num} file that cannot open")
    print(f"skip {no_ending_num} no ending words in file")
    print(f"Processing {len(all_results)} files in total.")

    if all_results:
        final_df = pd.concat(all_results, ignore_index=True)
        output_path = "./parsed_nda_para.csv"
        final_df.to_csv(output_path, index=False, encoding='utf-8-sig')
        print(f"\nOutput file has been saved to: {output_path}")
    else:
        # pd.concat() raises ValueError on an empty list.
        print("\nNo NDA could be parsed; no output file was written.")



Parsing NDA files: 100%|██████████████████████████████████████████| 491/491 [03:20<00:00,  2.45it/s]


skip 0 file that cannot open
skip 241 no ending words in file
Processing 250 files in total.

Output file has been saved to: ./parsed_nda_para.csv



