In [3]:
import os
import json
import re
import docx
from docx import Document
from docx.oxml.ns import qn
from docx.shared import Pt
import docx2txt
from ollama import Client
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gradio as gr
from docx.oxml import OxmlElement
from docx.text.paragraph import Paragraph

# Suppress TensorFlow's oneDNN start-up warning by setting the
# TF_ENABLE_ONEDNN_OPTS environment variable to "0" for this process.
# NOTE(review): this assignment runs after the third-party imports above; if
# any of them has already imported TensorFlow, the setting presumably comes
# too late to have an effect -- confirm, and consider moving it above them.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"


def read_docx(file_path):
    """
    Read a Word document and return its non-empty paragraphs.

    Args:
        file_path: Path of the file to read. The extension is matched
            case-insensitively, so ``.DOCX`` / ``.Doc`` are accepted too.

    Returns:
        A ``(text, doc)`` tuple. ``text`` is a list of ``(index, stripped_text)``
        pairs. For ``.docx`` files the index is the position in
        ``doc.paragraphs`` and ``doc`` is the loaded ``Document``. For ``.doc``
        files the index is the line number in the extracted text and ``doc``
        is ``None``. Unsupported formats and any read error yield ``([], None)``
        after printing a message.
    """
    try:
        lowered_path = file_path.lower()
        if lowered_path.endswith('.docx'):
            doc = Document(file_path)
            text = [(i, para.text.strip()) for i, para in enumerate(doc.paragraphs) if para.text.strip()]
            return text, doc
        elif lowered_path.endswith('.doc'):
            raw_text = docx2txt.process(file_path)
            text = []
            for i, raw_line in enumerate(raw_text.split('\n')):
                line = raw_line.strip()
                # Strip before filtering so indented rule lines ("   ____")
                # and embedded-object markers are dropped as well.
                if line and not line.startswith('_') and 'EMBED' not in line:
                    text.append((i, line))
            return text, None
        else:
            print(f"Unsupported file format: {file_path}")
            return [], None
    except Exception as e:
        # Best-effort reader: any failure is reported and treated as "no text".
        print(f"Error reading file {file_path}: {e}")
        return [], None


def classify_document(text, file_path):
    """
    Guess the document type from its text and file name.

    Args:
        text: List of ``(index, paragraph_text)`` pairs.
        file_path: Path of the source file; only used for the shareholder
            resolution shortcut.

    Returns:
        One of the known document type names, or ``"Unknown"`` when none of
        them occurs in the lower-cased text.
    """
    known_types = (
        "Articles of Association",
        "Memorandum of Association",
        "Board Resolution Templates",
        "Shareholder Resolution Templates",
        "Incorporation Application Form",
        "UBO Declaration Form",
        "Register of Members and Directors",
        "Change of Registered Address Notice",
        "Licensing Application",
        "HR Contract",
    )
    haystack = " ".join(content for _, content in text).lower()
    preview = haystack[:50]

    # A file named like a resolution whose body mentions shareholders wins
    # over the literal type-name search below.
    named_as_resolution = "resolution" in file_path.lower()
    if named_as_resolution and "shareholder" in haystack:
        print(f"Detected document type: Shareholder Resolution Templates in {preview}...")
        return "Shareholder Resolution Templates"

    # Otherwise the first known type (in declaration order) found in the text.
    matched = next((name for name in known_types if name.lower() in haystack), None)
    if matched is None:
        print(f"No document type detected in {preview}...")
        return "Unknown"

    print(f"Detected document type: {matched} in {preview}...")
    return matched


def define_checklists():
    """
    Return the required-document checklist for every supported process.

    Returns:
        A new dict mapping each process name to the list of document type
        names it requires. A fresh dict (with fresh lists) is built per call.
    """
    checklists = {}
    checklists["Company Incorporation"] = [
        "Articles of Association",
        "Memorandum of Association",
        "Board Resolution Templates",
        "Shareholder Resolution Templates",
        "Incorporation Application Form",
        "UBO Declaration Form",
        "Register of Members and Directors",
    ]
    checklists["Licensing"] = [
        "Licensing Application",
        "Articles of Association",
        "UBO Declaration Form",
    ]
    checklists["HR Contracts"] = [
        "HR Contract",
        "Register of Members and Directors",
    ]
    return checklists


def detect_process(documents):
    """
    Infer which legal process the uploaded documents belong to.

    The process whose checklist shares the most document types with
    ``documents`` wins; ties go to the checklist declared first. Picking the
    best overlap (rather than the first checklist with any overlap) matters
    because some document types appear in more than one checklist.

    Args:
        documents: Iterable of detected document type names.

    Returns:
        The name of the best-matching process, or ``"Unknown"`` when no
        checklist contains any of the uploaded document types.
    """
    checklists = define_checklists()
    best_process = "Unknown"
    best_matches = 0
    for process, required_docs in checklists.items():
        matches = sum(1 for required in required_docs if required in documents)
        # Strict ">" keeps the earliest checklist on ties.
        if matches > best_matches:
            best_process = process
            best_matches = matches

    if best_matches == 0:
        print("No process detected")
        return "Unknown"

    print(f"Detected process: {best_process}")
    return best_process


def check_missing_documents(process, uploaded_docs):
    """
    List the required documents of ``process`` that were not uploaded.

    Args:
        process: Process name used to look up the checklist; an unknown name
            yields an empty checklist.
        uploaded_docs: Collection of detected document type names.

    Returns:
        The missing document type names, in checklist order.
    """
    required_docs = define_checklists().get(process, [])
    missing = []
    for required in required_docs:
        if required in uploaded_docs:
            continue
        missing.append(required)
    print(f"Missing documents: {missing}")
    return missing


# --- FIXED COMMENT FUNCTION ---
def _insert_paragraph_after(paragraph, text="", style=None):
    """
    Insert a new paragraph directly after the given paragraph.

    Args:
        paragraph: Paragraph after which the new one is placed.
        text: Text of the single run added to the new paragraph.
        style: Optional style to apply. Applying it is best-effort: a failure
            is reported and the paragraph is still created without the style.

    Returns:
        The newly created paragraph, whose run is italic and 9 pt.
    """
    new_p = OxmlElement("w:p")
    paragraph._p.addnext(new_p)
    new_para = Paragraph(new_p, paragraph._parent)
    if style:
        try:
            new_para.style = style
        except Exception as e:
            # Keep going without the style, but do not hide the failure.
            print(f"Could not apply style {style!r} to inserted paragraph: {e}")
    run = new_para.add_run(text)
    run.italic = True
    run.font.size = Pt(9)
    return new_para


def add_docx_comment(doc, para_index, comment_text):
    """
    Add an inline comment paragraph after the specified paragraph index.
    Replaces unsupported doc.comments.add().

    ``para_index`` refers to the document's paragraph numbering before any
    comment was inserted. Comment paragraphs added by earlier calls are
    skipped when resolving the index, so repeated calls keep pointing at the
    intended paragraph instead of drifting by one per inserted comment.

    Args:
        doc: Document to annotate.
        para_index: Index of the target paragraph; out-of-range values fall
            back to the last non-comment paragraph.
        comment_text: Text placed after the comment prefix.
    """
    comment_prefix = "COMMENT (ADGM Compliance Bot): "
    try:
        # Ignore paragraphs this function inserted earlier; they are
        # recognised by the prefix every inserted comment starts with.
        original_paragraphs = [p for p in doc.paragraphs if not p.text.startswith(comment_prefix)]
        if para_index < 0 or para_index >= len(original_paragraphs):
            para = original_paragraphs[-1]
        else:
            para = original_paragraphs[para_index]
        _insert_paragraph_after(para, f"{comment_prefix}{comment_text}")
    except Exception as e:
        # Best-effort: a failed annotation must not abort the analysis.
        print(f"Error adding inline comment to paragraph {para_index}: {e}")


# --- END FIX ---


def check_red_flags(text, doc_type, doc):
    """
    Run rule-based and LLM-based red-flag checks on one document.

    Checks performed:
      1. First mention of "UAE Federal Courts" (wrong jurisdiction).
      2. For Articles/Memorandum of Association: no governing-law wording.
      3. LLM review of the first 500 characters for non-binding language.

    Args:
        text: List of ``(paragraph_index, paragraph_text)`` pairs.
        doc_type: Detected document type name, copied into every issue.
        doc: Document to annotate with inline comments, or a falsy value to
            skip annotation.

    Returns:
        List of issue dicts with keys ``document``, ``section``, ``issue``,
        ``severity`` and ``suggestion``. ``section`` is the most recent
        heading at or before the paragraph the issue is attached to, or
        ``"Unknown"`` when no heading precedes it.
    """
    issues = []
    heading_pattern = re.compile(
        r"(PART \d+[A-Z]?|Clause \d+\.\d+|Section \d+\.\d+|Article \d+)", re.IGNORECASE)
    headings = [(i, line.strip()) for i, line in text if heading_pattern.match(line)]

    def section_for(index):
        # Latest heading located at or before the given paragraph index.
        section = "Unknown"
        for heading_index, heading in headings:
            if heading_index > index:
                break
            section = heading
        return section

    # Document-level findings are anchored to the first non-empty paragraph.
    default_anchor = text[0][0] if text else 0

    for para_index, line in text:
        if re.search(r"UAE Federal Courts", line, re.IGNORECASE):
            issue = {
                "document": doc_type,
                "section": section_for(para_index),
                "issue": "Jurisdiction clause does not specify ADGM",
                "severity": "High",
                "suggestion": "Update jurisdiction to ADGM Courts."
            }
            issues.append(issue)
            if doc:
                add_docx_comment(doc, para_index, issue["suggestion"])
            # Only the first occurrence is reported.
            break

    if doc_type in ["Articles of Association", "Memorandum of Association"]:
        governing_law_found = any(
            re.search(r"governing law|ADGM regulations", line, re.IGNORECASE)
            for _, line in text
        )
        if not governing_law_found:
            issue = {
                "document": doc_type,
                "section": section_for(default_anchor),
                "issue": "No governing law clause found",
                "severity": "Medium",
                "suggestion": "Add clause specifying ADGM regulations as governing law."
            }
            issues.append(issue)
            if doc:
                add_docx_comment(doc, default_anchor, issue["suggestion"])

    try:
        client = Client()
        full_text = " ".join([t for _, t in text])[:500]
        prompt = (
            f"Analyze the following legal document text for ambiguous or non-binding language "
            f"(e.g., use of 'may' instead of 'shall'). Identify the specific sentence or phrase, "
            f"its approximate location (e.g., paragraph number), and suggest a fix. "
            f"If none, say 'No ambiguous language detected.' Text: {full_text}"
        )
        response = client.generate(model="llama3.1:8b", prompt=prompt)
        llm_response = response["response"]
        print(f"LLM response for {doc_type}: {llm_response[:100]}...")
        verdict = llm_response.lower()
        mentions_problem = "ambiguous" in verdict or "non-binding" in verdict
        # The requested "all clear" sentence itself contains "ambiguous", so
        # it must be ruled out before treating the reply as a finding.
        if mentions_problem and "no ambiguous language detected" not in verdict:
            para_match = re.search(r"paragraph (\d+)", llm_response, re.IGNORECASE)
            para_index = default_anchor
            # NOTE(review): the model never sees paragraph numbers, so the
            # number it reports is only a hint; it is used when in range.
            if para_match and int(para_match.group(1)) < len(text):
                para_index = int(para_match.group(1))
            issue = {
                "document": doc_type,
                "section": section_for(para_index),
                "issue": "Ambiguous or non-binding language detected",
                "severity": "Medium",
                "suggestion": llm_response if len(llm_response) < 200 else llm_response[:200] + "..."
            }
            issues.append(issue)
            if doc:
                add_docx_comment(doc, para_index, issue["suggestion"])
    except Exception as e:
        # Any failure of the LLM step is downgraded to a low-severity issue.
        print(f"LLM error for {doc_type}: {e}. Skipping LLM checks.")
        issue = {
            "document": doc_type,
            "section": section_for(default_anchor),
            "issue": "LLM check failed due to connection error",
            "severity": "Low",
            "suggestion": "Ensure Ollama is running and retry."
        }
        issues.append(issue)
        if doc:
            add_docx_comment(doc, default_anchor, issue["suggestion"])

    return issues


def create_adgm_corpus():
    """
    Return the built-in ADGM reference statements used for retrieval.

    Returns:
        A new list of regulation summary strings, each ending with the URL it
        cites.
    """
    regulations = (
        "ADGM companies must specify ADGM Courts as the governing jurisdiction, pursuant to Abu Dhabi Law No. 4 of 2013 and ADGM Courts Regulations 2015, as enacted on 11 December 2015, per https://www.adgm.com/legal-framework/rules-and-regulations.",
        "Articles of Association must include a clause on share capital allocation, detailing share class and nominal value, as per ADGM Companies Regulations 2020, per https://www.adgm.com/registration-authority/registration-and-incorporation.",
        "Memorandum of Association must include a clear statement of company objectives compliant with ADGM Companies Regulations 2020, per https://www.adgm.com/registration-authority/registration-and-incorporation.",
        "All legal documents must use binding language, e.g., 'shall' instead of 'may', to comply with ADGM legal drafting standards based on English Common Law, per https://www.adgm.com/legal-framework/guidance-and-policy-statements.",
        "Shareholder Resolution Templates must confirm the appointment of directors and authorized signatories, as required by ADGM Companies Regulations 2020, Part 12, per https://assets.adgm.com/downloads/adgm-ra-resolution-multiple-incorporate-shareholders-LTD-incorporation-v2.docx.",
        "Memorandum of Association must specify the company’s registered office in ADGM, as mandated by ADGM Companies Regulations 2020, per https://www.adgm.com/registration-authority/registration-and-incorporation.",
        "Directors’ appointments must include at least one natural person of legal age, as per ADGM Companies Regulations 2020, Part 12, per https://www.adgm.com/registration-authority/registration-and-incorporation.",
        "Financial Services and Markets Regulations (FSMR) establish the legislative framework for financial services in ADGM, requiring compliance with FSRA Rulebooks and Guidance, per https://www.adgm.com/legal-framework/guidance-and-policy-statements.",
        "ADGM Courts Procedure Rules 2016, enacted on 30 May 2016, must be followed for all legal proceedings in ADGM, per https://www.adgm.com/legal-framework/guidance-and-policy-statements.",
        "Employment contracts must comply with ADGM Employment Regulations 2024, effective 1 April 2025, specifying terms like notice periods, employee rights, and overtime provisions, per https://assets.adgm.com/downloads/ADGM+Standard+Employment+Contract+Template+-+ER+2024+(Feb+2025).docx.",
        "Licensing applications must include a business plan and financial projections, as per ADGM FSRA Guidance on Licensing Applications, per https://www.adgm.com/legal-framework/guidance-and-policy-statements.",
        "Whistleblowing frameworks must be implemented for transparency and accountability, as per ADGM Supplementary Guidance on Whistleblowing (July 2025), issued under FSMR section 15(2), per https://www.adgm.com/legal-framework/guidance-and-policy-statements.",
        "Shareholder resolutions amending Articles of Association must comply with ADGM Companies Regulations 2020, per https://assets.adgm.com/downloads/Templates_SHReso_AmendmentArticles-v1-20220107.docx.",
        "ADGM registered entities must obtain Abu Dhabi Government Authorities approval for retail businesses prior to operation, as per Onshore Government Authorities Approval Guidelines, per https://www.adgm.com/legal-framework/guidance-and-policy-statements.",
        "Annual accounts must be filed by ADGM entities in compliance with ADGM Companies Regulations 2020, per https://www.adgm.com/operating-in-adgm/obligations-of-adgm-registered-entities/annual-filings/annual-accounts.",
        "Data protection policies must adhere to ADGM Data Protection Regulations 2021, requiring an Appropriate Policy Document, per https://www.adgm.com/documents/office-of-data-protection/templates/adgm-dpr-2021-appropriate-policy-document.pdf.",
    )
    # Hand out a list so every caller receives its own mutable copy.
    return list(regulations)


def setup_rag():
    """
    Build the retrieval components over the built-in ADGM corpus.

    Returns:
        ``(model, index, corpus)``: the sentence encoder, an L2 FAISS index
        holding one embedding per corpus entry, and the corpus itself. If
        anything fails, a message is printed and ``(None, None, corpus)`` is
        returned instead.
    """
    try:
        encoder = SentenceTransformer('all-MiniLM-L6-v2')
        regulations = create_adgm_corpus()
        vectors = encoder.encode(regulations)
        # The index dimension is the embedding width.
        vector_index = faiss.IndexFlatL2(vectors.shape[1])
        vector_index.add(vectors)
    except Exception as e:
        print(f"RAG setup error: {e}. Using regex checks only.")
        return None, None, create_adgm_corpus()
    return encoder, vector_index, regulations


def check_compliance(text, doc_type, model, index, corpus, doc):
    """
    Check a document against the most relevant ADGM regulations via the LLM.

    The first 500 characters of the document are embedded, the two nearest
    corpus entries are retrieved, and the LLM is asked to list non-compliance
    issues. Every reply line mentioning "non-compliance" becomes one issue,
    except the "No non-compliance issues detected." all-clear line.

    Args:
        text: List of ``(paragraph_index, paragraph_text)`` pairs.
        doc_type: Detected document type name, copied into every issue.
        model: Encoder exposing ``encode``.
        index: Vector index exposing ``search``.
        corpus: Regulation strings aligned with the index entries.
        doc: Document to annotate with inline comments, or a falsy value to
            skip annotation.

    Returns:
        List of issue dicts with keys ``document``, ``section``, ``issue``,
        ``severity`` and ``suggestion``. ``section`` is the most recent
        heading at or before the paragraph the issue is attached to, or
        ``"Unknown"`` when no heading precedes it. Any failure yields a single
        low-severity issue instead of raising.
    """
    issues = []
    full_text = " ".join([t for _, t in text])[:500]
    heading_pattern = re.compile(
        r"(PART \d+[A-Z]?|Clause \d+\.\d+|Section \d+\.\d+|Article \d+)", re.IGNORECASE)
    headings = [(i, line.strip()) for i, line in text if heading_pattern.match(line)]

    def section_for(para_idx):
        # Latest heading located at or before the given paragraph index.
        section = "Unknown"
        for heading_index, heading in headings:
            if heading_index > para_idx:
                break
            section = heading
        return section

    # Findings without a usable location are anchored to the first paragraph.
    default_anchor = text[0][0] if text else 0

    try:
        doc_embedding = model.encode([full_text])[0]
        _, indices = index.search(np.array([doc_embedding]), k=2)
        # FAISS pads with -1 when fewer than k neighbours exist; skip those.
        relevant_regs = [corpus[i] for i in indices[0] if i >= 0]
        client = Client()
        prompt = (
            f"Check if the following document text complies with these ADGM regulations: {relevant_regs}. "
            f"List any non-compliance issues, specify the sentence or phrase, and its approximate location "
            f"(e.g., paragraph number). Suggest fixes. If none, say 'No non-compliance issues detected.' "
            f"Text: {full_text}"
        )
        response = client.generate(model="llama3.1:8b", prompt=prompt)
        llm_response = response["response"]
        print(f"RAG LLM response for {doc_type}: {llm_response[:100]}...")
        for issue_line in llm_response.split('\n'):
            lowered_line = issue_line.lower()
            if "non-compliance" not in lowered_line:
                continue
            # The requested all-clear sentence contains "non-compliance" too.
            if "no non-compliance" in lowered_line:
                continue

            suggestion = issue_line if len(issue_line) < 200 else issue_line[:200] + "..."
            para_match = re.search(r"paragraph (\d+)", issue_line, re.IGNORECASE)
            if para_match and int(para_match.group(1)) < len(text):
                para_index = int(para_match.group(1))
            else:
                # No usable paragraph number: locate the first quoted phrase
                # in the document, falling back to the first paragraph.
                para_index = default_anchor
                phrase_match = re.search(r"['\"]([^'\"]+)['\"]", issue_line)
                if phrase_match:
                    phrase = phrase_match.group(1).lower()
                    for candidate_index, candidate_line in text:
                        if phrase in candidate_line.lower():
                            para_index = candidate_index
                            break

            issue = {
                "document": doc_type,
                "section": section_for(para_index),
                "issue": "Non-compliance with ADGM regulations",
                "severity": "High",
                "suggestion": suggestion
            }
            issues.append(issue)
            if doc:
                add_docx_comment(doc, para_index, issue["suggestion"])
    except Exception as e:
        # Any failure of retrieval or the LLM is downgraded to a low-severity issue.
        print(f"RAG error for {doc_type}: {e}. Skipping RAG checks.")
        issue = {
            "document": doc_type,
            "section": section_for(default_anchor),
            "issue": "RAG check failed due to LLM connection error",
            "severity": "Low",
            "suggestion": "Ensure Ollama is running and retry."
        }
        issues.append(issue)
        if doc:
            add_docx_comment(doc, default_anchor, issue["suggestion"])
    return issues


def process_documents(file_paths):
    """
    Analyse the given documents and write the compliance report.

    For every existing, readable file: classify it, run the red-flag checks
    and (when the retrieval components are available) the regulation check,
    and save an annotated copy named ``commented_<basename>`` in the current
    working directory. The combined result is also written to
    ``compliance_output.json``.

    Args:
        file_paths: Iterable of document paths.

    Returns:
        ``(result, commented_files)`` where ``result`` is the report dict and
        ``commented_files`` lists the annotated copies that were saved.
    """
    model, index, corpus = setup_rag()
    uploaded_docs = []
    all_issues = []
    commented_files = []
    for file_path in file_paths:
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue
        text, doc = read_docx(file_path)
        if not text:
            # Unreadable or empty documents contribute nothing to the report.
            continue
        doc_type = classify_document(text, file_path)
        uploaded_docs.append(doc_type)
        issues = check_red_flags(text, doc_type, doc)
        # Explicit None checks: do not rely on the truthiness of these objects.
        if model is not None and index is not None:
            issues.extend(check_compliance(text, doc_type, model, index, corpus, doc))
        all_issues.extend(issues)
        if doc is not None:
            commented_path = f"commented_{os.path.basename(file_path)}"
            doc.save(commented_path)
            commented_files.append(commented_path)

    # De-duplicate while keeping upload order, so the report is deterministic.
    unique_docs = list(dict.fromkeys(uploaded_docs))
    process = detect_process(uploaded_docs)
    missing_docs = check_missing_documents(process, uploaded_docs)
    required_count = len(define_checklists().get(process, []))
    uploaded_count = len(unique_docs)
    # Only documents that are on the checklist count towards "X out of Y";
    # unrecognised or unrelated uploads must not inflate the number.
    required_uploaded_count = required_count - len(missing_docs)

    if process == "Unknown":
        message = "Unable to detect legal process. Please upload relevant ADGM documents."
    elif not missing_docs:
        message = f"It appears that you’re trying to {process.lower()}. All required documents ({required_count}) have been uploaded."
    else:
        missing_str = ", ".join(missing_docs)
        message = (f"It appears that you’re trying to {process.lower()}. "
                   f"Based on our reference list, you have uploaded {required_uploaded_count} out of {required_count} required documents. "
                   f"The missing document(s): {missing_str}.")

    result = {
        "process": process,
        "documents_uploaded": unique_docs,
        "documents_uploaded_count": uploaded_count,
        "required_documents_count": required_count,
        "missing_documents": missing_docs,
        "issues_found": all_issues,
        "message": message
    }
    with open("compliance_output.json", "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2)
    return result, commented_files


def gradio_interface(files):
    """
    Gradio callback: analyse the uploaded files.

    Args:
        files: The uploads from the file input. Each entry may be a path
            string or a file-like object exposing the path as ``.name``.

    Returns:
        ``(result, commented_files)`` from ``process_documents``, or an error
        dict and an empty list when nothing was uploaded.
    """
    if not files:
        return {"error": "No files uploaded"}, []
    # Normalise to plain paths; entries without a ``name`` attribute are
    # assumed to already be paths and pass through unchanged.
    file_paths = [getattr(uploaded, "name", uploaded) for uploaded in files]
    result, commented_files = process_documents(file_paths)
    return result, commented_files


# Build the Gradio UI at module level; `demo` is the app object that the
# entry point below launches.
with gr.Blocks() as demo:
    # Page title and a one-line usage hint.
    gr.Markdown("# ADGM Corporate Agent Tool")
    gr.Markdown(
        "Upload .docx files to check compliance with ADGM regulations. Download commented files with flagged issues.")
    # Input: a multi-file upload widget configured for the ".docx" file type.
    file_input = gr.File(file_count="multiple", file_types=[".docx"], label="Upload Documents")
    # Outputs: the report dict as JSON and the annotated copies as files.
    output_json = gr.JSON(label="Compliance Output")
    output_files = gr.File(label="Commented Documents")
    submit_button = gr.Button("Process Documents")
    # Clicking the button passes the uploads to gradio_interface and maps its
    # two return values onto the two output components, in order.
    submit_button.click(
        fn=gradio_interface,
        inputs=file_input,
        outputs=[output_json, output_files]
    )

if __name__ == "__main__":
    # Demo run: analyse two sample documents from the working directory,
    # print a summary of the report, then start the web UI.
    sample_files = [
        "adgm-ra-model-articles-private-company-limited-by-shares.docx",
        "adgm-ra-resolution-multiple-incorporate-shareholders-LTD-incorporation-v2.docx"
    ]
    result, commented_files = process_documents(sample_files)
    summary_fields = (
        ("Process:", "process"),
        ("Uploaded Documents:", "documents_uploaded"),
        ("Uploaded Count:", "documents_uploaded_count"),
        ("Required Count:", "required_documents_count"),
        ("Missing Documents:", "missing_documents"),
        ("Issues Found:", "issues_found"),
        ("Message:", "message"),
    )
    for label, key in summary_fields:
        print(label, result[key])
    print("Commented Files:", commented_files)
    print("Output saved to compliance_output.json")
    demo.launch()


Detected document type: Articles of Association in articles of association private company limited by...
LLM response for Articles of Association: Here are the results of analyzing the document text for ambiguous or non-binding language:

1. **"Di...
RAG LLM response for Articles of Association: After analyzing the provided text, I found several non-compliance issues with the ADGM regulations. ...
Detected document type: Shareholder Resolution Templates in resolution of incorporating shareholders of [inser...
LLM response for Shareholder Resolution Templates: I've identified some potentially ambiguous language in the document. Please note that I'll be provid...
RAG LLM response for Shareholder Resolution Templates: Based on the provided text, I've identified potential non-compliance issues with the ADGM regulation...
Detected process: Company Incorporation
Missing documents: ['Memorandum of Association', 'Board Resolution Templates', 'Incorporation Application Form', 'UBO Declaration F