In [2]:
import os
import json
import re
import docx
from docx import Document
import docx2txt
from ollama import Client
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gradio as gr

# Suppress TensorFlow warning
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

def read_docx(file_path):
    """Extract the non-empty text lines of a Word document.

    Args:
        file_path: Path to a ``.docx`` or ``.doc`` file. The extension is
            matched case-insensitively, so ``REPORT.DOCX`` is accepted.

    Returns:
        A ``(text, doc)`` tuple. ``text`` is a list of
        ``(index, stripped_text)`` pairs for every non-blank paragraph
        (``.docx``) or line (``.doc``); ``index`` is the position in the
        unfiltered sequence, so indices may have gaps. ``doc`` is the loaded
        ``Document`` for ``.docx`` input and ``None`` otherwise. Any failure,
        including an unsupported extension, yields ``([], None)`` after
        printing a diagnostic.
    """
    try:
        # Lower-case only for the comparison; the original path is still used
        # for opening the file and in messages.
        extension_source = file_path.lower()
        if extension_source.endswith('.docx'):
            doc = Document(file_path)
            text = [(i, para.text.strip()) for i, para in enumerate(doc.paragraphs) if para.text.strip()]
            return text, doc
        elif extension_source.endswith('.doc'):
            # NOTE(review): docx2txt targets the .docx format; confirm it can
            # actually read legacy binary .doc files. Failures are caught below.
            text = docx2txt.process(file_path)
            # Drop blank lines, underscore rules and embedded-object markers.
            text = [(i, line.strip()) for i, line in enumerate(text.split('\n')) if
                    line.strip() and not line.startswith('_') and 'EMBED' not in line]
            return text, None
        else:
            print(f"Unsupported file format: {file_path}")
            return [], None
    except Exception as e:
        # Best-effort reader: one unreadable file must not abort the whole batch.
        print(f"Error reading file {file_path}: {e}")
        return [], None

def classify_document(text, file_path):
    """Guess the document type from its text and file name.

    Args:
        text: List of ``(index, paragraph_text)`` pairs.
        file_path: Path of the source file; only its lower-cased form is
            inspected, for the word "resolution".

    Returns:
        "Shareholder Resolution Templates" when the file name contains
        "resolution" and the text mentions "shareholder"; otherwise the first
        known type whose name occurs verbatim (case-insensitively) in the
        text; otherwise "Unknown". A one-line diagnostic is printed either way.
    """
    known_types = (
        "Articles of Association",
        "Memorandum of Association",
        "Board Resolution Templates",
        "Shareholder Resolution Templates",
        "Incorporation Application Form",
        "UBO Declaration Form",
        "Register of Members and Directors",
        "Change of Registered Address Notice",
        "Licensing Application",
        "HR Contract",
    )
    body = " ".join(fragment for _, fragment in text).lower()
    preview = body[:50]

    # The file-name heuristic takes precedence over the literal title scan.
    if "resolution" in file_path.lower() and "shareholder" in body:
        matched = "Shareholder Resolution Templates"
    else:
        matched = next((name for name in known_types if name.lower() in body), None)

    if matched is None:
        print(f"No document type detected in {preview}...")
        return "Unknown"

    print(f"Detected document type: {matched} in {preview}...")
    return matched

def define_checklists():
    """Return the required-document checklist for each supported process.

    Returns:
        A new dict mapping process name to a new list of required document
        type names. Insertion order is "Company Incorporation", "Licensing",
        "HR Contracts".
    """
    incorporation_docs = [
        "Articles of Association",
        "Memorandum of Association",
        "Board Resolution Templates",
        "Shareholder Resolution Templates",
        "Incorporation Application Form",
        "UBO Declaration Form",
        "Register of Members and Directors",
    ]
    licensing_docs = [
        "Licensing Application",
        "Articles of Association",
        "UBO Declaration Form",
    ]
    hr_docs = [
        "HR Contract",
        "Register of Members and Directors",
    ]

    checklists = {}
    checklists["Company Incorporation"] = incorporation_docs
    checklists["Licensing"] = licensing_docs
    checklists["HR Contracts"] = hr_docs
    return checklists

def detect_process(documents):
    """Infer which legal process the uploaded documents belong to.

    The process whose checklist shares the most document types with
    ``documents`` wins. Ties go to the process listed first by
    ``define_checklists()``. Returning merely the first checklist with any
    overlap would label every upload containing e.g. "Articles of
    Association" as "Company Incorporation", even when more of its documents
    match "Licensing".

    Args:
        documents: Iterable of detected document type names.

    Returns:
        The best-matching process name, or "Unknown" when no checklist
        contains any of the documents.
    """
    checklists = define_checklists()
    uploaded = set(documents)

    best_process = "Unknown"
    best_overlap = 0
    for process, required_docs in checklists.items():
        overlap = sum(1 for doc in required_docs if doc in uploaded)
        # Strict ">" keeps the earliest checklist on ties.
        if overlap > best_overlap:
            best_process, best_overlap = process, overlap

    if best_overlap == 0:
        print("No process detected")
        return "Unknown"

    print(f"Detected process: {best_process}")
    return best_process

def check_missing_documents(process, uploaded_docs):
    """List the checklist documents for ``process`` that were not uploaded.

    Args:
        process: Process name used to look up the checklist; an unknown name
            is treated as having no required documents.
        uploaded_docs: Collection of uploaded document type names.

    Returns:
        The missing document type names, in checklist order. The list is
        also printed.
    """
    required = define_checklists().get(process, [])

    missing = []
    for name in required:
        if name not in uploaded_docs:
            missing.append(name)

    print(f"Missing documents: {missing}")
    return missing

def check_red_flags(text, doc_type, doc):
    """Run regex checks and an LLM ambiguity check over one document.

    Args:
        text: List of ``(index, paragraph_text)`` pairs.
        doc_type: Detected document type; copied into every issue and used
            to decide whether the governing-law check applies.
        doc: Accepted for interface compatibility; not used by this function.

    Returns:
        A list of issue dicts with the keys ``document``, ``section``,
        ``issue``, ``severity`` and ``suggestion``. If the LLM call fails for
        any reason a low-severity "LLM check failed" issue is appended instead
        of raising.
    """
    issues = []

    # Map each paragraph index to the most recent heading seen at or before it.
    heading_pattern = re.compile(
        r"(PART \d+[A-Z]?|Clause \d+\.\d+|Section \d+\.\d+|Article \d+)", re.IGNORECASE
    )
    section_map = {}
    current_section = "Unknown"
    for i, line in text:
        if heading_pattern.match(line):
            current_section = line.strip()
        section_map[i] = current_section

    # Only the first mention of the wrong court system is reported.
    for i, line in text:
        if re.search(r"UAE Federal Courts", line, re.IGNORECASE):
            issues.append({
                "document": doc_type,
                "section": section_map[i],
                "issue": "Jurisdiction clause does not specify ADGM",
                "severity": "High",
                "suggestion": "Update jurisdiction to ADGM Courts."
            })
            break

    if doc_type in ["Articles of Association", "Memorandum of Association"]:
        governing_law_found = any(re.search(r"governing law|ADGM regulations", line, re.IGNORECASE) for _, line in text)
        if not governing_law_found:
            issues.append({
                "document": doc_type,
                "section": section_map.get(0, "Unknown"),
                "issue": "No governing law clause found",
                "severity": "Medium",
                "suggestion": "Add clause specifying ADGM regulations as governing law."
            })

    try:
        client = Client()
        # Only the first 500 characters are sent to the model.
        full_text = " ".join([t for _, t in text])[:500]
        prompt = (
            f"Analyze the following legal document text for ambiguous or non-binding language "
            f"(e.g., use of 'may' instead of 'shall'). Identify the specific sentence or phrase, "
            f"its approximate location (paragraph number), and suggest a fix. "
            f"If none, say 'No ambiguous language detected.' Text: {full_text}"
        )
        response = client.generate(model="llama3.1:8b", prompt=prompt)
        llm_response = response["response"]
        print(f"LLM response for {doc_type}: {llm_response[:100]}...")

        lowered_response = llm_response.lower()
        # The "nothing found" sentinel requested in the prompt itself contains
        # the word "ambiguous", so it must be ruled out before keyword matching
        # or every clean document would be flagged.
        reported_clean = "no ambiguous language detected" in lowered_response
        if not reported_clean and ("ambiguous" in lowered_response or "non-binding" in lowered_response):
            para_match = re.search(r"paragraph (\d+)", llm_response, re.IGNORECASE)
            para_index = int(para_match.group(1)) if para_match else 0
            # Paragraph indices can have gaps (blank paragraphs are dropped),
            # so validate against the known indices rather than len(text).
            if para_index not in section_map:
                para_index = 0
            issues.append({
                "document": doc_type,
                "section": section_map.get(para_index, "Unknown"),
                "issue": "Ambiguous or non-binding language detected",
                "severity": "Medium",
                "suggestion": llm_response if len(llm_response) < 200 else llm_response[:200] + "..."
            })
    except Exception as e:
        # Best-effort: the regex findings above are still returned.
        print(f"LLM error for {doc_type}: {e}. Skipping LLM checks.")
        issues.append({
            "document": doc_type,
            "section": section_map.get(0, "Unknown"),
            "issue": "LLM check failed due to connection error",
            "severity": "Low",
            "suggestion": "Ensure Ollama is running and retry."
        })
    return issues

def create_adgm_corpus():
    """Return the built-in ADGM reference snippets used for retrieval.

    Returns:
        A new list of 16 short regulation summaries, in a fixed order.
    """
    snippets = (
        "ADGM companies must specify ADGM Courts as the governing jurisdiction...",
        "Articles of Association must include a clause on share capital allocation...",
        "Memorandum of Association must include a clear statement of company objectives...",
        "All legal documents must use binding language, e.g., 'shall' instead of 'may'...",
        "Shareholder Resolution Templates must confirm the appointment of directors...",
        "Memorandum of Association must specify the company’s registered office in ADGM...",
        "Directors’ appointments must include at least one natural person...",
        "Financial Services and Markets Regulations (FSMR) establish the legislative framework...",
        "ADGM Courts Procedure Rules 2016 must be followed for all legal proceedings...",
        "Employment contracts must comply with ADGM Employment Regulations 2024...",
        "Licensing applications must include a business plan and financial projections...",
        "Whistleblowing frameworks must be implemented...",
        "Shareholder resolutions amending Articles of Association must comply...",
        "ADGM registered entities must obtain Abu Dhabi Government Authorities approval...",
        "Annual accounts must be filed by ADGM entities...",
        "Data protection policies must adhere to ADGM Data Protection Regulations 2021...",
    )
    # Hand each caller its own list.
    return list(snippets)

def setup_rag():
    """Build the sentence encoder and FAISS index over the ADGM corpus.

    Returns:
        ``(model, index, corpus)``. If anything fails while loading the
        model or building the index, a message is printed and
        ``(None, None, corpus)`` is returned instead.
    """
    try:
        encoder = SentenceTransformer('all-MiniLM-L6-v2')
        passages = create_adgm_corpus()
        vectors = encoder.encode(passages)
        # The index dimensionality is taken from the encoder output width.
        store = faiss.IndexFlatL2(vectors.shape[1])
        store.add(vectors)
    except Exception as e:
        print(f"RAG setup error: {e}. Using regex checks only.")
        return None, None, create_adgm_corpus()
    else:
        return encoder, store, passages

def check_compliance(text, doc_type, model, index, corpus, doc):
    """Ask the LLM whether the document complies with the closest regulations.

    The first 500 characters of the document are embedded with ``model``,
    the two nearest ``corpus`` entries are looked up in ``index``, and the
    LLM is asked to list non-compliance issues against them.

    Args:
        text: List of ``(index, paragraph_text)`` pairs.
        doc_type: Detected document type, copied into every issue.
        model: Encoder exposing ``encode(list_of_str)``.
        index: Vector index exposing ``search(vectors, k=...)``.
        corpus: Sequence of regulation snippets addressed by the positions
            that ``index.search`` returns.
        doc: Accepted for interface compatibility; not used by this function.

    Returns:
        A list of issue dicts with the keys ``document``, ``section``,
        ``issue``, ``severity`` and ``suggestion``. Any failure produces a
        single low-severity "RAG check failed" issue instead of raising.
    """
    issues = []
    full_text = " ".join([t for _, t in text])[:500]

    # Map each paragraph index to the most recent heading seen at or before it.
    heading_pattern = re.compile(
        r"(PART \d+[A-Z]?|Clause \d+\.\d+|Section \d+\.\d+|Article \d+)", re.IGNORECASE
    )
    section_map = {}
    current_section = "Unknown"
    for i, line in text:
        if heading_pattern.match(line):
            current_section = line.strip()
        section_map[i] = current_section

    try:
        doc_embedding = model.encode([full_text])[0]
        _, indices = index.search(np.array([doc_embedding]), k=2)
        relevant_regs = [corpus[i] for i in indices[0]]
        client = Client()
        prompt = (
            f"Check if the following document text complies with these ADGM regulations: {relevant_regs}. "
            f"List any non-compliance issues, specifying the exact sentence or phrase, its paragraph number, "
            f"and a suggested fix. If none, say 'No non-compliance issues detected.' Text: {full_text}"
        )
        response = client.generate(model="llama3.1:8b", prompt=prompt)
        llm_response = response["response"]
        print(f"RAG LLM response for {doc_type}: {llm_response[:100]}...")

        for issue_line in llm_response.split('\n'):
            lowered_line = issue_line.lower()
            if "non-compliance" not in lowered_line:
                continue
            # Skip lines that report the absence of problems (including the
            # sentinel requested in the prompt) and the bare section header;
            # they mention "non-compliance" but are not findings.
            if (
                "no non-compliance" in lowered_line
                or "no other non-compliance" in lowered_line
                or issue_line.strip() == "**Non-compliance issues:**"
            ):
                continue

            para_match = re.search(r"paragraph (\d+)", issue_line, re.IGNORECASE)
            para_index = int(para_match.group(1)) if para_match else 0
            # Paragraph indices can have gaps (blank paragraphs are dropped),
            # so validate against the known indices rather than len(text).
            if para_index not in section_map:
                para_index = 0

            issues.append({
                "document": doc_type,
                "section": section_map.get(para_index, "Unknown"),
                "issue": "Non-compliance with ADGM regulations",
                "severity": "High",
                "suggestion": issue_line if len(issue_line) < 200 else issue_line[:200] + "..."
            })
    except Exception as e:
        # Best-effort: a retrieval or LLM failure must not abort the batch.
        print(f"RAG error for {doc_type}: {e}. Skipping RAG checks.")
        issues.append({
            "document": doc_type,
            "section": section_map.get(0, "Unknown"),
            "issue": "RAG check failed due to LLM connection error",
            "severity": "Low",
            "suggestion": "Ensure Ollama is running and retry."
        })
    return issues

def process_documents(file_paths):
    """Analyse a batch of documents and write a compliance summary.

    For every existing, readable file the document type is classified, the
    red-flag checks run, and (when the RAG components loaded) the compliance
    check runs too. ``.docx`` inputs are re-saved as ``commented_<basename>``
    in the current working directory. The summary is also written to
    ``compliance_output.json`` in the current working directory.

    Args:
        file_paths: Iterable of paths to the documents to analyse.

    Returns:
        ``(result, commented_files)`` where ``result`` is a dict with the
        keys ``process``, ``documents_uploaded``,
        ``documents_uploaded_count``, ``required_documents_count``,
        ``missing_documents``, ``issues_found`` and ``message``, and
        ``commented_files`` lists the saved ``commented_*`` paths.
    """
    model, index, corpus = setup_rag()
    # Explicit None checks: truthiness of third-party model/index objects is
    # not a reliable "was it created" signal.
    rag_available = model is not None and index is not None

    uploaded_docs = []
    all_issues = []
    commented_files = []
    for file_path in file_paths:
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue
        text, doc = read_docx(file_path)
        if not text:
            continue

        doc_type = classify_document(text, file_path)
        if doc_type in uploaded_docs:
            print(f"Warning: Duplicate document type {doc_type} for {file_path}. Processing anyway.")
        uploaded_docs.append(doc_type)

        issues = check_red_flags(text, doc_type, doc)
        if rag_available:
            issues.extend(check_compliance(text, doc_type, model, index, corpus, doc))
        all_issues.extend(issues)

        if doc is not None:
            commented_path = f"commented_{os.path.basename(file_path)}"
            doc.save(commented_path)
            commented_files.append(commented_path)

    process = detect_process(uploaded_docs)
    missing_docs = check_missing_documents(process, uploaded_docs)
    # De-duplicate while keeping first-seen order so the output is stable.
    unique_docs = list(dict.fromkeys(uploaded_docs))

    if process == "Unknown":
        message = "Unable to detect legal process. Please upload relevant ADGM documents."
        required_count = uploaded_count = 0
    else:
        required_count = len(define_checklists().get(process, []))
        # Count only the required documents that are present, so the message
        # agrees with the missing list even when extra or unrecognised
        # documents were uploaded.
        uploaded_count = required_count - len(missing_docs)
        if not missing_docs:
            message = f"It appears that you’re trying to {process.lower()}. All required documents ({required_count}) have been uploaded."
        else:
            missing_str = ", ".join(missing_docs)
            message = (f"It appears that you’re trying to {process.lower()}. "
                       f"Based on our reference list, you have uploaded {uploaded_count} out of {required_count} required documents. "
                       f"The missing document(s): {missing_str}.")

    result = {
        "process": process,
        "documents_uploaded": unique_docs,
        "documents_uploaded_count": len(unique_docs),
        "required_documents_count": required_count,
        "missing_documents": missing_docs,
        "issues_found": all_issues,
        "message": message
    }
    with open("compliance_output.json", "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2)
    return result, commented_files

def gradio_interface(files):
    """Gradio callback: analyse the uploaded files.

    Args:
        files: The value of the upload component: ``None``/empty when
            nothing was uploaded, otherwise a list whose items are either
            path strings or file-like objects exposing the path as ``.name``.

    Returns:
        ``(result, commented_files)`` from ``process_documents``, or
        ``({"error": "No files uploaded"}, [])`` when ``files`` is empty.
    """
    if not files:
        return {"error": "No files uploaded"}, []

    # Normalise to plain paths: some Gradio versions hand over temp-file
    # wrappers instead of strings. Real paths are passed through untouched
    # (a pathlib.Path also has ``.name``, but that is only the basename).
    file_paths = [
        item if isinstance(item, (str, os.PathLike)) else getattr(item, "name", item)
        for item in files
    ]
    result, commented_files = process_documents(file_paths)
    return result, commented_files

# Build the Gradio UI at import time; `demo` is launched from the
# `__main__` guard at the bottom of the file.
with gr.Blocks() as demo:
    gr.Markdown("# ADGM Corporate Agent Tool")
    gr.Markdown("Upload .docx files to check compliance with ADGM regulations.")
    # Multiple files may be selected; the picker is restricted to .docx.
    file_input = gr.File(file_count="multiple", file_types=[".docx"], label="Upload Documents")
    output_json = gr.JSON(label="Compliance Output")
    output_files = gr.File(label="Processed Documents")
    submit_button = gr.Button("Process Documents")
    # gradio_interface returns a (result, files) pair, routed to the JSON
    # viewer and the file output respectively.
    submit_button.click(fn=gradio_interface, inputs=file_input, outputs=[output_json, output_files])

if __name__ == "__main__":
    # Run the pipeline once on the bundled sample documents, print a summary
    # of the result, then start the web UI.
    sample_files = [
        "adgm-ra-model-articles-private-company-limited-by-shares.docx",
        "adgm-ra-resolution-multiple-incorporate-shareholders-LTD-incorporation-v2.docx"
    ]
    result, commented_files = process_documents(sample_files)

    # (label, result key) pairs, printed in this order.
    summary_fields = (
        ("Process:", "process"),
        ("Uploaded Documents:", "documents_uploaded"),
        ("Uploaded Count:", "documents_uploaded_count"),
        ("Required Count:", "required_documents_count"),
        ("Missing Documents:", "missing_documents"),
        ("Issues Found:", "issues_found"),
        ("Message:", "message"),
    )
    for label, key in summary_fields:
        print(label, result[key])

    print("Commented Files:", commented_files)
    print("Output saved to compliance_output.json")
    demo.launch()


Detected document type: Articles of Association in articles of association private company limited by...
Error adding comment to paragraph 0: 'DocumentPart' object has no attribute 'comments_part'
LLM response for Articles of Association: Here's the analysis of ambiguous or non-binding language in the given legal document text:

1. **"Di...
Error adding comment to paragraph 2: 'DocumentPart' object has no attribute 'comments_part'
RAG LLM response for Articles of Association: After analyzing the document, I found some non-compliance issues with the ADGM regulations. Here are...
Error adding comment to paragraph 0: 'DocumentPart' object has no attribute 'comments_part'
Detected document type: Shareholder Resolution Templates in resolution of incorporating shareholders of [inser...
LLM response for Shareholder Resolution Templates: After analyzing the text, I've identified two potential issues with ambiguous language:

1. **Approx...
Error adding comment to paragraph 2: 'DocumentPart' ob