In [1]:
import os
import json
import re
import docx
from docx import Document
import docx2txt
from ollama import Client
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

def read_docx(file_path):
    """
    Extract the non-empty paragraphs of a .docx file.
    Args:
        file_path (str): Path to the file to read.
    Returns:
        list: Stripped paragraph strings in document order, or an empty list
            when the extension is not '.docx', no text is found, or any
            error occurs while reading.
    """
    try:
        # Only .docx is handled here; legacy .doc files are rejected.
        if not file_path.endswith('.docx'):
            print(f"Unsupported file format: {file_path}")
            return []

        paragraphs = []
        for para in Document(file_path).paragraphs:
            stripped = para.text.strip()
            if stripped:
                paragraphs.append(stripped)

        if not paragraphs:
            print(f"No valid text extracted from {file_path}")
            return []

        # Show a short preview so the extraction can be eyeballed.
        print(f"Extracted text from {file_path}: {paragraphs[:3]}...")  #
        return paragraphs
    except Exception as e:
        # Best-effort reader: any failure is reported and treated as "no text".
        print(f"Error reading file {file_path}: {e}")
        return []

def classify_document(text, file_path):
    """
    Guess the document type from keywords in the text and filename.
    Args:
        text (list): Paragraph strings extracted from the file.
        file_path (str): Path to the file; its name is used as a hint.
    Returns:
        str: The first matching document type, or 'Unknown' when nothing
            matches.
    """
    known_types = (
        "Articles of Association",
        "Memorandum of Association",
        "Board Resolution Templates",
        "Shareholder Resolution Templates",
        "Incorporation Application Form",
        "UBO Declaration Form",
        "Register of Members and Directors",
        "Change of Registered Address Notice",
    )
    haystack = " ".join(text).lower()
    preview = haystack[:50]

    # A "resolution" filename plus a "shareholder" mention takes precedence
    # over the literal type-name search below.
    if "resolution" in file_path.lower() and "shareholder" in haystack:
        print(f"Detected document type: Shareholder Resolution Templates in {preview}...")  # Debug
        return "Shareholder Resolution Templates"

    # Otherwise take the first type whose full name appears in the text.
    matched = next((name for name in known_types if name.lower() in haystack), None)
    if matched is None:
        print(f"No document type detected in {preview}...")  # Debug
        return "Unknown"

    print(f"Detected document type: {matched} in {preview}...")  # Debug
    return matched

def define_checklists():
    """
    Build the required-document checklists, keyed by legal process.
    Returns:
        dict: Process name mapped to the list of document types it requires.
            A new dict and list are created on every call.
    """
    incorporation_docs = [
        "Articles of Association",
        "Memorandum of Association",
        "Board Resolution Templates",
        "Shareholder Resolution Templates",
        "Incorporation Application Form",
        "UBO Declaration Form",
        "Register of Members and Directors",
    ]
    return {"Company Incorporation": incorporation_docs}

def detect_process(documents):
    """
    Infer which legal process the uploaded documents belong to.
    Args:
        documents (list): Document types detected for the uploaded files.
    Returns:
        str: The first process whose checklist contains at least one of the
            given document types, or 'Unknown' if none does.
    """
    for process_name, checklist in define_checklists().items():
        # One recognised document is enough to pick the process.
        for doc in documents:
            if doc in checklist:
                print(f"Detected process: {process_name}")  # Debug
                return process_name

    print("No process detected")  # Debug
    return "Unknown"

def check_missing_documents(process, uploaded_docs):
    """
    List the required documents that have not been uploaded.
    Args:
        process (str): Detected legal process.
        uploaded_docs (list): Document types detected for the uploaded files.
    Returns:
        list: Required document types absent from uploaded_docs, in checklist
            order. Empty when the process has no checklist.
    """
    # An unrecognised process has no checklist, so nothing can be missing.
    required = define_checklists().get(process, [])

    missing = []
    for name in required:
        if name not in uploaded_docs:
            missing.append(name)

    print(f"Missing documents: {missing}")  # Debug
    return missing

def check_red_flags(text, doc_type):
    """
    Detect red flags in document text using regex rules and an LLM.

    Two rule-based checks run first (wrong jurisdiction, missing governing
    law), followed by an LLM check for ambiguous or non-binding wording on
    the first 500 characters of the text. If the LLM call fails for any
    reason, a low-severity issue is recorded instead of raising.

    Args:
        text (list): List of text strings from the file.
        doc_type (str): Document type, copied into each issue's "document".
    Returns:
        list: Issue dicts with the keys "document", "section", "issue",
            "severity" and "suggestion".
    """
    issues = []
    full_text = " ".join(text)

    def add_issue(issue, severity, suggestion):
        # Every issue shares the same shape; the section is never located.
        issues.append({
            "document": doc_type,
            "section": "Unknown",
            "issue": issue,
            "severity": severity,
            "suggestion": suggestion
        })

    # Rule-based check: Incorrect jurisdiction
    if re.search(r"UAE Federal Courts", full_text, re.IGNORECASE):
        add_issue(
            "Jurisdiction clause does not specify ADGM",
            "High",
            "Update jurisdiction to ADGM Courts."
        )

    # Rule-based check: Missing governing law (constitutional documents only)
    needs_governing_law = doc_type in ["Articles of Association", "Memorandum of Association"]
    if needs_governing_law and not re.search(r"governing law|ADGM regulations", full_text, re.IGNORECASE):
        add_issue(
            "No governing law clause found",
            "Medium",
            "Add clause specifying ADGM regulations as governing law."
        )

    # LLM-based check: Ambiguous language
    try:
        client = Client()
        prompt = (
            f"Analyze the following legal document text for ambiguous or non-binding language "
            f"(e.g., use of 'may' instead of 'shall'). Provide a brief explanation if found, "
            f"and suggest a fix. If none, say 'No ambiguous language detected.' Text: {full_text[:500]}"
        )
        response = client.generate(model="llama3.1:8b", prompt=prompt)
        llm_response = response["response"]
        print(f"LLM response for {doc_type}: {llm_response[:100]}...")  # Debug

        verdict = llm_response.lower()
        # The prompt asks the model to answer 'No ambiguous language detected.'
        # for clean text. That sentence itself contains "ambiguous", so it must
        # be ruled out before the keyword test or every clean document is flagged.
        reported_clean = "no ambiguous language detected" in verdict
        if not reported_clean and ("ambiguous" in verdict or "non-binding" in verdict):
            add_issue(
                "Ambiguous or non-binding language detected",
                "Medium",
                llm_response if len(llm_response) < 200 else llm_response[:200] + "..."
            )
    except Exception as e:
        # Best-effort: keep the rule-based findings and note that the LLM
        # check did not run rather than failing the whole document.
        print(f"LLM error for {doc_type}: {e}. Skipping LLM checks.")
        add_issue(
            "LLM check failed due to connection error",
            "Low",
            "Ensure Ollama is running and retry."
        )

    return issues

def create_adgm_corpus():
    """
    Return the mock ADGM regulation snippets used as the RAG corpus.
    Returns:
        list: Regulation sentences; a new list is built on each call.
    """
    regulations = (
        "ADGM companies must specify ADGM Courts as the governing jurisdiction.",
        "Articles of Association must include a clause on share capital allocation.",
        "Memorandum of Association requires a clear statement of company objectives under ADGM regulations.",
        "All legal documents must use binding language, e.g., 'shall' instead of 'may'.",
        "Shareholder Resolution Templates must confirm the appointment of directors and authorized signatories.",
    )
    return list(regulations)

def setup_rag():
    """
    Prepare the retrieval components: an embedding model and a FAISS index
    built over the ADGM corpus.
    Returns:
        tuple: (model, index, corpus). When setup fails, model and index are
            None and only the corpus is returned.
    """
    try:
        encoder = SentenceTransformer('all-MiniLM-L6-v2')
        regulations = create_adgm_corpus()
        vectors = encoder.encode(regulations)
        # The index dimension is taken from the embedding width.
        store = faiss.IndexFlatL2(vectors.shape[1])
        store.add(vectors)
    except Exception as e:
        # Callers treat a None model/index as "RAG unavailable".
        print(f"RAG setup error: {e}. Using regex checks only.")
        return None, None, create_adgm_corpus()
    else:
        return encoder, store, regulations

def check_compliance(text, doc_type, model, index, corpus):
    """
    Check a document against the most relevant ADGM regulations using
    retrieval plus an LLM.
    Args:
        text (list): Document text; only the first 500 characters of the
            joined text are used.
        doc_type (str): Document type, copied into each issue's "document".
        model: Embedding model exposing encode().
        index: Vector index exposing search().
        corpus: Regulation texts indexed by the positions search() returns.
    Returns:
        list: Compliance issue dicts. A low-severity issue is returned instead
            if any step of the check raises.
    """
    excerpt = " ".join(text)[:500]
    findings = []

    def record(issue, severity, suggestion):
        findings.append({
            "document": doc_type,
            "section": "Unknown",
            "issue": issue,
            "severity": severity,
            "suggestion": suggestion
        })

    try:
        # Embed the excerpt and look up the two nearest regulations.
        query_vector = model.encode([excerpt])[0]
        _, neighbour_ids = index.search(np.array([query_vector]), k=2)
        relevant_regs = [corpus[i] for i in neighbour_ids[0]]

        # Ask the LLM to judge the excerpt against those regulations.
        llm = Client()
        prompt = (
            f"Check if the following document text complies with these ADGM regulations: {relevant_regs}. "
            f"List any non-compliance issues and suggest fixes. Text: {excerpt}"
        )
        llm_response = llm.generate(model="llama3.1:8b", prompt=prompt)["response"]
        print(f"RAG LLM response for {doc_type}: {llm_response[:100]}...")  # Debug

        if "non-compliance" in llm_response.lower():
            if len(llm_response) < 200:
                suggestion = llm_response
            else:
                suggestion = llm_response[:200] + "..."
            record("Non-compliance with ADGM regulations", "High", suggestion)
    except Exception as e:
        print(f"RAG error for {doc_type}: {e}. Skipping RAG checks.")
        record(
            "RAG check failed due to LLM connection error",
            "Low",
            "Ensure Ollama is running and retry."
        )

    return findings

def process_documents(file_paths):
    """
    Process multiple .docx files: classify each one, collect red flags and
    compliance issues, and compare the set against the process checklist.

    The result is also written to "compliance_output.json" in the current
    working directory.

    Args:
        file_paths (list): List of paths to files. Missing or unreadable
            files are reported and skipped.
    Returns:
        dict: With the keys
            "process": detected process or "Unknown";
            "documents_uploaded": unique detected types, in first-seen order;
            "documents_uploaded_count": number of unique detected types;
            "required_documents_count": checklist size (0 when the process
                is unknown);
            "missing_documents": required types not uploaded;
            "issues_found": all issue dicts from every document;
            "message": human-readable summary.
    """
    # Setup RAG
    model, index, corpus = setup_rag()
    # Compare against None explicitly so the decision does not depend on how
    # the model/index objects define truthiness.
    rag_available = model is not None and index is not None

    uploaded_docs = []
    all_issues = []
    for file_path in file_paths:
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue
        text = read_docx(file_path)
        if not text:
            continue
        doc_type = classify_document(text, file_path)
        # Allow processing even if same doc_type (log warning)
        if doc_type in uploaded_docs:
            print(f"Warning: Duplicate document type {doc_type} for {file_path}. Processing anyway.")
        uploaded_docs.append(doc_type)
        # Red flag detection
        all_issues.extend(check_red_flags(text, doc_type))
        # Compliance check with RAG
        if rag_available:
            all_issues.extend(check_compliance(text, doc_type, model, index, corpus))

    process = detect_process(uploaded_docs)
    missing_docs = check_missing_documents(process, uploaded_docs)

    # Compute the counts before branching: the result dict needs them for
    # every outcome, including an "Unknown" process.
    required_count = len(define_checklists().get(process, []))
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # JSON output is stable between runs.
    unique_docs = list(dict.fromkeys(uploaded_docs))
    uploaded_count = len(unique_docs)

    if process == "Unknown":
        message = "Unable to detect legal process. Please upload relevant ADGM documents."
    elif not missing_docs:
        message = f"It appears that you’re trying to incorporate a company in ADGM. All required documents ({required_count}) have been uploaded."
    else:
        # Count only required types that were actually supplied; unrecognised
        # or non-checklist uploads must not inflate "X out of Y required".
        matched_count = required_count - len(missing_docs)
        missing_str = ", ".join(missing_docs)
        message = (f"It appears that you’re trying to incorporate a company in ADGM. "
                   f"Based on our reference list, you have uploaded {matched_count} out of {required_count} required documents. "
                   f"The missing document(s): {missing_str}.")

    result = {
        "process": process,
        "documents_uploaded": unique_docs,
        "documents_uploaded_count": uploaded_count,
        "required_documents_count": required_count,
        "missing_documents": missing_docs,
        "issues_found": all_issues,
        "message": message
    }

    with open("compliance_output.json", "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2)

    return result

# Demo run against two sample templates, referenced by relative path.
sample_files = [
    "adgm-ra-model-articles-private-company-limited-by-shares.docx",
    "adgm-ra-resolution-multiple-incorporate-shareholders-LTD-incorporation-v2.docx",
]
result = process_documents(sample_files)

# Print each result field next to its label, in a fixed order.
for label, key in (
    ("Process:", "process"),
    ("Uploaded Documents:", "documents_uploaded"),
    ("Uploaded Count:", "documents_uploaded_count"),
    ("Required Count:", "required_documents_count"),
    ("Missing Documents:", "missing_documents"),
    ("Issues Found:", "issues_found"),
    ("Message:", "message"),
):
    print(label, result[key])
print("Output saved to compliance_output.json")


Extracted text from adgm-ra-model-articles-private-company-limited-by-shares.docx: ['ARTICLES OF ASSOCIATION', 'PRIVATE COMPANY LIMITED BY SHARES']...
Detected document type: Articles of Association in articles of association private company limited by...
LLM response for Articles of Association: Here's the analysis of the provided text:

1. **"Defined terms Liability of members"**: This phrase ...
RAG LLM response for Articles of Association: After reviewing the document text, I found the following non-compliance issues with the ADGM regulat...
Extracted text from adgm-ra-resolution-multiple-incorporate-shareholders-LTD-incorporation-v2.docx: ['RESOLUTION OF INCORPORATING SHAREHOLDERS', 'OF']...
Detected document type: Shareholder Resolution Templates in resolution of incorporating shareholders of [inser...
LLM response for Shareholder Resolution Templates: After analyzing the provided text, I have found several instances of ambiguous or non-binding langua...
RAG LLM response for Sha