In [2]:
import docx
import os
os.chdir("../../")

def parse_contract(file_path, chunk_size=1000):
  """
  Parses a .docx file, preserving section numbering, bullet points, and chunking the text.
  This version aims to be more generic, handling different document styles and numbering schemes.

  Args:
    file_path (str): The path to the .docx file.
    chunk_size (int): The desired maximum size of text chunks (in characters).

  Returns:
    list: A list of dictionaries, each representing a section with content and numbering.
  """
  doc = docx.Document(file_path)
  sections = []
  current_section = {"numbering": "", "content": ""}
  current_text = ""

  for para in doc.paragraphs:
    # Check if paragraph is a heading
    if para.style.name.startswith('Heading'):
      if current_section["content"]:
        sections.append(current_section)
        current_section = {"numbering": "", "content": ""}
      current_section["numbering"] = para.text
    else:
      # Handle list numbering and bullet points
      prefix = ""
      if para.style.name.startswith('List'):
        # This is a simplified approach; you might need more complex logic here
        prefix = "* " if para.style.name.startswith('ListBullet') else "# "
      
      current_text += prefix + para.text + "\n"

      # Chunk if text size exceeds limit
      if len(current_text) >= chunk_size:
        if current_section["content"]:
          sections.append(current_section)
          current_section = {"numbering": "", "content": ""}
        current_section["content"] = current_text
        current_text = ""

  # Append the last section if it has content
  if current_section["content"]:
    sections.append(current_section)

  return sections

# Example usage
contract_file = "data/content/Robinson Advisory.docx"
parsed_sections = parse_contract(contract_file)

for section in parsed_sections:
  print(section["numbering"])
  # Further process or analyze section["content"] as needed













In [2]:
import docx

def parse_contract(file_path):
    """
    Parses a .docx legal contract, preserving all numbering, styles, and formatting.

    Args:
        file_path (str): The path to the .docx file.

    Returns:
        list: A list of dictionaries, each representing a paragraph with its content, style, and numbering info.
    """
    doc = docx.Document(file_path)
    paragraphs = []
    
    for para in doc.paragraphs:
        paragraph_info = {
            "text": para.text,
            "style": para.style.name
        }

        # Extract numbering information (if present)
        if para.paragraph_format.list_format:
            paragraph_info["numbering"] = {
                "level": para.paragraph_format.list_format.list_level,
                "num_id": para.paragraph_format.list_format.num_id,
                "style": para.paragraph_format.list_format.list_definition.name  # Get numbering style name
            }
        else:
            paragraph_info["numbering"] = None

        paragraphs.append(paragraph_info)
    
    return paragraphs

# Example usage
contract_file = "data/content/Robinson Advisory.docx"
parsed_paragraphs = parse_contract(contract_file)

# Print the parsed paragraphs
for para in parsed_paragraphs:
    if para["numbering"]:
        print(f"Numbering Level: {para['numbering']['level']}, Style: {para['numbering']['style']}, Text: {para['text']}")
    else:
        print(f"Style: {para['style']}, Text: {para['text']}")


AttributeError: 'ParagraphFormat' object has no attribute 'list_format'

#### Using langchain to parse docx

In [2]:
%pip install --upgrade --quiet  docx2txt

Note: you may need to restart the kernel to use updated packages.


In [1]:
from langchain.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter  
import docx
import os
os.chdir("../../")

def parse_contract_with_langchain(file_path, chunk_size=1000, chunk_overlap=20):
    """
    Parses a .docx legal contract using LangChain, handling generic structures.

    Args:
        file_path (str): The path to the .docx file.
        chunk_size (int): The desired maximum size of text chunks (in characters).
        chunk_overlap (int): The number of overlapping characters between chunks.

    Returns:
        list: A list of dictionaries, each representing a text chunk with its page content.
    """
    loader = Docx2txtLoader(file_path)
    documents = loader.load()  # Load the document and extract text
    
    # Split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_docs = text_splitter.split_documents(documents)
    
    # Extract relevant information from each chunk
    parsed_sections = []
    for doc in split_docs:
        # Use regular expression if you need to extract specific patterns
        # e.g., section_pattern = r"((Section|Clause|Sub-clause|Para)\s+\d+(\.\d+)*)"
        # match = re.search(section_pattern, doc.page_content)

        parsed_sections.append({
            # "numbering": match.group(0) if match else None,  # Add numbering if needed
            "page_content": doc.page_content
        })
    
    return parsed_sections


# Example usage
contract_file = "data/content/Robinson Advisory.docx"
parsed_sections = parse_contract_with_langchain(contract_file)

for section in parsed_sections:
    #print(section.get("numbering", "")) 
    print(section["page_content"])


- 2-



ADVISORY SERVICES AGREEMENT



This Advisory Services Agreement is entered into as of June 15th, 2023 (the “Effective Date”), by and between Cloud Investments Ltd., ID 51-426526-3, an Israeli company (the "Company"), and Mr. Jack Robinson, Passport Number 780055578, residing at 1 Rabin st, Tel Aviv, Israel, Email: jackrobinson@gmail.com ("Advisor").



Whereas,	Advisor has expertise and/or knowledge and/or relationships, which are relevant to the Company’s business and the Company has asked Advisor to provide it with certain Advisory services, as described in this Agreement; and

Whereas, 	Advisor has agreed to provide the Company with such services, subject to the terms set forth in this Agreement.



NOW THEREFORE THE PARTIES AGREE AS FOLLOWS:



Services:
Services:  

Advisor shall provide to the Company, as an independent contractor, software development services, and / or any other services as agreed by the parties from time to time (the “Services”). Advisor shall not appo