In [1]:
from crewai import Agent, Task, Crew
from langchain_community.chat_models import ChatCohere
from langchain.document_loaders import PyPDFLoader
import os

In [2]:
# Warning control
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Initialize language model from Cohere
os.environ["COHERE_API_KEY"] = os.getenv('COHERE_API_KEY')
llm = ChatCohere()

# Agents

In [4]:
# Agent 1: Extractor - Reads and extracts text from the PDF
extractor_agent = Agent(
    role="PDF Extractor",
    goal="Extract text from contracts in PDF format",
    backstory="An expert in document processing and OCR technologies",
    llm=llm
)

# Agent 2: Contract Analyst - Identifies key elements from the contract
contract_analyst_agent = Agent(
    role="Contract Analyst",
    goal="Extract key elements from the contract text",
    backstory="A legal expert with deep knowledge of contract terms and conditions",
    llm=llm
)

# Agent 3: Output Formatter - Formats extracted data into structured output
formatter_agent = Agent(
    role="Structured Output Formatter",
    goal="Format extracted contract elements into a JSON output",
    backstory="A skilled data processor who ensures structured contract analysis",
    llm=llm
)

# Tasks

In [9]:
extract_text_task = Task(
    description="Extract all text from the given contract PDF and return as plain text.",
    agent=extractor_agent,
    expected_output="A string containing the full text of the contract in a human-readable format."
)

analyze_contract_task = Task(
    description="""Analyze the extracted contract text and extract the following key elements:
    - Document Name
    - Parties
    - Agreement Date
    - Effective Date
    - Expiration Date
    - Renewal Term
    - Notice to Terminate Renewal
    - Governing Law
    - Most Favored Nation
    
    Provide the output as a structured JSON object with each category as a key and the corresponding extracted value.
    """,
    agent=contract_analyst_agent,
    expected_output="""A structured JSON object, for example:
    {
        "Document Name": "Master Service Agreement - Company X & Company Y",
        "Parties": ["Company X", "Company Y"],
        "Agreement Date": "01/15/2023",
        "Effective Date": "02/01/2023",
        "Expiration Date": "02/01/2028",
        "Renewal Term": "Successive 1-year terms",
        "Notice to Terminate Renewal": "90 days prior notice",
        "Governing Law": "California, USA",
        "Most Favored Nation": "Yes"
    }
    """
)

format_output_task = Task(
    description="Format the extracted contract elements into a clean, structured JSON output that is human-readable and properly formatted.",
    agent=formatter_agent,
    expected_output="""A formatted JSON string ensuring proper indentation and readability, for example:
    {
        "Document Name": "Master Service Agreement - Company X & Company Y",
        "Parties": [
            "Company X",
            "Company Y"
        ],
        "Agreement Date": "01/15/2023",
        "Effective Date": "02/01/2023",
        "Expiration Date": "02/01/2028",
        "Renewal Term": "Successive 1-year terms",
        "Notice to Terminate Renewal": "90 days prior notice",
        "Governing Law": "California, USA",
        "Most Favored Nation": "Yes"
    }
    """
)

# Assemble the Crew

In [15]:
crew = Crew(
    agents=[extractor_agent, contract_analyst_agent, formatter_agent],
    tasks=[extract_text_task, analyze_contract_task, format_output_task]
)



# Run It

In [16]:
# Function to process the contract PDF
def process_contract(pdf_path):
    # Load and extract text from the PDF
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    contract_text = "\n".join([doc.page_content for doc in documents])

    # Ensure we pass the extracted text properly
    result = crew.kickoff(inputs={"contract_text": contract_text})  # Pass as a dictionary

    return result

In [17]:
# Example usage: Replace 'contract.pdf' with your actual contract file
pdf_path = "CUAD_v1/full_contract_pdf/Part_I/Maintenance/AtnInternationalInc_20191108_10-Q_EX-10.1_11878541_EX-10.1_Maintenance Agreement.pdf"  # Provide your contract PDF path
output = process_contract(pdf_path)

KeyError: '\n        "Document Name"'

In [None]:
# Print and save the extracted contract details
print(json.dumps(output, indent=4))
with open("extracted_contract.json", "w") as f:
    json.dump(output, f, indent=4)