<a href="https://colab.research.google.com/github/larry-tableau/tableau/blob/main/CrewAI_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Refresh the apt package index on the runtime.
# NOTE(review): no apt package is installed in the visible cells — confirm this step is still needed.
!sudo apt-get update
# Quietly install the Python dependencies used by the cells below.
!pip --quiet install google-api-python-client
!pip --quiet install langchain-openai
!pip --quiet install --upgrade anthropic langchain-anthropic
!pip --quiet install crewai
# NOTE(review): presumably the [tools] extra is what provides `crewai_tools` — verify against the CrewAI install docs.
!pip --quiet install 'crewai[tools]'

In [None]:
import os
from google.colab import userdata
# Copy each provider key from the Colab secret store into the process environment,
# under the same name, so that the LLM client libraries can find it.
for _secret_name in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY"):
    os.environ[_secret_name] = userdata.get(_secret_name)

In [None]:
from crewai_tools import PDFSearchTool, JSONSearchTool

# Bind each search tool to one specific document so that searches are restricted to
# that file. (Calling PDFSearchTool() / JSONSearchTool() with no arguments is the
# alternative: the path is then provided during execution.) Only the bound variant is
# instantiated here; building the unbound tools first was redundant because the same
# names were reassigned immediately afterwards.
# NOTE(review): the PDF is fetched by the wget cell below — run that cell first. No
# visible cell downloads the JSON file; presumably it is uploaded manually.
tool = PDFSearchTool(pdf='/content/Designing-Efficient-Workbooks-2021-Interworks_0.pdf')
json_tool = JSONSearchTool(json_path='/content/output-tableau_cloud-1.json')

Inserting batches in chromadb: 100%|██████████| 1/1 [00:00<00:00,  1.54it/s]
Inserting batches in chromadb: 100%|██████████| 1/1 [00:00<00:00,  1.84it/s]


In [None]:
!wget -c "https://www.apra.gov.au/sites/default/files/CPG-235-Managing-Data-Risk.pdf"
!wget -c "https://www.tableau.com/sites/default/files/2021-10/Designing-Efficient-Workbooks-2021-Interworks_0.pdf"
!wget -c "https://www.apra.gov.au/sites/default/files/cpg_234_information_security_june_2019_0.pdf"
!wget -c "https://www.apra.gov.au/sites/default/files/CPG-235-Managing-Data-Risk.pdf"
!wget -c "https://www.apra.gov.au/sites/default/files/Prudential-Practice-Guide-CPG-235-Managing-Data-Risk_1.pdf"

In [None]:
from crewai import Agent, Task, Crew, Process, LLM
from crewai_tools import JSONSearchTool, PDFSearchTool, FileReadTool, CSVSearchTool
from pydantic import BaseModel, Field

# Model shared by every agent and task in this cell: temperature 0 with a fixed seed.
# Alternative model kept for reference:
# llm = LLM(model="anthropic/claude-3-5-sonnet-20240620", temperature=0.1,  seed=42)
llm = LLM(model="gpt-4o", temperature=0, seed=42)

# Search tools, each bound to a single source document.
json_search_tool = JSONSearchTool('/content/output-tableau_cloud-1.json')
pdf_search_tool = PDFSearchTool('/content/Designing-Efficient-Workbooks-2021-Interworks_0.pdf')
cpg_235 = PDFSearchTool('/content/CPG-235-Managing-Data-Risk.pdf')
cpg_235_p = PDFSearchTool('/content/Prudential-Practice-Guide-CPG-235-Managing-Data-Risk_1.pdf')

# The metadata CSV is exposed through FileReadTool. `dataset` is assigned exactly once;
# an unbound CSVSearchTool() must not be created first, since it would be replaced
# before any agent could use it. The CSV-search variant is kept as an alternative:
# dataset = CSVSearchTool('/content/tableau_metadata_output.csv')
dataset = FileReadTool('/content/tableau_metadata_output.csv')

# Extraction agent: searches the Tableau best-practice PDF and the Tableau JSON export.
json_agent = Agent(
    llm=llm,
    tools=[pdf_search_tool, json_search_tool],
    role="Data Extractor",
    goal="Extract relevant information on best practices for performance in Tableau",
    backstory=(
        "An expert in data parsing, focused on structured information extraction from files and Vector databases."
    ),
    # verbose=True,
)

# Compliance agent: works from the two CPG 235 PDFs plus the metadata dataset tool.
compliance_agent = Agent(
    llm=llm,
    tools=[cpg_235, cpg_235_p, dataset],
    memory=True,
    role="Compliance Officer",
    goal=(
        "Ensure STRICT compliance to processes, theme's and practices in the referenced documentation without compromise."
    ),
    backstory=(
        "A regulatory compliance senior officer ensuring data security practices are strictly enforced."
    ),
    # verbose=True,
    # allow_delegation=True,
)

# Agent to review supporting PDFs
#pdf_agent = Agent(
#    role="Document Reviewer",
#    goal="Analyze PDFs for additional insights related to best practices on performance for Tableau.",
#    backstory="A detailed-oriented document reviewer, skilled at finding supporting insights from PDF files.",
#    verbose=True,
#    llm=llm,
#    tools=[pdf_search_tool]
#)

# Best-practice extraction task, assigned to the extraction agent; its result is
# written to json_task.md.
json_task = Task(
    agent=json_agent,
    llm=llm,
    output_file="json_task.md",
    description="Locate and extract best practices on performance for Tableau.",
    expected_output="A summary of the top best practices for Tableau when creating content.",
)

# Risk-assessment task for the compliance agent: rate every entry of the 'formula'
# column as low / medium / high risk against APRA CPG235 and write the result to
# compliance.md. The two prompts below are sent verbatim, including their indentation.
compliance_task = Task (
    # Step-by-step instructions (Analyze -> Reason -> Act -> Refine -> Conclude).
    description="""
          This task involves a comprehensive risk assessment of calculation entries in the 'formula' column, categorizing each as low, medium, or high risk according to APRA's CPG235 guidelines. The agent will perform the following steps, aligning with the Chain-of-Thought, ReAct, and Train-of-Thought frameworks:
          1. **Analyze**: Systematically examine each formula in the provided dataset to understand its complexity, data dependencies, and operational context. Identify potential information gaps such as missing data controls, validation mechanisms, or formula complexity that could impact data risk.
          2. **Reason**: For each formula, determine its risk profile by assessing against key CPG235 data risk management principles:
            - **Data Quality**: Evaluate whether the formula’s output is impacted by accuracy, completeness, or consistency risks.
            - **Data Lifecycle Management**: Assess whether the calculation supports requirements for data capture, processing, retention, publication, and disposal.
            - **Validation and Auditability**: Check for validation controls and ease of auditability within the formula, considering if manual adjustments or frequent recalculations could introduce inconsistencies.
          3. **Act**: Incorporate CPG235 references to reinforce the evaluation, such as data validation techniques, audit trail requirements, or retention policies. If missing elements are identified, note recommendations for each calculation to comply with CPG235.
          4. **Refine**: Based on insights and any telemetry feedback, revisit and adjust risk classifications to ensure accuracy, particularly if complex formulas are deemed high-risk. Adjust conclusions if additional controls are required, such as validation or periodic checks.
          5. **Conclude**: Summarize the risk classification as low, medium, or high for each formula, citing the relevant CPG235 guidelines. Provide a brief explanation for each rating, indicating why a calculation entry was classified as such. Suggestions should focus on enhancing compliance and minimizing data risk through improved formula structure or controls, with clear CPG235 references.

        """,
    # Required shape of the answer, one structured assessment per formula.
    # NOTE(review): the text ends with "For instance:" but no example follows — confirm
    # whether a sample entry was meant to be included.
    expected_output="""
        For each calculation entry in the 'formula' column, provide a structured assessment with the following components to ensure clarity, compliance, and alignment with CPG235 guidelines:
        1. **Formula**: Include the original formula as it appears in the entry. This ensures clear traceability and enables easy cross-referencing with the data source.
        2. **Risk Rating**:
          - Assign a rating of 'Low', 'Medium', or 'High' to each formula entry.
          - Justify this rating based on specific CPG235 principles and sections, ensuring that each rating directly corresponds to observed factors (e.g., complexity, data dependencies, validation requirements).
        3. **Justification Statement**:
          - Provide a concise statement explaining the rationale for the risk rating.
          - Base this explanation on:
            - **Data Quality**: Whether the formula impacts accuracy, completeness, or consistency of data outputs.
            - **Data Lifecycle Management**: How the formula’s lifecycle is managed, including data retention, disposal, and traceability.
            - **Validation and Auditability**: If and how the formula supports validation and is auditable according to CPG235.
        4. **CPG235 Reference**:
          - Cite relevant sections of CPG235 guidelines (e.g., “CPG235 Data Quality – Section 4.2”) that support the assessment.
          - Ensure that each reference directly supports the rating given and the reasoning behind it.
        5. **Recommendations (Optional)**:
          - Where applicable, provide a brief recommendation to address any identified risks or areas for improvement in line with CPG235 best practices.
          - For example, suggest adding validation controls, improving audit trails, or scheduling periodic reviews if the formula is rated as Medium or High risk.
        6. **Output Format**:
          - Ensure each formula’s assessment is organized in a clear, structured markdown format in `compliance.md` for easy readability. For instance:
    """,
    output_file="compliance.md",
    #verbose=True,
    llm=llm,
    agent=compliance_agent
)

# Follow-up task for the compliance agent: takes the compliance assessment as context
# and writes an extended copy of the CSV to updated.csv.
update_csv = Task(
    agent=compliance_agent,
    context=[compliance_task],
    llm=llm,
    output_file="updated.csv",
    expected_output="A UTF-8 formatted CSV file with additional columns based on rating and description as to why",
    description="""
        Rewrites the existing CSV file with additional columns that includes the rating (low, medium, high),
        and another column with the brief description as to why.  Ensure the other entries are untouched.
    """,
    # verbose=True,
)

# Task for PDF document review
# pdf_task = Task(
#    description="Find related insights on best practices on performance for Tableau.",
#    expected_output="A summary of supporting insights, strategies, and projections found in PDFs.",
#    output_file="pdf_task.md",
#    verbose=True,
#    agent=pdf_agent
#)

# Assemble the crew. Only the compliance agent and its risk-assessment task are active;
# the other combinations are kept below as commented alternatives.
analytics_crew = Crew(
    process=Process.sequential,
    verbose=True,
    memory=True,
    # agents=[json_agent, pdf_agent],
    # agents=[json_agent, compliance_agent],
    agents=[compliance_agent],
    # tasks=[json_task, compliance_task, update_csv],
    tasks=[compliance_task],
)

# Run the crew with an empty inputs mapping and print what it returns.
result = analytics_crew.kickoff(inputs={})
print(result)


In [None]:
from crewai import Agent, Task, Crew, Process, LLM
from crewai_tools import JSONSearchTool, PDFSearchTool, CSVSearchTool, FileReadTool

# Locations of the source documents.
tableau_help_json = "/content/output-tableau_cloud-1.json"
tableau_best_practices_pdf = "/content/Designing-Efficient-Workbooks-2021-Interworks_0.pdf"
cpg235_pdf = "/content/CPG-235-Managing-Data-Risk.pdf"
cpg235_practice_pdf = "/content/Prudential-Practice-Guide-CPG-235-Managing-Data-Risk_1.pdf"
dataset_csv = "/content/tableau_metadata_output.csv"

# One search tool per document, each bound to its file.
tableau_help_tool = JSONSearchTool(json_path=tableau_help_json)
best_practices_tool = PDFSearchTool(pdf=tableau_best_practices_pdf)
cpg235_tool = PDFSearchTool(pdf=cpg235_pdf)
cpg235_practice_tool = PDFSearchTool(pdf=cpg235_practice_pdf)
dataset_tool = CSVSearchTool(csv=dataset_csv)
# Alternatives kept for reference:
# dataset = CSVSearchTool()
# dataset_tool = FileReadTool(dataset_csv)
#dataset_tool = FileReadTool(dataset_csv)

Inserting batches in chromadb: 100%|██████████| 21/21 [00:34<00:00,  1.62s/it]
Inserting batches in chromadb: 100%|██████████| 6/6 [00:09<00:00,  1.51s/it]


In [None]:
from crewai import Agent, Task, Crew, Process, LLM
from crewai_tools import JSONSearchTool, PDFSearchTool, CSVSearchTool, FileReadTool

def initialize_risk_assessment_crew(
    tableau_help_json: str,
    tableau_best_practices_pdf: str,
    cpg235_pdf: str,
    cpg235_practice_pdf: str,
    dataset_csv: str
) -> "Crew":
    """Build the two-agent crew that reviews the formulas in a Tableau metadata CSV.

    The crew consists of a "Tableau Expert" (best-practice / performance review) and a
    "Compliance Officer" (CPG-235 risk review), each with one task, run under the
    hierarchical process.

    All document tools are created here from the arguments, so the function does not
    depend on tool objects defined in another notebook cell and honours the paths it
    is given.

    Args:
        tableau_help_json: Path to the Tableau help JSON. Accepted for interface
            compatibility; no agent in this crew is wired to it.
        tableau_best_practices_pdf: Path to the Tableau best-practices PDF searched by
            the Tableau expert.
        cpg235_pdf: Path to the CPG-235 PDF searched by the compliance officer.
        cpg235_practice_pdf: Path to the CPG-235 practice-guide PDF searched by the
            compliance officer.
        dataset_csv: Path to the CSV holding the 'formula' column; searched by both
            agents.

    Returns:
        The configured ``Crew``; the caller starts it with ``kickoff()``.
    """
    # Alternative model kept for reference:
    # llm = LLM(model="anthropic/claude-3-5-sonnet-20240620", temperature=0.1, max_tokens=8192, seed=42)
    llm = LLM(model="gpt-4o", temperature=0.5, seed=42)

    # Build every tool from the supplied paths (previously the PDF tools were read
    # from notebook globals, so the path arguments were silently ignored).
    best_practices_tool = PDFSearchTool(tableau_best_practices_pdf)
    cpg235_tool = PDFSearchTool(cpg235_pdf)
    cpg235_practice_tool = PDFSearchTool(cpg235_practice_pdf)
    # dataset_tool = FileReadTool(dataset_csv)
    dataset_tool = CSVSearchTool(dataset_csv)

    # NOTE: adjacent string literals are joined with no separator, so every fragment
    # below ends with an explicit space or newline.
    tableau_expert = Agent(
        role="Tableau Expert",
        goal=(
            "Assess Tableau calculations for optimal usage, performance, and compliance, "
            "focusing on data accuracy, efficiency, and risk. "
            "Provide insights to improve calculation design, ensuring dashboards meet "
            "performance and compliance standards."
        ),
        backstory=(
            "I am a skilled data visualization specialist focused on ensuring Tableau calculations are accurate, efficient, and compliant. "
            "With deep knowledge of Tableau’s functionality, I analyze each calculation’s design and impact on performance, user experience, "
            "and data integrity. Working with the Compliance Officer, I balance performance optimization with risk management "
            "to uphold best practices and compliance."
        ),
        llm=llm,
        cache=True,
        max_rpm=30,
        memory=True,
        tools=[best_practices_tool, dataset_tool]
    )

    compliance_agent = Agent(
        role="Compliance Officer",
        goal="Ensure STRICT compliance with attached guidelines",
        backstory=(
            "I am a regulatory compliance expert focused on upholding data governance and risk management. "
            "With deep knowledge of the documented guidelines, I evaluate complex calculations, "
            "identify data risks, and recommend compliance improvements. "
            "My mission is to meticulously review each formula, verify compliance, "
            "and document findings with transparent references to the documentation, "
            "ensuring high standards of regulatory accuracy and reliability."
        ),
        llm=llm,
        cache=True,
        max_rpm=30,
        memory=True,
        tools=[cpg235_tool, cpg235_practice_tool, dataset_tool]
    )

    # Best-practice review of every formula; written to tableau_analysis.md.
    tableau_analysis_task = Task(
        description=(
            "Perform a best practice assessment on each formula in the 'formula' column against Tableau best practices:\n"
            "1. Check each calculation for proper syntax and function use and reference against Tableau documentation.\n"
            "2. Assess performance impact, referencing performance guidelines from the documentation.\n"
            "3. Suggest Optimisations for performance, simplicity and readability.\n"
            "4. Document findings with citations from the documents referenced guides."
        ),
        expected_output=(
            "For each formula:\n"
            "1. **Formula**: Display the original formula.\n"
            "2. **Best Practice Check**: Confirm alignment to Tableau’s best practices.  Suggest improvements to the formula\n"
            "  - **Citation**: Reference the necessary Tableau sources.\n"
            "3. **Performance Rating**: Rate impact as Low, Medium, or High, with justification.\n"
            "  - **Citation**: Detail and reference the relevant section from the Tableau documentation.\n"
            """Ensure output is in structured markdown without special markdown syntax (like "```")"""
        ),
        output_file="tableau_analysis.md",
        llm=llm,
        agent=tableau_expert
    )

    # CPG-235 risk review of every formula; written to compliance.md.
    compliance_task = Task(
        description=(
            "Perform a risk assessment on each formula in the 'formula' column based on CPG-235 guidelines.\n"
            "1. **Analyze**: Evaluate each formula’s complexity, data dependencies, and operational context.\n"
            "2. **Reason**: Determine risk (Low, Medium, High) by assessing:\n"
            "  - **Data Quality**: Accuracy, completeness, and consistency.\n"
            "  - **Data Lifecycle**: Compliance with data retention, processing, and disposal standards.\n"
            "  - **Validation and Auditability**: Presence of validation and ease of auditing.\n"
            "3. **Act**: Reference CPG235 for data validation, audit trails, and lifecycle management.\n"
            "4. **Refine**: Adjust ratings based on telemetry or additional controls needed.\n"
            "5. **Conclude**: Summarize risk ratings with CPG235 references, explaining each rating and noting recommendations for risk reduction."
        ),
        expected_output=(
            "For each formula, provide:\n"
            "1. **Formula**: Display the original formula.\n"
            "2. **Risk Rating**: Low, Medium, or High, justified with CPG235 criteria (e.g., complexity, validation).\n"
            "3. **Justification**: Brief explanation based on:\n"
            "  - **Data Quality**\n"
            "  - **Data Lifecycle Management**\n"
            "  - **Validation and Auditability**\n"
            "4. **CPG235 Reference**: Cite relevant sections supporting the rating.\n"
            "5. **Recommendations** (if applicable): Brief suggestions for risk mitigation.\n"
            """Output in structured markdown without "```" or markdown indicators."""
        ),
        output_file="compliance.md",
        llm=llm,
        # context=[tableau_analysis_task],
        agent=compliance_agent
    )

    # Hierarchical process: no explicit manager agent is supplied, so `manager_llm`
    # is provided for the manager role.
    crew = Crew(
        agents=[tableau_expert, compliance_agent],
        tasks=[tableau_analysis_task, compliance_task],
        process=Process.hierarchical,
        manager_agent=None,
        planning=True,
        memory=True,
        manager_llm=llm,
        verbose=True
    )

    return crew

def assess_tableau_calculations(
    tableau_help_json: str,
    tableau_best_practices_pdf: str,
    cpg235_pdf: str,
    cpg235_practice_pdf: str,
    dataset_csv: str
) -> dict:
    """Build the risk-assessment crew for the given documents and run it.

    Args:
        tableau_help_json: Path to the Tableau help JSON.
        tableau_best_practices_pdf: Path to the Tableau best-practices PDF.
        cpg235_pdf: Path to the CPG-235 PDF.
        cpg235_practice_pdf: Path to the CPG-235 practice-guide PDF.
        dataset_csv: Path to the CSV containing the formulas to assess.

    Returns:
        Whatever ``crew.kickoff()`` returns.
        NOTE(review): the signature says ``dict`` — confirm against the return type
        of CrewAI's ``Crew.kickoff``.
    """
    # All five paths are forwarded unchanged to the crew factory; the crew is started
    # immediately and its result handed straight back.
    return initialize_risk_assessment_crew(
        tableau_help_json=tableau_help_json,
        tableau_best_practices_pdf=tableau_best_practices_pdf,
        cpg235_pdf=cpg235_pdf,
        cpg235_practice_pdf=cpg235_practice_pdf,
        dataset_csv=dataset_csv,
    ).kickoff()

# Example run against the documents under /content.
if __name__ == "__main__":
    example_inputs = {
        "tableau_help_json": "/content/output-tableau_cloud-1.json",
        "tableau_best_practices_pdf": "/content/Designing-Efficient-Workbooks-2021-Interworks_0.pdf",
        "cpg235_pdf": "/content/CPG-235-Managing-Data-Risk.pdf",
        "cpg235_practice_pdf": "/content/Prudential-Practice-Guide-CPG-235-Managing-Data-Risk_1.pdf",
        "dataset_csv": "/content/tableau_metadata_output.csv",
    }
    results = assess_tableau_calculations(**example_inputs)

    print(results)