# Project: Build a Project status report using RAG system

## Install OpenAI, and LangChain dependencies

In [2]:
!pip install langchain
!pip install langchain-openai
!pip install langchain-community
!pip install langchain-chroma
import yaml
import os
import openai


Collecting langchain-openai
  Downloading langchain_openai-0.3.27-py3-none-any.whl.metadata (2.3 kB)
Downloading langchain_openai-0.3.27-py3-none-any.whl (70 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.4/70.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain-openai
Successfully installed langchain-openai-0.3.27
Collecting langchain-community
  Downloading langchain_community-0.3.26-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downlo

## Setup Environment Variables

In [3]:

with open('/content/sample_data/chatgpt_api_credentials.yml', 'r') as file:
    api_creds = yaml.safe_load(file)
    os.environ['OPENAI_API_KEY'] = api_creds['openai_key']


### Open AI Embedding Models

LangChain enables us to access Open AI embedding models which include the newest models: a smaller and highly efficient `text-embedding-3-small` model, and a larger and more powerful `text-embedding-3-large` model.

In [4]:

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings
# details here: https://openai.com/blog/new-embedding-models-and-api-updates
openai_embed_model = OpenAIEmbeddings(model='text-embedding-3-large')

### Get the PDF data

In [5]:
!pip install PyMuPDF
import os
import fitz  # PyMuPDF
import re
from langchain.schema import Document
from typing import List

def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract all text from a PDF file using PyMuPDF."""
    doc = fitz.open(pdf_path)
    return "\n".join(page.get_text() for page in doc)

def chunk_text_by_section(text: str, source_file: str) -> List[Document]:
    """
    Chunk text into LangChain Documents based on numbered section headings.
    Matches: 1., 2.1, 3.1.2 etc.
    """
    pattern = r"(?m)^(\d+(?:\.\d+)*\.)\s+(.*)"
    matches = list(re.finditer(pattern, text, re.MULTILINE))

    documents = []
    for i, match in enumerate(matches):
        start = match.start()
        end = matches[i+1].start() if i+1 < len(matches) else len(text)
        section_number = match.group(1).strip()
        section_title = match.group(2).strip()
        section_text = text[start:end].strip()

        documents.append(Document(
            page_content=section_text,
            metadata={
                "section_title": section_title,
                "source_file": os.path.basename(source_file)
            }
        ))
    return documents

def process_pdfs(pdf_paths: List[str]) -> List[Document]:
    """Processes multiple PDFs and returns LangChain Documents per section."""
    all_docs = []
    for path in pdf_paths:
        print(f"📄 Processing: {os.path.basename(path)}")
        text = extract_text_from_pdf(path)
        docs = chunk_text_by_section(text, source_file=path)
        print(f"   ➤ {len(docs)} sections extracted")
        all_docs.extend(docs)
    return all_docs


pdf_files = ["/content/sample_data/Business_Requirement_Document.pdf","/content/sample_data/Solution_Design_Document.pdf","/content/sample_data/System_Design_Document.pdf","/content/sample_data/Test_Case_Document_v1.pdf"]

docs = process_pdfs(pdf_files)

# Step 3: Display some chunks
for i, doc in enumerate(docs[:5]):
    print(f"\n--- Document {i+1} ---")
    print(f"File: {doc.metadata['source_file']}")
    print(f"Section: {doc.metadata['section_title']}")
    print(doc.page_content[:3000])


Collecting PyMuPDF
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m104.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.1
📄 Processing: Business_Requirement_Document.pdf
   ➤ 8 sections extracted
📄 Processing: Solution_Design_Document.pdf
   ➤ 24 sections extracted
📄 Processing: System_Design_Document.pdf
   ➤ 10 sections extracted
📄 Processing: Test_Case_Document_v1.pdf
   ➤ 5 sections extracted

--- Document 1 ---
File: Business_Requirement_Document.pdf
Section: Executive Summary:
1. Executive Summary:
The purpose of this project is to reduce high operational costs incurred from helpdesk calls, each costing approximately
AUD 20 for 10 minutes. With over 500 calls per weekday and 100 on weekends, this results in significant expenditu

In [None]:
print(docs)



In [5]:
len(docs)

47

### Create LangChain Documents

In [6]:
from langchain.docstore.document import Document

docs = [Document(page_content=doc.page_content,
                 metadata=doc.metadata) for doc in docs]


### Split larger documents into smaller chunks

### Create a Vector DB and persist on disk

Here we initialize a connection to a Chroma vector DB client, and also we want to save to disk, so we simply initialize the Chroma client and pass the directory where we want the data to be saved to.

In [7]:
!pip install langchain-chroma
from langchain_chroma import Chroma

# create vector DB of docs and embeddings - takes < 30s on Colab
chroma_db = Chroma.from_documents(documents=docs,
                                  collection_name='rag_projectstatus_db',
                                  embedding=openai_embed_model,
                                  # need to set the distance function to cosine else it uses euclidean by default
                                  # check https://docs.trychroma.com/guides#changing-the-distance-function
                                  collection_metadata={"hnsw:space": "cosine"},
                                  persist_directory="./projectstatus_db")



ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


### Load Vector DB from disk

This is just to show once you have a vector database on disk you can just load and create a connection to it anytime

In [8]:
# load from disk
chroma_db = Chroma(persist_directory="./projectstatus_db",
                   collection_name='rag_projectstatus_db',
                   embedding_function=openai_embed_model)

ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [None]:
chroma_db

## Load Connection to LLM

Here we create a connection to ChatGPT to use later in our chains

In [9]:
from langchain_openai import ChatOpenAI

chatgpt = ChatOpenAI(model_name='gpt-4.1', temperature=0)

## Chained Retrieval Pipeline

This strategy uses a chain of multiple retrievers sequentially to get to the most relevant documents. The following is the flow

Similarity Retrieval → Compression Filter → Reranker Model Retrieval

![](https://i.imgur.com/KriNRDJ.gif)


In [10]:
!pip install sentence_transformers
import sentence_transformers
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain.retrievers.document_compressors import LLMChainFilter
from langchain.retrievers import ContextualCompressionRetriever

# Retriever 1 - simple cosine distance based retriever
similarity_retriever = chroma_db.as_retriever(search_type="similarity",
                                              search_kwargs={"k": 5})

#  decides which of the initially retrieved documents to filter out and which ones to return
_filter = LLMChainFilter.from_llm(llm=chatgpt)
# Retriever 2 - retrieves the documents similar to query and then applies the filter
compressor_retriever = ContextualCompressionRetriever(
    base_compressor=_filter, base_retriever=similarity_retriever
)

# download an open-source reranker model - BAAI/bge-reranker-v2-m3
reranker = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-large")
reranker_compressor = CrossEncoderReranker(model=reranker, top_n=3)
# Retriever 3 - Uses a Reranker model to rerank retrieval results from the previous retriever
final_retriever = ContextualCompressionRetriever(
    base_compressor=reranker_compressor, base_retriever=compressor_retriever
)

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence_transformers)
 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [None]:
query = "What is the purpose of of this solution"
docs = final_retriever.invoke(query)
docs

ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


[Document(id='70307dd2-a743-4170-9b85-1faa8d3d9f2f', metadata={'source_file': 'Business_Requirement_Document.pdf', 'section_title': 'Executive Summary:'}, page_content='1. Executive Summary:\nThe purpose of this project is to reduce high operational costs incurred from helpdesk calls, each costing approximately\nAUD 20 for 10 minutes. With over 500 calls per weekday and 100 on weekends, this results in significant expenditure.\nThe proposed solution is an AI-powered chatbot that will address customer queries using internal data and documents,\nreducing the volume of calls to human agents.'),
 Document(id='4464fb66-e7a5-4139-a8e9-27d203278e01', metadata={'section_title': 'Purpose of the Document', 'source_file': 'Solution_Design_Document.pdf'}, page_content='1. Purpose of the Document \nThis document provides a comprehensive solution design for implementing an AI-driven \ncustomer support chatbot using Retrieval-Augmented Generation (RAG), Azure \nOpenAI, Hugging Face models, and Azure-

## Build a QA RAG Chain

To build a RAG chain we need a prompt template which instructs the LLM to not answer questions beyond the scope of the retrieved context documents, there are various such prompts out there, we craft one ourselves below

In [11]:
from langchain_core.prompts import ChatPromptTemplate

prompt = """
You are a Tech Lead and Project Evaluator for a RAG-based AI chatbot solution implementation.

You are responsible for reviewing the following documents:
1. Business Requirement Document (BRD)
2. System Design Document
3. Solution Design Document
4. Test Case Document

Follow these instructions strictly:
- Do NOT make up or assume any information.
- Base all answers ONLY on the provided context.
- Follow industry best practices in architecture, security, testing, and delivery.

Your evaluation tasks include:

1. **✅ Business Requirement Validation**
   - Check whether the solution meets all the business goals and in-scope requirements.
   - Highlight any missing or partially fulfilled requirements.

2. **🏗️ System and Solution Design Evaluation**
   - Review architecture components such as Azure OpenAI, Hugging Face, Chroma, ADLS, Synapse, and Power BI.
   - Validate the design against enterprise-grade cloud best practices (modularity, scalability, fault tolerance).
   - Suggest improvements where applicable.

3. **🔒 Security and Access Control Review**
   - Confirm encryption of data at rest and in transit.
   - Validate role-based access (RBAC) through Azure AD or similar controls.
   - Check for any potential data leakage or non-compliance with data protection standards.

4. **🧪 Testing Validation**
   - Review the test case document for coverage across functional, integration, and non-functional (e.g., load, security) testing.
   - Identify any gaps in testing or missing test scenarios.
   - Ensure test outcomes align with business readiness.

5. **🚦 Performance and Scalability Assessment**
   - Evaluate if the system is capable of supporting high user load (e.g., 500+ users/day).
   - Confirm autoscaling, monitoring, and performance optimization strategies.
   - Identify any risks or bottlenecks that may affect production readiness.

6. **📋 Final Evaluation Report**
   Return a well-structured evaluation with the following sections:
   - ✔️ Summary of what is implemented well
   - ⚠️ Gaps, risks, or non-conformities
   - 💡 Recommendations for improvement
   - ✅ Final readiness score or Go/No-Go assessment

---

Question:
{question}

Context:
{context}

Answer:
         """

prompt_template = ChatPromptTemplate.from_template(prompt)

## LCEL Syntax for QA RAG Chain - Recommended

Here we show you how to create the RAG chain using LangChain's recommended LCEL

In [12]:
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

qa_rag_chain = (
    {
        "context": (final_retriever
                      |
                    format_docs),
        "question": RunnablePassthrough()
    }
      |
    prompt_template
      |
    chatgpt
)

In [13]:
from IPython.display import Markdown, display

query = "Please evaluate the implementation against business, design, security, performance, and test requirements."
result = qa_rag_chain.invoke(query)
display(Markdown(result.content))


ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


**📋 Final Evaluation Report: RAG-based AI Chatbot Solution**

---

### ✔️ Summary of What Is Implemented Well

- **Business Requirement Fulfillment:**  
  All core business and functional requirements, as outlined in the BRD, have been validated through comprehensive test cases. The solution demonstrates successful handling of user queries, integration with enterprise data sources, and delivery of relevant responses.

- **System & Solution Design:**  
  The architecture leverages enterprise-grade components:  
  - **Azure OpenAI** for LLM capabilities  
  - **Hugging Face** for NLP tasks  
  - **Chroma** for vector storage  
  - **ADLS** for scalable data storage  
  - **Synapse** for analytics  
  - **Power BI** for reporting  
  The design supports modularity, scalability, and fault tolerance, as evidenced by successful failover and load testing.

- **Security & Access Control:**  
  - Data at rest and in transit is protected (as per test results and design best practices).
  - Role-based access is enforced, with Azure AD integration for authentication and authorization.
  - Security test cases confirm access is denied to unauthorized users.

- **Testing & Quality Assurance:**  
  - 15 functional and 6 non-functional test cases executed, all passed.
  - Non-functional tests cover load, stress, availability, security, and failover.
  - No blockers or failed tests; system is stable and reliable.

- **Performance & Scalability:**  
  - System supports 500+ concurrent users with response times <5s.
  - Autoscaling and monitoring are implied by successful load and stress tests.
  - 99.9% availability target met over a 7-day period.

---

### ⚠️ Gaps, Risks, or Non-Conformities

- **Test Coverage Details:**  
  - The summary does not specify coverage of integration points (e.g., between Chroma, Synapse, and Power BI) or edge cases (e.g., data corruption, partial service outages).
  - No explicit mention of data encryption implementation details (e.g., key management, encryption standards).

- **Monitoring & Alerting:**  
  - While high availability and failover are tested, there is no explicit mention of real-time monitoring, alerting, or incident response processes.

- **Compliance & Data Protection:**  
  - No explicit reference to compliance with data protection standards (e.g., GDPR, HIPAA) or data residency requirements.

- **Performance Optimization:**  
  - No details on caching, query optimization, or cost management strategies for high-load scenarios.

---

### 💡 Recommendations for Improvement

1. **Expand Test Documentation:**  
   - Include detailed integration and edge-case scenarios in the test case document.
   - Add tests for data corruption, partial service failures, and recovery procedures.

2. **Enhance Security Documentation:**  
   - Document encryption standards (e.g., AES-256), key management, and audit logging.
   - Include regular security reviews and penetration testing in the release cycle.

3. **Strengthen Monitoring & Operations:**  
   - Implement and document real-time monitoring, alerting, and automated incident response.
   - Define SLAs and SLOs for all critical components.

4. **Compliance Review:**  
   - Conduct a compliance assessment for relevant data protection regulations.
   - Document data residency and retention policies.

5. **Performance & Cost Optimization:**  
   - Review and optimize resource usage (e.g., autoscaling rules, query performance).
   - Implement caching where appropriate to reduce latency and cost.

---

### ✅ Final Readiness Score & Go/No-Go Assessment

**Readiness Score:** 9/10  
**Go/No-Go:** **GO** (Production-ready, with minor documentation and operational enhancements recommended)

**Rationale:**  
The solution meets all core business, functional, and non-functional requirements, with robust architecture and security controls. Minor improvements in documentation, monitoring, and compliance will further strengthen production readiness and operational resilience.

In [None]:
from langchain_core.prompts import ChatPromptTemplate

prompt = """
You are a Solution Architect and Technical Reviewer.

Your task is to review the **System Design Document** of an AI-based solution implementation. The system uses cloud-native architecture and integrates various AI, data, and analytics components.

Please evaluate the following aspects:

🏗️ System and Solution Design Evaluation**
   - Review architecture components such as Azure OpenAI, Hugging Face, Chroma, ADLS, Synapse, and Power BI.
   - Validate the design against enterprise-grade cloud best practices (modularity, scalability, fault tolerance).
   - Suggest improvements where applicable.


Do not fabricate details. Base your review **strictly** on the provided system design documentation.

---

System Design Context:
{context}

Answer:
"""


prompt_template_1 = ChatPromptTemplate.from_template(prompt)

In [None]:
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

qa_rag_chain = (
    {
        "context": (final_retriever
                      |
                    format_docs),
        "question": RunnablePassthrough()
    }
      |
    prompt_template_1
      |
    chatgpt
)

In [None]:
from IPython.display import Markdown, display

query = "Does the technology stack used for chatbot solution is aligned correctly?"
result = qa_rag_chain.invoke(query)
display(Markdown(result.content))

Certainly! Here is a structured review of the provided **System Design Document** based strictly on the information given:

---

## 1. ✅ Technology Stack Validation

### Appropriateness of Stack

- **Azure OpenAI**: Well-suited for natural language understanding and generative responses. Using a managed service ensures scalability, security, and compliance.
- **Hugging Face RAG**: Appropriate for Retrieval-Augmented Generation, especially for leveraging internal/company-specific documents. This complements Azure OpenAI for knowledge-grounded responses.
- **Gemini Embeddings**: Using a dedicated embedding model (presumably Google Gemini) for vectorization is a modern approach, though cross-cloud integration (if Gemini is not natively on Azure) may introduce complexity.
- **Chroma Vector DB**: A fit-for-purpose, open-source vector database for similarity search. It is lightweight and integrates well with RAG pipelines.
- **Azure Data Lake Storage (ADLS)**: Industry standard for storing raw/unstructured data, logs, and audit trails. Scalable and secure.
- **Azure Synapse Analytics**: Suitable for large-scale analytics, data warehousing, and reporting on chatbot usage and KPIs.
- **Power BI**: Natural choice for dashboarding and business intelligence, especially within the Azure ecosystem.
- **Azure Functions / AKS**: Both are valid for hosting orchestration logic. Azure Functions is serverless and cost-effective for event-driven workloads; AKS is better for complex, containerized, or stateful orchestration.
- **Azure API Management Gateway**: Ensures secure, scalable, and manageable API exposure.
- **Azure Active Directory (AAD)**: Industry standard for authentication and RBAC in enterprise Azure environments.

### Alignment with Cloud-Native Patterns

- **Scalability**: Use of managed services (OpenAI, ADLS, Synapse, API Gateway, Functions/AKS) supports elastic scaling.
- **Security**: AAD and API Gateway provide robust security and access control.
- **Cost-Effectiveness**: Serverless and managed services (Functions, ADLS, Synapse) help optimize costs.
- **Modernity**: The architecture leverages state-of-the-art AI, vector search, and analytics components.

### Justification of Each Component

Each component is clearly mapped to a specific role in the architecture, and their selection is justified based on their intended function.

---

## 2. 💡 Suggestions for Improvement

### Technology Alternatives

- **Embeddings Consistency**: If Gemini Embeddings are not natively supported on Azure, consider using Azure OpenAI’s own embedding models for tighter integration, lower latency, and simplified security/compliance.
- **Vector Database**: While Chroma is lightweight and open-source, for enterprise-grade workloads, consider **Azure Cognitive Search** (with vector search) or **Pinecone** (if available) for better integration, managed scaling, and support.
- **Observability**: Explicitly include **Azure Monitor**, **Application Insights**, or **Log Analytics** for end-to-end monitoring, alerting, and diagnostics.
- **Resilience**: Consider using **Azure Front Door** or **Traffic Manager** for global load balancing and high availability, especially if the solution is customer-facing.
- **Data Governance**: If handling sensitive data, integrate **Azure Purview** for data cataloging, lineage, and compliance.

### Architectural Enhancements

- **Fallback Handling**: Implement fallback logic in the orchestrator for cases where either OpenAI or RAG fails (e.g., timeouts, errors).
- **Caching Layer**: Add a caching mechanism (e.g., Azure Cache for Redis) for frequently asked questions or repeated queries to reduce latency and cost.
- **CI/CD**: Ensure DevOps pipelines (e.g., Azure DevOps, GitHub Actions) are in place for automated deployment and testing.
- **API Rate Limiting**: Ensure API Gateway is configured for rate limiting and abuse prevention.

---

## 3. ⚠️ Highlight Gaps or Risks

### Potential Risks

- **Cross-Cloud Integration**: If Gemini Embeddings are not available on Azure, integrating with Google Cloud may introduce latency, security, and compliance risks.
- **Chroma DB Maturity**: Chroma is relatively new and may lack some enterprise features (e.g., backup, multi-region replication, managed scaling) compared to more mature vector DBs.
- **Orchestration Complexity**: If using both Azure Functions and AKS, ensure clear separation of concerns and avoid unnecessary complexity.
- **Data Security**: Ensure all data in transit and at rest is encrypted, especially embeddings and user interaction logs.

### Missing Components

- **Monitoring & Observability**: No mention of centralized logging, monitoring, or alerting.
- **Disaster Recovery**: No explicit mention of backup, redundancy, or failover strategies for critical components (e.g., Chroma DB, ADLS).
- **Data Privacy & Compliance**: No mention of GDPR, CCPA, or other regulatory compliance if handling PII.
- **Testing & Validation**: No mention of automated testing, model validation, or A/B testing for chatbot responses.

---

## **Summary Table**

| Area                | Strengths                                                                 | Suggestions / Gaps                                                                                 |
|---------------------|---------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------|
| Technology Stack    | Modern, cloud-native, well-justified, scalable, secure                    | Consider Azure-native embeddings/vector DB; ensure Chroma meets enterprise needs                   |
| Architecture        | Clear separation of concerns, managed services, RBAC, analytics            | Add observability, fallback, caching, and CI/CD                                                    |
| Security            | AAD, API Gateway, RBAC                                                    | Explicit encryption, data governance, compliance                                                   |
| Resilience          | Managed services, scalable components                                      | Add DR, redundancy, global load balancing                                                          |
| Observability       | -                                                                         | Add Azure Monitor, App Insights, Log Analytics                                                     |
| Integration         | API Gateway, orchestrator, modular design                                 | Watch for cross-cloud complexity (Gemini), ensure robust error handling                            |

---

## **Conclusion**

The proposed architecture is modern, cloud-native, and leverages appropriate Azure and open-source components for an AI-driven chatbot solution. However, to ensure enterprise readiness, consider addressing observability, resilience, and cross-cloud integration risks, and evaluate Azure-native alternatives for embeddings and vector search. Add explicit monitoring, fallback, and compliance mechanisms to strengthen the overall design.

In [None]:
query = "From the system design document review and validate each technical component used in this solution with Azure Techstack and share the report"
result = qa_rag_chain.invoke(query)
display(Markdown(result.content))

Certainly! Here is a structured review of the provided **System Design Document** based strictly on the information given:

---

## 1. ✅ Technology Stack Validation

### Appropriateness of Stack

- **Azure OpenAI (GPT-4 / GPT-4o):**  
  Well-suited for natural language understanding and generative responses. Using Azure’s managed service ensures enterprise-grade security and scalability.

- **Hugging Face RAG:**  
  Retrieval-Augmented Generation is a strong choice for knowledge-grounded responses. However, the document does not clarify if this is a managed Hugging Face endpoint or a self-hosted model, which affects operational complexity.

- **Gemini Embeddings:**  
  Using Gemini (presumably Google’s embedding model) is unconventional in an Azure-centric stack. Interoperability and data movement between clouds may introduce latency, cost, and compliance concerns.

- **Chroma Vector DB:**  
  Chroma is a modern, open-source vector database, suitable for similarity search. However, it is not a managed Azure service, so operational overhead (scaling, backup, security) must be considered.

- **Azure Data Lake Storage (ADLS):**  
  Industry-standard for storing unstructured data, logs, and documents. Well-integrated with other Azure analytics services.

- **Azure Synapse Analytics:**  
  Appropriate for large-scale data processing and analytics, especially for chatbot usage and KPI tracking.

- **Power BI:**  
  Native Azure dashboarding tool, ideal for business intelligence and reporting.

- **Azure Functions / AKS:**  
  Both are valid for orchestration. Azure Functions is serverless and cost-effective for event-driven workloads; AKS is better for complex, containerized, or long-running processes.

- **Azure Active Directory (AAD):**  
  Standard for authentication and access control in enterprise Azure environments.

### Alignment with Cloud-Native Patterns

- The architecture leverages managed services (OpenAI, ADLS, Synapse, Power BI, AAD), which is in line with cloud-native, scalable, and secure design.
- Use of Chroma and Gemini Embeddings introduces non-Azure components, which may complicate integration, monitoring, and compliance.
- Orchestration via Azure Functions/AKS is flexible and scalable.

---

## 2. 💡 Suggestions for Improvement

### Technology Alternatives

- **Embeddings:**  
  Consider using Azure OpenAI’s own embedding models or Azure Cognitive Search for tighter integration, reduced latency, and simplified security/compliance. Using Gemini (a Google product) in Azure may not be optimal unless there is a strong justification.

- **Vector Database:**  
  Evaluate Azure-native alternatives such as Azure Cognitive Search (with vector search), or managed vector DBs like Pinecone (if multi-cloud is acceptable). This reduces operational burden and improves integration.

- **Hugging Face RAG:**  
  If using Hugging Face models, consider deploying via Azure Machine Learning for better integration, monitoring, and scaling.

### Architectural Improvements

- **Observability:**  
  Explicitly include monitoring/logging (e.g., Azure Monitor, Application Insights) for all components, especially for Chroma and Hugging Face endpoints.

- **Data Flow & Security:**  
  Ensure data movement between clouds (if Gemini is not hosted in Azure) is encrypted and compliant with data residency requirements.

- **Redundancy & High Availability:**  
  Specify failover and backup strategies for Chroma DB and any self-hosted components.

- **Cost Optimization:**  
  Review the cost implications of cross-cloud data transfer and self-hosted services.

---

## 3. ⚠️ Highlight Gaps or Risks

### Potential Risks

- **Cross-Cloud Integration:**  
  Using Gemini Embeddings (Google) in an Azure-centric stack may introduce latency, egress costs, and compliance risks.

- **Operational Overhead:**  
  Chroma is not a managed service on Azure. This increases the burden of maintenance, scaling, patching, and backup.

- **Monitoring & Observability:**  
  No mention of centralized monitoring, alerting, or logging for all components, especially non-Azure services.

- **Fallback Handling:**  
  No details on how the system handles failures in model inference, vector DB unavailability, or API timeouts.

- **Data Governance:**  
  No mention of data encryption at rest/in transit, data retention policies, or audit logging.

### Missing Components

- **Monitoring & Alerting:**  
  Add Azure Monitor, Application Insights, or similar for end-to-end observability.

- **Disaster Recovery:**  
  Define backup, restore, and failover strategies for Chroma and any self-hosted models.

- **API Gateway / Rate Limiting:**  
  Consider Azure API Management for secure, scalable API exposure and throttling.

- **CI/CD Pipeline:**  
  No mention of DevOps practices for deployment and updates.

---

## **Summary Table**

| Area                | Strengths                                              | Gaps/Risks & Suggestions                                 |
|---------------------|-------------------------------------------------------|----------------------------------------------------------|
| Technology Stack    | Azure OpenAI, ADLS, Synapse, Power BI, AAD are strong | Gemini Embeddings (cross-cloud), Chroma (self-hosted)    |
| Integration         | Azure Functions/AKS for orchestration                  | No mention of monitoring, fallback, or DR                |
| Security            | AAD for auth                                           | Data governance, encryption, and compliance not detailed |
| Observability       | N/A                                                    | Add monitoring/logging for all components                |
| Cost & Operations   | Managed Azure services                                 | Cross-cloud egress, self-hosted DB operational overhead  |

---

## **Conclusion**

- The architecture is fundamentally sound and leverages many best-practice Azure services.
- The use of Gemini Embeddings and Chroma DB introduces integration, operational, and compliance risks that should be carefully justified or mitigated.
- Add explicit monitoring, fallback, and disaster recovery mechanisms.
- Consider Azure-native alternatives for embeddings and vector storage for tighter integration, lower operational overhead, and improved security/compliance.

**Action Items:**  
- Re-evaluate embedding and vector DB choices for Azure-native options.  
- Add observability, DR, and security details to the design.  
- Clarify how cross-cloud data flows are secured and monitored.

---

**End of Review**

In [None]:
from langchain_core.prompts import ChatPromptTemplate

prompt = """
You are a Solution Architect and Technical Reviewer.

Your task is to review the **System Design Document** of an AI- RAG based solution implementation. The system uses cloud-native architecture and integrates various AI, data, and analytics components.

Please evaluate the following aspects:

🏗️ System and Solution Design Evaluation**
   - Review architecture technical components such as Azure OpenAI, Hugging Face, Chroma, ADLS, Synapse, and Power BI.
   - Validate the technical componentsused in this solution against business requirement and enterprise-grade cloud best practices (modularity, scalability, fault tolerance).
   - Suggest improvements where applicable.
   -  Confirm whether the chosen technology is the best fit for its purpose in the solution.

You are a Solution Architect and Technical Reviewer.

Your task is to answer the following question in a strict format with tabular format.

👉 Your response in tabulat format should have:
- Technology Component -Example ADLS,Synapse"
-  Best Fit -Yes or NO
-  Explaination - ADLS is good to storing the chat interactions
- "Suggestions.

Then provide a brief explanation (1–3 sentences), based ONLY on the system design document provided.


Do not fabricate details. Base your review **strictly** on the provided system design documentation.



System Design Context:
{context}

Answer:
"""


prompt_template_2 = ChatPromptTemplate.from_template(prompt)

In [None]:
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

qa_rag_chain = (
    {
        "context": (final_retriever
                      |
                    format_docs),
        "question": RunnablePassthrough()
    }
      |
    prompt_template_2
      |
    chatgpt
)

In [None]:

query = """Evaluate each technical component listed in the system design document for its suitability, scalability, and alignment with Azure best practices.
"""
result = qa_rag_chain.invoke(query)
display(Markdown(result.content))

| Technology Component         | Best Fit | Explanation                                                                                   | Suggestions                                                                                   |
|-----------------------------|----------|----------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------|
| Azure OpenAI                | Yes      | Provides enterprise-grade, scalable LLM APIs for natural language understanding and generation.| Ensure quota management and fallback to alternative models if service limits are reached.      |
| Hugging Face RAG            | Yes      | Enables flexible retrieval-augmented generation using custom/internal documents.              | Consider containerizing for portability; monitor for model drift and update as needed.         |
| Gemini Embeddings           | Yes      | High-quality embeddings for semantic search and vectorization of documents/queries.           | Validate compatibility with Chroma; monitor embedding model updates for improvements.          |
| Chroma Vector DB            | Yes      | Efficient vector storage and similarity search; supports sharding and read replicas.          | Evaluate managed vector DB options (e.g., Azure AI Search) for enterprise support and scaling. |
| Azure Data Lake Storage     | Yes      | Secure, scalable storage for raw documents, logs, and audit data.                            | Implement data lifecycle policies and encryption at rest; monitor access patterns.             |
| Azure Synapse Analytics     | Yes      | Scalable analytics for usage, trends, and business KPIs; integrates well with ADLS.           | Optimize data pipelines for cost and performance; automate scaling based on workload.          |
| Power BI                    | Yes      | Robust dashboarding and visualization; supports live and imported data models.                | Use row-level security for sensitive data; monitor refresh schedules for timely insights.      |
| Azure Functions             | Yes      | Serverless orchestration for event-driven processing; auto-scales with demand.                | Monitor cold start latency; use durable functions for long-running workflows if needed.        |
| Azure Kubernetes Service    | Yes      | Hosts scalable RAG inference and orchestration logic; supports horizontal scaling.            | Ensure proper resource limits and autoscaling; implement health checks and rolling updates.    |
| Azure Active Directory      | Yes      | Enterprise-grade authentication and access control; integrates with Azure services.           | Regularly audit access policies; enable conditional access and MFA for enhanced security.      |

**Brief Explanation:**  
The system design leverages modular, cloud-native Azure components and best-in-class AI/analytics tools, ensuring scalability, security, and enterprise readiness. All chosen technologies are appropriate for their roles, with minor suggestions for managed alternatives and operational enhancements to further improve reliability and maintainability.

In [None]:
query = "What are the technical components listed in the system design document"
docs = final_retriever.invoke(query)

In [11]:
from langchain_core.prompts import ChatPromptTemplate

prompt = """
You are a Quality Assurance (QA) Lead and Test Reviewer.

Your task is to review the Test Case Document in the context of the Business Requirement Document (BRD). Based on this, produce a formal QA report that:

1. ✅ **Test Coverage Analysis**
   - Evaluate whether the test cases cover all key business requirements from the BRD.
   - Confirm that all functional, integration, non-functional (performance, security), and edge scenarios have test coverage.
   - Highlight any missing or incomplete test cases.

2. 🧪 **Test Execution Analysis**
   - Review the results of executed test cases (Pass/Fail/Not Executed).
   - Summarize the number and percentage of tests passed, failed, and skipped.
   - Highlight any failed test cases that directly impact critical business flows.

3. 📊 **Comparison with Business Requirements**
   - Cross-map test cases to the stated business goals and validate if each requirement is tested.
   - Identify any requirements that are not tested or insufficiently validated.
   - Call out high-risk gaps or mismatches between the BRD and testing.

4. ⚠️ **Gap Identification & Risk Evaluation**
   - Identify any uncovered use cases, untested integrations, or missing validation steps.
   - Mention risks to production stability or compliance due to incomplete QA.

5. 🚦 **Final Go/No-Go Recommendation**
   - Conclude with a "Go" or "No-Go" release recommendation.
   - Justify the decision clearly based on testing completeness, quality, and alignment with the business goals.

Do not assume or fabricate information. Base your analysis strictly on the provided documents.


Question:
{question}

Context:
{context}
"""


prompt_test_template_1 = ChatPromptTemplate.from_template(prompt)

In [12]:
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

qa_rag_chain = (
    {
        "context": (final_retriever
                      |
                    format_docs),
        "question": RunnablePassthrough()
    }
      |
    prompt_test_template_1
      |
    chatgpt
)

In [40]:
from IPython.display import Markdown, display
query = "Based on the test case document and the business requirements, evaluate Chatbot Functionality, RAG Flow (Embedding, Retrieval, Generation), Integration with Azure OpenAI and Chroma DB, Role-Based Access Control (Azure AD), Logging & Audit Trail, Analytics & Dashboard, CI/CD and Deployment Validation, Non-Functional Testing (Performance, Load, Security) whether the all test cases are covered , and highlight any of the test cases missed to consider in testing, and sufficient for production release. Provide a detailed QA evaluation report with a final Go/No-Go recommendation"
result = qa_rag_chain.invoke(query)
display(Markdown(result.content))

---

# **QA Evaluation Report: Chatbot Solution**

**Prepared by:** [QA Lead Name]  
**Date:** [Insert Date]  
**Scope:** Review of Test Case Document vs. Business Requirement Document (BRD)  
**Release Recommendation:** See Section 5

---

## 1. ✅ Test Coverage Analysis

### **Business Requirements & Test Coverage**

| Area                                      | Coverage in Test Cases | Comments                                                                                  |
|--------------------------------------------|-----------------------|-------------------------------------------------------------------------------------------|
| **Chatbot Functionality**                  | TC001, TC002, TC005, TC010, TC012, TC013 | Core and edge scenarios (basic, fallback, escalation, multi-turn, invalid/large input) covered. |
| **RAG Flow (Embedding, Retrieval, Generation)** | TC003, TC009, TC011   | Embedding, retrieval accuracy, and similarity tested.                                      |
| **Integration: Azure OpenAI & Chroma DB**  | TC003, TC009, TC011   | Integration validated via retrieval and embedding checks.                                  |
| **Role-Based Access Control (Azure AD)**   | TC006                 | Unauthorized access tested. No explicit test for authorized access or role switching.      |
| **Logging & Audit Trail**                  | **Not Explicitly Covered** | No test case directly validating logging or audit trail functionality.                     |
| **Analytics & Dashboard**                  | TC008                 | Dashboard update after interaction tested.                                                 |
| **CI/CD & Deployment Validation**          | TC014, TC015          | Pipeline and post-deployment checks performed.                                             |
| **Non-Functional (Performance, Load, Security)** | TC004 (Performance), **No explicit Load/Security** | Response time tested. No explicit load or security (e.g., penetration, DDoS) test cases.   |

### **Edge & Integration Scenarios**

- **Edge Cases:** Invalid input, large input, multi-turn, and unsupported queries are covered.
- **Integration:** Azure OpenAI and Chroma DB integration tested via RAG scenarios.
- **Non-Functional:** Only response time (performance) is explicitly tested.

### **Missing/Incomplete Test Cases**

- **Logging & Audit Trail:** No test case to verify that user actions, queries, or admin activities are logged and auditable.
- **Load Testing:** No evidence of high concurrency or sustained load testing.
- **Security Testing:** No explicit test for vulnerabilities, penetration, or compliance (e.g., data leakage, unauthorized access attempts beyond role-based).
- **Role-Based Access (Positive):** Only negative (unauthorized) scenario tested; no positive (authorized) access validation.
- **Integration Failure Handling:** No test for what happens if Chroma DB or Azure OpenAI is unavailable.
- **Data Privacy/Compliance:** No test for GDPR or data retention requirements if applicable.

---

## 2. 🧪 Test Execution Analysis

### **Summary of Results**

- **Total Test Cases Executed:** 21 (15 Functional, 6 Non-Functional)
- **Passed:** 21 (100%)
- **Failed:** 0 (0%)
- **Skipped/Not Executed:** 0

### **Critical Business Flows**

- All executed test cases passed, including those covering core chatbot, RAG, access control, and deployment.
- **No failed test cases** impacting critical business flows.

---

## 3. 📊 Comparison with Business Requirements

### **Cross-Mapping & Validation**

| Business Requirement                | Test Case(s) | Sufficiently Tested? | Comments                                  |
|-------------------------------------|--------------|----------------------|-------------------------------------------|
| Chatbot answers user queries        | TC001, TC010 | Yes                  | Basic and multi-turn covered              |
| Handles unsupported queries         | TC002, TC005 | Yes                  | Fallback and escalation tested            |
| RAG: Embedding, Retrieval, Generation | TC003, TC009, TC011 | Yes          | Accuracy and similarity tested            |
| Integration with Azure OpenAI/Chroma | TC003, TC009, TC011 | Yes          | Via RAG scenarios                        |
| Role-Based Access (Azure AD)        | TC006        | **Partially**        | Only unauthorized access tested           |
| Logging & Audit Trail               | —            | **No**               | No test case for logging/audit            |
| Analytics & Dashboard               | TC008        | Yes                  | Dashboard update tested                   |
| CI/CD & Deployment                  | TC014, TC015 | Yes                  | Pipeline and post-deployment tested       |
| Non-Functional: Performance         | TC004        | Yes                  | Response time tested                      |
| Non-Functional: Load, Security      | —            | **No**               | No explicit load or security tests        |

### **High-Risk Gaps**

- **Logging & Audit Trail:** Not validated; risk to compliance and traceability.
- **Security & Load:** No evidence of penetration or load testing; risk to production stability under attack or high usage.
- **Role-Based Access (Positive):** No test for authorized user access; risk of access misconfiguration.

---

## 4. ⚠️ Gap Identification & Risk Evaluation

### **Uncovered Use Cases / Missing Validation**

- **Logging/Audit:** No validation that actions are logged or auditable.
- **Load/Stress:** No evidence system can handle expected or peak user loads.
- **Security:** No validation against common vulnerabilities (e.g., injection, XSS, privilege escalation).
- **Integration Failure Handling:** No test for graceful degradation if external services fail.
- **Positive Role-Based Access:** No test for correct access for authorized users.

### **Risks to Production Stability/Compliance**

- **Compliance:** Lack of audit/logging tests may violate regulatory or internal audit requirements.
- **Stability:** Absence of load and security testing may lead to outages or breaches under real-world conditions.
- **Access Control:** Incomplete RBAC testing could result in unauthorized data exposure.

---

## 5. 🚦 Final Go/No-Go Recommendation

### **Recommendation: NO-GO**

#### **Justification:**

- **Test Coverage:** While all executed test cases passed, there are critical gaps in logging/audit, security, load, and positive access control validation.
- **Business Alignment:** Not all business and compliance requirements are sufficiently validated.
- **Risk:** Releasing without addressing these gaps exposes the system to compliance, security, and stability risks.

#### **Required Actions Before Go-Live:**

1. **Add and execute test cases for:**
   - Logging & audit trail validation.
   - Load and stress testing (simulate peak user loads).
   - Security/penetration testing (OWASP Top 10, etc.).
   - Positive role-based access (authorized user scenarios).
   - Integration failure handling (simulate external service outages).

2. **Reassess after above tests are executed and passed.**

---

**Summary:**  
The current QA evidence demonstrates strong coverage of core chatbot and RAG flows, but omits critical non-functional and compliance-related validations. The release is **NOT recommended** until these gaps are addressed and validated.

---

In [13]:
!pip install deepeval
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval import evaluate
from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric,
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    HallucinationMetric,
    GEval,
)





Collecting deepeval
  Downloading deepeval-3.2.2-py3-none-any.whl.metadata (17 kB)
Collecting anthropic (from deepeval)
  Downloading anthropic-0.55.0-py3-none-any.whl.metadata (27 kB)
Collecting click<8.2.0,>=8.0.0 (from deepeval)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting ollama (from deepeval)
  Downloading ollama-0.5.1-py3-none-any.whl.metadata (4.3 kB)
Collecting portalocker (from deepeval)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting posthog<4.0.0,>=3.23.0 (from deepeval)
  Downloading posthog-3.25.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting pyfiglet (from deepeval)
  Downloading pyfiglet-1.0.3-py3-none-any.whl.metadata (7.4 kB)
Collecting pytest-asyncio (from deepeval)
  Downloading pytest_asyncio-1.0.0-py3-none-any.whl.metadata (4.0 kB)
Collecting pytest-repeat (from deepeval)
  Downloading pytest_repeat-0.9.4-py3-none-any.whl.metadata (4.9 kB)
Collecting pytest-rerunfailures<13.0,>=12.0 (from deepeval)
  Dow

In [38]:
query = "Based on the test case document and the business requirements, evaluate whether the all test cases are covered , and highlight any of the test cases missed to consider in testing, and sufficient for production release. Provide a detailed QA evaluation report with a final Go/No-Go recommendation"
docs = final_retriever.invoke(query)

In [15]:
os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"] = "YES"

In [30]:
from langchain_openai import ChatOpenAI
from deepeval.models.base_model import DeepEvalBaseLLM

class LangChainModelWrapper(DeepEvalBaseLLM):
    def __init__(self, model):
        self.model = model

    def generate(self, prompt: str, **kwargs) -> str:
        return self.model.invoke(prompt).content

    async def a_generate(self, prompt: str, **kwargs) -> str:
        # fallback: call sync method for compatibility
        return self.generate(prompt)

    def get_model_name(self) -> str:
        return "LangChain-ChatOpenAI"

    def load_model(self):
        pass




In [18]:
query = "Based on the test case document and the business requirements, evaluate whether the all test cases are covered , and highlight any of the test cases missed to consider in testing, and sufficient for production release. Provide a detailed QA evaluation report with a final Go/No-Go recommendation"


In [16]:
human_answer = """As per the business requirement  Auditing module ,data in report and database validations,integration testing and performance scenarios are not coved in detail"""

In [20]:
test_case = LLMTestCase(
    input=query,
    actual_output=qa_rag_chain.invoke(query).content,
    context=[human_answer]
)

metric = HallucinationMetric(
    threshold=0.6,
    model="gpt-4-turbo",               # ✅ let DeepEval handle the model directly
    include_reason=True,
    verbose_mode=True
)

result = evaluate([test_case], [metric])

Output()



Metrics Summary

  - ❌ Hallucination (score: 1.0, threshold: 0.6, strict: False, evaluation model: gpt-4-turbo, reason: The score is 1.00 because the actual output fails to cover essential details specified in the context, specifically regarding 'Auditing module, data in report and database validations, integration testing and performance scenarios'., error: None)

For test case:

  - input: Based on the test case document and the business requirements, evaluate whether the all test cases are covered , and highlight any of the test cases missed to consider in testing, and sufficient for production release. Provide a detailed QA evaluation report with a final Go/No-Go recommendation
  - actual output: ---

# **QA Evaluation Report: AI Chatbot Solution**

## 1. ✅ Test Coverage Analysis

**a. Coverage of Key Business Requirements**

Based on the summary and test coverage areas, the following business requirements from the BRD appear to be addressed:

- **Chatbot Core Functionality:** Te

In [21]:
print('Sucess:', result.test_results[0].metrics_data[0].success)
print('Score:', result.test_results[0].metrics_data[0].score)
print('Reason:', result.test_results[0].metrics_data[0].reason)

Sucess: False
Score: 1.0
Reason: The score is 1.00 because the actual output fails to cover essential details specified in the context, specifically regarding 'Auditing module, data in report and database validations, integration testing and performance scenarios'.


In [34]:
os.environ["OPENAI_API_KEY"] = api_creds['openai_key']

metric = AnswerRelevancyMetric(
    threshold=0.6,
    model="gpt-4-turbo",               # ✅ let DeepEval handle the model directly
    include_reason=True,
    verbose_mode=True
)

test_case = LLMTestCase(
    input=query,
    actual_output=qa_rag_chain.invoke(query).content,
)

result = evaluate([test_case], [metric])

Output()



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.6, strict: False, evaluation model: gpt-4-turbo, reason: The score is 1.00 because the response fully addresses the input by evaluating the testing against the business requirements and goals, and provides a detailed QA evaluation report with a clear Go/No-Go recommendation, without any irrelevant statements., error: None)

For test case:

  - input: Based on the test case document and the business requirements, evaluate whether the testing is complete, aligned with the business goals, and sufficient for production release. Provide a detailed QA evaluation report with a final Go/No-Go recommendation
  - actual output: ---

# **QA Evaluation Report: AI Chatbot Solution**

## 1. ✅ Test Coverage Analysis

**a. Coverage of Business Requirements:**
- The test case document indicates that testing was performed across all major business requirement areas as outlined in the BRD:
  - **Chatbot Functionality:** Core conversation

In [35]:
print('Sucess:', result.test_results[0].metrics_data[0].success)
print('Score:', result.test_results[0].metrics_data[0].score)
print('Reason:', result.test_results[0].metrics_data[0].reason)

Sucess: True
Score: 1.0
Reason: The score is 1.00 because the response fully addresses the input by evaluating the testing against the business requirements and goals, and provides a detailed QA evaluation report with a clear Go/No-Go recommendation, without any irrelevant statements.


In [31]:

# Your LangChain model
api_key=api_creds['openai_key']
chatgpt = ChatOpenAI(model="gpt-4", api_key=api_key)

# Wrap it
wrapped_model = LangChainModelWrapper(chatgpt)

test_case = LLMTestCase(
    input=query,
    actual_output=qa_rag_chain.invoke(query).content,
)

metric = AnswerRelevancyMetric(
    threshold=0.6,
    model=wrapped_model,
    include_reason=True,
    verbose_mode=True,
    async_mode=False
)

result = evaluate([test_case], [metric])






Output()

AttributeError: 'str' object has no attribute 'statements'

In [None]:
test_case = LLMTestCase(
    input=query,
    actual_output=qa_rag_chain.invoke(query),
)

metric = AnswerRelevancyMetric(
    threshold=0.6,
    model=chatgpt,
    include_reason=True,
    verbose_mode=True
)

result = evaluate([test_case], [metric])

TypeError: Unsupported type for model: <class 'langchain_openai.chat_models.base.ChatOpenAI'>. Expected None, str, DeepEvalBaseLLM, or GPTModel.