# BRD to User Stories

> # Install necessary libraries

In [2]:
!pip install streamlit==1.38.0 \
PyPDF2==3.0.1 \
python-docx==0.8.11 \
python-pptx==0.6.21 \
pandas==2.2.2 \
openai \
langchain \
langchain_openai \
faiss-cpu==1.8.0 \
langchain-community==0.3.0 \
langchain-core==0.3.5 \
scikit-learn \
openpyxl \
pypdf

Collecting pandas==2.2.2
  Using cached pandas-2.2.2-cp39-cp39-win_amd64.whl.metadata (19 kB)
Using cached pandas-2.2.2-cp39-cp39-win_amd64.whl (11.6 MB)
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.1.1
    Uninstalling pandas-2.1.1:
      Successfully uninstalled pandas-2.1.1
Successfully installed pandas-2.2.2


  You can safely remove it manually.
  You can safely remove it manually.


> # Import necessary libraries

In [5]:
import os
import time
import pandas as pd
from PyPDF2 import PdfReader
from docx import Document
import pptx
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from app_secret import OPENAI_API_KEY

> # Set OpenAI API key

In [6]:
# Publish the key imported from app_secret through the process environment
# (the model and embedding constructors in this notebook read it from
# os.environ), then keep a module-level copy read back from the environment.
os.environ.update({"OPENAI_API_KEY": OPENAI_API_KEY})
openai_api_key = os.getenv("OPENAI_API_KEY")

> # Setup the Langchain Base Model

In [8]:
# Chat model used by the retrieval QA chain: "gpt-4o" with temperature 0.9
# and max_tokens 1500, authenticated with the key stored in the environment.
llm = ChatOpenAI(
    model_name="gpt-4o",
    temperature=0.9,
    max_tokens=1500,
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

> # Set the folder containing the BRD documents

In [17]:
# Folder whose entries process_all_files() lists and extracts text from.
# The path is relative, so it is resolved against the kernel's current
# working directory.
DOCUMENTS_FOLDER = "data/file_folder"

> # Function to extract text from various file types

In [20]:

def extract_text_from_file(filepath):
    """Extract plain text from one document, dispatching on its extension.

    The extension is compared case-insensitively. Handled types:

    * ``.pdf``           - text of every page, each non-empty page ending in a
                           newline so words at page boundaries do not fuse.
    * ``.docx``          - text of every paragraph, one per line.
    * ``.txt``           - file contents decoded as UTF-8 (a leading BOM, as
                           written by some Windows editors, is dropped).
    * ``.xlsx``/``.xls`` - ``DataFrame.to_string()`` of every sheet, sheets
                           separated by a newline.
    * ``.pptx``/``.ppt`` - text of every shape exposing a ``text`` attribute,
                           one per line.

    Args:
        filepath: Path of the document to read.

    Returns:
        str: The extracted text, or ``""`` when the extension is not one of
        the handled types or the document yields no text.

    Raises:
        Errors from the underlying readers (missing file, corrupt document,
        undecodable text) are not caught here and propagate to the caller.
    """
    file_ext = os.path.splitext(filepath)[1].lower()

    # Handle PDF files
    if file_ext == ".pdf":
        with open(filepath, "rb") as file:
            pdf_reader = PdfReader(file)
            # extract_text() can come back empty/None; such pages are skipped
            # so a text-free PDF still yields "" (falsy) for the caller's check.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
        return "".join(page_text + "\n" for page_text in page_texts if page_text)

    # Handle Word (.docx) files
    if file_ext == ".docx":
        doc = Document(filepath)
        return "".join(para.text + "\n" for para in doc.paragraphs)

    # Handle text (.txt) files
    if file_ext == ".txt":
        # "utf-8-sig" reads plain UTF-8 unchanged and strips a leading BOM,
        # which would otherwise leak into the text as "\ufeff".
        with open(filepath, "r", encoding="utf-8-sig") as file:
            return file.read()

    # Handle Excel files (.xlsx, .xls)
    if file_ext in (".xlsx", ".xls"):
        # sheet_name=None loads every sheet (dict of name -> DataFrame) instead
        # of only the first one; a single-sheet workbook renders as before.
        # NOTE(review): legacy .xls presumably needs an extra engine (xlrd)
        # that the install cell does not list - confirm.
        sheets = pd.read_excel(filepath, sheet_name=None)
        return "\n".join(sheet.to_string() for sheet in sheets.values())

    # Handle PowerPoint files (.pptx, .ppt)
    if file_ext in (".pptx", ".ppt"):
        # NOTE(review): python-pptx targets the .pptx format; a legacy binary
        # .ppt is expected to fail on open - confirm whether .ppt should stay.
        ppt = pptx.Presentation(filepath)
        return "".join(
            shape.text + "\n"
            for slide in ppt.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        )

    # Unrecognised extension: nothing to extract.
    return ""

> # Function to process all files and extract combined text

In [21]:
def process_all_files(folder=None):
    """Extract and combine the text of every file in a folder.

    Entries are visited in sorted name order so the combined text (and the
    chunks later derived from it) is the same on every run; os.listdir() on
    its own returns entries in arbitrary order. Anything that is not a
    regular file (e.g. a sub-directory) is skipped.

    Args:
        folder: Directory to scan. Defaults to DOCUMENTS_FOLDER, looked up at
            call time so a later reassignment of the constant is honoured.

    Returns:
        str: The non-empty per-file texts joined with a newline, so the last
        word of one document cannot fuse with the first word of the next.
        Empty string when nothing could be extracted.

    Raises:
        OSError: If the folder does not exist or cannot be listed. Errors
        raised by extract_text_from_file() propagate as well.
    """
    if folder is None:
        folder = DOCUMENTS_FOLDER

    extracted_texts = []
    for filename in sorted(os.listdir(folder)):
        filepath = os.path.join(folder, filename)
        if not os.path.isfile(filepath):
            continue
        file_text = extract_text_from_file(filepath)
        if file_text:
            extracted_texts.append(file_text)
    return "\n".join(extracted_texts)

> # Function to create vector store from extracted text

In [23]:
def create_vector_store(text):
    """Split text into overlapping chunks and index them in a FAISS store.

    Args:
        text: The combined document text to index.

    Returns:
        The FAISS vector store built from the chunk embeddings.

    Raises:
        ValueError: If ``text`` is empty or whitespace-only. Without this
            check the splitter is expected to yield no chunks and the FAISS
            builder to fail with an opaque error.
        KeyError: If the OPENAI_API_KEY environment variable is not set.
    """
    if not text or not text.strip():
        raise ValueError("Cannot build a vector store from empty text.")

    text_splitter = RecursiveCharacterTextSplitter(
        # A list of separators, tried in order: newlines first, then spaces,
        # then single characters. The fallbacks let a line longer than
        # chunk_size be split further instead of becoming an oversized chunk.
        separators=["\n", " ", ""],
        chunk_size=800,
        chunk_overlap=50,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)
    # Pass the API key as a keyword argument
    embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])
    return FAISS.from_texts(chunks, embeddings)

> # Main logic to process documents, create vector store, and handle user query

In [25]:
# Main logic to process documents, create vector store, and handle user query
if __name__ == "__main__":
    print("Starting BRD to User Story conversion...")

    start_time = time.time()
    text = process_all_files()

    # If text extraction is successful, set up vector store and allow queries
    if text:
        vector_store = create_vector_store(text)
        # Freeze the loading duration now (extraction + indexing). Computing it
        # at the end, as before, silently added the query time to this figure.
        document_loading_seconds = time.time() - start_time

        # Define the prompt message
        prompt_message = (
            "You are a senior business analyst responsible for analyzing a Business Requirement Document (BRD). "
            "Carefully read through the BRD, understand the business needs, and write detailed user stories that capture "
            "the functional and non-functional requirements. Each user story should follow the format: "
            "As a [type of user], I want [an action/feature] so that [goal or benefit]. Ensure user stories are clear, "
            "detailed, and reflect all necessary acceptance criteria and edge cases. Consider all possible scenarios and "
            "workflows to ensure the stories fully align with the requirements and objectives in the BRD. Write the user "
            "stories step-by-step and cover all business needs specified in the document."
        )

        # Example of a specific query to use with the prompt message
        # NOTE(review): user_question is defined but never sent to the chain;
        # prompt_message is used both for retrieval and as the question.
        user_question = "Generate user stories based on the BRD content."

        start_query_time = time.time()
        # Shown for inspection only; the chain below runs its own retrieval.
        matches = vector_store.similarity_search(prompt_message, k=3)  # Retrieve top 3 similar texts
        print("Matches:", matches)

        # Set up the retrieval QA chain with the prompt
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vector_store.as_retriever()
        )
        response = qa_chain.invoke({"query": prompt_message})
        print("\nResponse:", response['result'])

        # Display timing info for performance insights
        print(f"\nDocument loading time: {document_loading_seconds:.2f} seconds")
        print(f"Query processing time: {time.time() - start_query_time:.2f} seconds")
    else:
        # Previously the cell ended silently when nothing was extracted.
        print(f"No text could be extracted from the files in '{DOCUMENTS_FOLDER}'.")

Starting BRD to User Story conversion...
Matches: [Document(metadata={}, page_content="By incorporating user stories and use cases, developers can create a system that aligns with users' \neveryday workflows.  \n10. Implementation Timeline  \nThe BRD should outline a realistic timeline for the EMS project, including key phases like:  \n1. Requirements Gathering : Understanding the company’s needs, interviewing \nstakeholders, and finalizing requirements.  \n2. Design Phase : Creating wireframes, system architecture, and database design.  \n3. Development : Coding the EMS based on functional and non -functional requirements.  \n4. Testing : Conducting rigorous testing to identify and fix bugs, including usability, \nperformance, and security testing.  \n5. Deployment : Launching the EMS and conducting user training sessions."), Document(metadata={}, page_content='The Business Requirement Document  is a formal report that outlines a project’s purpose, \nscope, and requirements from a bus

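Check the accuracy of this model, expressed as a percentage. The intent is to use TruLens for this; the cell below does not use it yet and only compares placeholder answers by exact string match.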

In [None]:
# prompt: check the accuracy of this model as percentage of accuracy

def count_exact_matches(expected, predicted):
    """Count positions where the prediction equals the ground truth exactly.

    Args:
        expected: Sequence of ground-truth answers.
        predicted: Sequence of model predictions, aligned with ``expected``.

    Returns:
        int: Number of positions where the two values are equal.

    Raises:
        ValueError: If the sequences differ in length (zip would otherwise
            silently drop the unmatched tail) or if ``expected`` is empty
            (the accuracy ratio would divide by zero).
    """
    if len(expected) != len(predicted):
        raise ValueError(
            f"Expected {len(expected)} predictions but got {len(predicted)}."
        )
    if not expected:
        raise ValueError("At least one ground-truth answer is required.")
    return sum(1 for truth, guess in zip(expected, predicted) if truth == guess)


# Assuming you have a list of ground truth answers and a list of model predictions
# NOTE(review): these are placeholder values, and exact string equality is a
# very strict criterion for free-form generated text.
ground_truth_answers = ["Answer 1", "Answer 2", "Answer 3", "Answer 4"]
model_predictions = ["Answer 1", "Answer 2", "Answer 3", "Answer 4"]

# Calculate the number of correct predictions
correct_predictions = count_exact_matches(ground_truth_answers, model_predictions)

# Calculate the accuracy as a percentage
accuracy = (correct_predictions / len(ground_truth_answers)) * 100

print(f"Model Accuracy: {accuracy:.2f}%")

