In [2]:
""" !pip install pytube
!pip install openai-whisper
!pip install langchain
!pip install openai
!pip install python-dotenv """

' !pip install pytube\n!pip install openai-whisper\n!pip install langchain\n!pip install openai\n!pip install python-dotenv '

In [4]:
""" !pip install -U langchain-community
!pip install pinecone-client
!pip install --upgrade pytube
!pip install yt-dlp
!pip install imageio[ffmpeg]
!pip install gradio 
!pip install google-search-results """

' !pip install -U langchain-community\n!pip install pinecone-client\n!pip install --upgrade pytube\n!pip install yt-dlp\n!pip install imageio[ffmpeg]\n!pip install gradio \n!pip install google-search-results '

In [5]:
from torch import cuda, torch
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

In [6]:
# Standard library imports
import os
import getpass
import json
import time

# Third-party library imports
from torch import cuda
from dotenv import load_dotenv, find_dotenv
import yt_dlp
from whisper import load_model
from tqdm.auto import tqdm
import gradio as gr
from serpapi import GoogleSearch

# Pinecone SDK imports
from pinecone import Pinecone, ServerlessSpec

# LangChain imports
from langchain.agents import initialize_agent, Tool
from langchain.schema import HumanMessage, SystemMessage
from langchain.memory import ConversationBufferMemory
from langchain_community.chat_models import ChatOpenAI
from langchain.vectorstores import Pinecone as LangChainPinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:

# Load Environment Variable
_ = load_dotenv(find_dotenv())

# Set the environment variable directly in the script
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OpenAI API Key: ")
os.environ["PINECONE_API_KEY"] = getpass.getpass("Enter Pinecone API key: ")
os.environ["SERPAPI_API_KEY"] = getpass.getpass("Enter SerpAPI key: ")
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("Enter LANGCHAIN_API_KEY: ")

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "prj-yt-chatbot-prefinal"


In [16]:
# STEP 1: Load VIDEO, Download Audio, Transcribe into different text files!!!

import os
import yt_dlp
from whisper import load_model
from langchain.agents import initialize_agent, Tool
from langchain.schema import HumanMessage
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI

# Default configurations
VIDEO_LINKS_FILE = "video_links.txt"
AUDIO_OUTPUT_TEMPLATE = "audio_{}"
WHISPER_MODEL = "base"
TRANSCRIPT_DIR = "transcriptions"

# Ensure directory exists for transcriptions
os.makedirs(TRANSCRIPT_DIR, exist_ok=True)

# Load YouTube links from the file
def load_video_links(filename=VIDEO_LINKS_FILE):
    with open(filename, 'r') as file:
        return [link.strip() for link in file.readlines() if link.strip()]

# Download audio from YouTube video
def download_audio(url, output_name):
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3', 'preferredquality': '192'}],
        'outtmpl': f"{output_name}",
        'ffmpeg_location': '/usr/bin/ffmpeg'
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return f"{output_name}.mp3"

# Transcribe audio using Whisper
def transcribe_audio_with_whisper(audio_path, model_name=WHISPER_MODEL):
    model = load_model(model_name)
    result = model.transcribe(audio_path)
    return result["text"]

# Process each video and save individual transcriptions
def transcribe_all_videos():
    video_links = load_video_links(VIDEO_LINKS_FILE)

    for i, video_url in enumerate(video_links):
        print(f"Processing video {i+1}/{len(video_links)}: {video_url}")
        audio_path = download_audio(video_url, AUDIO_OUTPUT_TEMPLATE.format(i))
        
        # Transcribe the audio and save each transcription to a separate file
        transcription = transcribe_audio_with_whisper(audio_path)
        transcript_file = os.path.join(TRANSCRIPT_DIR, f"transcription_{i+1}.txt")
        
        with open(transcript_file, 'w') as f:
            f.write(transcription)
        
        print(f"Completed transcription for video {i+1}")

# Execute transcription process only, without consolidation
if __name__ == "__main__":
    transcribe_all_videos()
    print("Individual transcription files are ready in the transcriptions folder.")


Processing video 1/8: https://www.youtube.com/watch?v=-RIfno4Sp0g
[youtube] Extracting URL: https://www.youtube.com/watch?v=-RIfno4Sp0g
[youtube] -RIfno4Sp0g: Downloading webpage
[youtube] -RIfno4Sp0g: Downloading ios player API JSON
[youtube] -RIfno4Sp0g: Downloading mweb player API JSON
[youtube] -RIfno4Sp0g: Downloading player 81ef9024
[youtube] -RIfno4Sp0g: Downloading m3u8 information
[info] -RIfno4Sp0g: Downloading 1 format(s): 251
[download] Destination: audio_0
[download] 100% of   48.70MiB in 00:00:00 at 51.13MiB/s    
[ExtractAudio] Destination: audio_0.mp3
Deleting original file audio_0 (pass -k to keep)


100%|███████████████████████████████████████| 139M/139M [00:02<00:00, 57.6MiB/s]


Completed transcription for video 1
Processing video 2/8: https://www.youtube.com/watch?v=3SCOVB1KDFc
[youtube] Extracting URL: https://www.youtube.com/watch?v=3SCOVB1KDFc
[youtube] 3SCOVB1KDFc: Downloading webpage
[youtube] 3SCOVB1KDFc: Downloading ios player API JSON
[youtube] 3SCOVB1KDFc: Downloading mweb player API JSON
[youtube] 3SCOVB1KDFc: Downloading m3u8 information
[info] 3SCOVB1KDFc: Downloading 1 format(s): 251
[download] Destination: audio_1
[download] 100% of   54.16MiB in 00:00:02 at 18.42MiB/s    
[ExtractAudio] Destination: audio_1.mp3
Deleting original file audio_1 (pass -k to keep)
Completed transcription for video 2
Processing video 3/8: https://www.youtube.com/watch?v=qMqP2IwhSnA
[youtube] Extracting URL: https://www.youtube.com/watch?v=qMqP2IwhSnA
[youtube] qMqP2IwhSnA: Downloading webpage
[youtube] qMqP2IwhSnA: Downloading ios player API JSON
[youtube] qMqP2IwhSnA: Downloading mweb player API JSON
[youtube] qMqP2IwhSnA: Downloading m3u8 information
[info] qMqP2Iw

In [8]:
#print(os.environ["PINECONE_API_KEY"])
from pinecone import ServerlessSpec

spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

In [9]:
# Create INDEX 

import os
import time
from tqdm.auto import tqdm
from pinecone import Pinecone, ServerlessSpec

# Initialize Pinecone client with the updated API
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
#spec = ServerlessSpec(cloud="aws", region="us-west-1")          # Specify Pinecone environment
index_name = "transcription-accnt-earn-index-final"             # Create New Index

# Check if the index exists; create if it doesn’t
if index_name not in [index_info["name"] for index_info in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=1536,  # OpenAI embedding dimension
        metric="dotproduct",
        spec=spec
    )
    # Wait for the index to be ready
    while not pc.describe_index(index_name).status["ready"]:
        print("Waiting for index to be ready...")
        time.sleep(1)

# Connect to the index for upsertion
index = pc.Index(index_name)
time.sleep(1)
# view index stats
print("Index statistics:", index.describe_index_stats())

Index statistics: {'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [10]:
# STEP 3: Vectorize and Embed each Transcription file

from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

# Initialize embedding model
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")

# Function to split and vectorize text from each transcription file
def vectorize_individual_transcriptions(transcription_dir="transcriptions"):
    # Initialize list to hold vectorized segments for all files
    all_vectorized_segments = []
    
    # Iterate through each transcription file in the directory
    for filename in sorted(os.listdir(transcription_dir)):
        file_path = os.path.join(transcription_dir, filename)
        
        # Skip directories and hidden files
        if not os.path.isfile(file_path) or filename.startswith('.'):
            continue

        # Read transcription text
        with open(file_path, 'r') as f:
            transcription_text = f.read()
        
        # Split text into chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
        chunks = text_splitter.split_text(transcription_text)
        
        # Vectorize each chunk with metadata
        for chunk_idx, chunk in enumerate(chunks):
            embedding = embedding_model.embed_documents([chunk])[0]  # Get embedding
            
            # Metadata includes file identifier and chunk index for reference
            metadata = {
                "file_name": filename,
                "file_id": filename.replace("transcription_", "").replace(".txt", ""),
                "chunk_idx": chunk_idx,
            }
            all_vectorized_segments.append({
                "embedding": embedding,
                "text": chunk,
                "metadata": metadata
            })
        
        print(f"Vectorization complete for {filename}.")

    return all_vectorized_segments

# Execute the vectorization task
if __name__ == "__main__":
    vectorized_segments = vectorize_individual_transcriptions()
    print("All transcription files have been vectorized. Ready for storage in Pinecone.")


  embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")


Vectorization complete for transcription_1.txt.
Vectorization complete for transcription_2.txt.
Vectorization complete for transcription_3.txt.
Vectorization complete for transcription_4.txt.
Vectorization complete for transcription_5.txt.
Vectorization complete for transcription_6.txt.
Vectorization complete for transcription_7.txt.
Vectorization complete for transcription_8.txt.
All transcription files have been vectorized. Ready for storage in Pinecone.


In [11]:
# STEP 4 - UPSERT embeddings to Pinecone

from tqdm.auto import tqdm

# Function to upsert vectorized segments to Pinecone
def upsert_embeddings_to_pinecone(vectorized_segments, index):
    for segment in tqdm(vectorized_segments, desc="Upserting segments to Pinecone"):
        # Prepare data for upsert
        segment_id = segment["metadata"]["file_id"] + "_" + str(segment["metadata"]["chunk_idx"])  # Unique ID for each chunk
        embedding = segment["embedding"]
        metadata = segment["metadata"]
        metadata["text"] = segment["text"]  # Adds the chunk's text content

        # Upsert the segment into Pinecone
        try:
            index.upsert(vectors=[(segment_id, embedding, metadata)])
        except Exception as e:
            print(f"Error upserting segment {segment_id}: {e}")

    print("All segments upserted into Pinecone.")

# Example usage:
upsert_embeddings_to_pinecone(vectorized_segments, index)


Upserting segments to Pinecone:   0%|          | 0/349 [00:00<?, ?it/s]

All segments upserted into Pinecone.


In [12]:
# Example usage in SerpAPI tool setup
serp_api_key = os.environ["SERPAPI_API_KEY"]

In [13]:
# DEFINE ROUTER, RETRIEVE from VectorstoreDB and Retrieve from Google search functions!

from langchain.schema import HumanMessage, SystemMessage, AIMessage


# Initialize the embedding model and vector store
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=os.environ["OPENAI_API_KEY"])
chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])

# Pinecone index is already set up and embeddings are upserted
vector_store = LangChainPinecone(index=index, embedding=embedding_model, text_key="text")

# Initialize the router instructions
router_instructions = """You are an expert at routing a user question to a vectorstore or web search.

The vectorstore contains documents related to Accenture company Quarterly results, Revenue, Earnings for the last four quarters, sales, digital projects, cloud projects and employee details.

Use the vectorstore for questions on these topics. For all else, and especially for current events, use web-search.

Return JSON with a single key, "datasource", that is either 'websearch' or 'vectorstore' depending on the question."""

# Function to route query based on instructions
def route_query(query):
    routing_response = chat_model.invoke(
        [SystemMessage(content=router_instructions), HumanMessage(content=query)]
    )
    route_decision = json.loads(routing_response.content)
    return route_decision["datasource"]

# Function to retrieve from vector database
def retrieve_from_vectorstore(query):
    retriever = vector_store.as_retriever()
    results = retriever.get_relevant_documents(query)
    document_text = "\n\n".join([doc.page_content for doc in results])

    # Summarize the retrieved documents using LLM 
    prompt = (
        "Based on the following documents, provide a concise answer to the question:\n\n"
        f"Question: {query}\n\n"
        "Documents:\n"
        f"{document_text}\n\n"
        "Answer:"
    )    
    answer_response = chat_model.invoke([HumanMessage(content=prompt)])
    return answer_response.content
    
# Function to retrieve from Google search
def web_search(query):
    search = GoogleSearch({
        "q": query,
        "api_key": serp_api_key, 
        "num": 3  # Limit to top 3 results for conciseness
    })
    results = search.get_dict()

    # Extract relevant text from search results
    combined_text = "\n".join(
        result.get("snippet", "No snippet available")
        for result in results.get("organic_results", [])
    )
    
    # Summarize results using the LLM
    prompt = (
        f"Summarize the following search results to answer the question:\n\n"
        f"Question: {query}\n\n"
        f"Results:\n{combined_text}\n\n"
        "Answer:"
    )
    summary_response = chat_model.invoke([HumanMessage(content=prompt)])
    return summary_response.content  


# Main function to handle the query routing and retrieval
def handle_query(query):
    datasource = route_query(query)
    
    if datasource == "vectorstore":
        print("Routing to Vectorstore...")
        return retrieve_from_vectorstore(query)
    elif datasource == "websearch":
        print("Routing to Web Search...")
        return web_search(query)
    else:
        return "Unknown data source. Please check the router configuration."

  chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])
  vector_store = LangChainPinecone(index=index, embedding=embedding_model, text_key="text")


In [14]:
# DEFINE ALL THE TOOLS for ReACT agent!  RUN IT - this includes Chat history!

from langchain.agents import initialize_agent
from langchain.agents import Tool
from langchain.chat_models import ChatOpenAI
from serpapi import GoogleSearch
from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
import os

# Initialize memory
# memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
memory = ConversationBufferWindowMemory(memory_key="chat_history", k=5, return_messages=True)

# Define the Web search tool using SerpAPI
web_search_tool = Tool(
    name="Google Search",
    func=web_search,
    description="Retrieve relevant information using google web search for general or current event questions."
)

# Define tool for vector store retrieval
vector_store_tool = Tool(
    name="VectorStore Retrieval",
    func=retrieve_from_vectorstore,
    description="Retrieve relevant documents from the vector store for questions about Accenture's earnings, revenue, sales, projects, and employee details.",
)

# Revised prompt for the agent with improved answer formatting
agent_prompt = """
You are an expert assistant specializing in Accenture's financial data and competitor analysis.

For each user query:
1. Answer strictly based on provided context or vector data.
2. Be concise and use up to three sentences.
3. If the answer is found, respond directly by starting with "Final Answer:" without additional steps or thoughts.

Now, respond to the question:
{user_query}
"""

# Define a prompt template with placeholder for messages
prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content="You are a helpful assistant specializing in Accenture's financial data. Answer questions accurately."
        ),
        MessagesPlaceholder(variable_name="messages"),
    ]
)

chat_history = []

def add_to_chat_history(human_message, ai_message=None):
    """Add user and AI responses to the chat history."""
    chat_history.append(HumanMessage(content=human_message))
    if ai_message:
        chat_history.append(AIMessage(content=ai_message))


# Define the ReAct agent with the custom prompt in each tool
react_agent = initialize_agent(
    tools=[vector_store_tool, web_search_tool],
    agent="zero-shot-react-description",
    llm=chat_model,
    memory=memory,  # Integrate conversation memory
    verbose=True,
    max_iterations=5,  # Restrict maximum depth to avoid infinite loops
    handle_parsing_errors=True  # Retry on output parsing errors
)


# Main function to handle the query using the ReAct agent with the custom prompt
def handle_query_with_agent(query):
    # Format the query with the agent's prompt
    formatted_prompt = agent_prompt.format(user_query=query)
    
    # Add the user query to the chat history
    add_to_chat_history(query)

    # Use the prompt in the agent invocation
    try:
        response = react_agent({"input": formatted_prompt})
        # If a valid answer is provided, return it
        if "output" in response:
            ai_message = response["output"]
            add_to_chat_history(query, ai_message)
            return ai_message
            #return response["output"]
        else:
            # Fallback message if no answer found within the max iterations
            return "I don't know. Unable to find the answer within the provided context."
    except Exception as e:
        # In case of unexpected errors, provide a helpful message
        return f"Error encountered: {str(e)}"

  memory = ConversationBufferWindowMemory(memory_key="chat_history", k=5, return_messages=True)
  react_agent = initialize_agent(


In [15]:
# INTERACTIVE CHAT FUNCTION using ReACT Agent - REFINED 3! Works perfect! TUESDAY

import os
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory

# Initialize chat model and memory
chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])
# memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
memory = ConversationBufferWindowMemory(memory_key="chat_history", k=5, return_messages=True)


# Function to clear memory and chat history
def reset_memory():
    """Clear memory and chat history."""
    memory.clear()
    chat_history.clear()
    print("Memory and chat history have been reset.")

# Interactive chat function with datasource-based source tagging
def chatbot_interaction():
    print("Welcome to the Earnings Q&A Chatbot! Ask any question about Accenture.")
    while True:
        user_query = input("You: ")
        if user_query.lower() in ["exit", "quit", "no"]:
            print("Agent: Thank you for using the Q&A chatbot. Goodbye!")
            break
        
        # Determine the datasource
        datasource = route_query(user_query)
        
        # Tag source based on datasource
        source_label = "VectorDB" if datasource == "vectorstore" else "Google Search" if datasource == "websearch" else "Unknown Source"        
        
        try:
            # Get response from the handle_query_with_agent function instead of calling react_agent directly
            answer = handle_query_with_agent(user_query)
            print(f"\nAgent (using {source_label}):\nQuestion: {user_query}\nAnswer: {answer}")
        
        except Exception as e:
            print(f"Error encountered: {e}")
            print("Agent: Unable to process the request. Please try again.")
        
        print("\nAgent: Do you have more questions?")

# Start the chatbot interaction
chatbot_interaction()

Welcome to the Earnings Q&A Chatbot! Ask any question about Accenture.


  response = react_agent({"input": formatted_prompt})




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use the VectorStore Retrieval tool to find information about Accenture's EPS in the fourth quarter of 2024.
Action: VectorStore Retrieval
Action Input: EPS fourth quarter 2024 Accenture[0m

  results = retriever.get_relevant_documents(query)



Observation: [36;1m[1;3mThe EPS for the fourth quarter of 2024 for Accenture is expected to be in the range of 8 to 10 percent growth in local currency over fiscal 2023.[0m
Thought:[32;1m[1;3mI now know the final answer
Final Answer: The EPS in the fourth quarter of 2024 for Accenture is expected to be in the range of 8 to 10 percent growth in local currency over fiscal 2023.[0m

[1m> Finished chain.[0m

Agent (using VectorDB):
Question: What is the EPS in fourth quarter 2024?
Answer: The EPS in the fourth quarter of 2024 for Accenture is expected to be in the range of 8 to 10 percent growth in local currency over fiscal 2023.

Agent: Do you have more questions?


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use the VectorStore Retrieval tool to find the total number of employees in Accenture.
Action: VectorStore Retrieval
Action Input: Total number of employees in Accenture[0m
Observation: [36;1m[1;3mApproximately 733,000 employees in Accenture.[0m


In [38]:
def display_chat_history():
    """Display the current chat history."""
    for message in chat_history:
        role = "User" if isinstance(message, HumanMessage) else "AI"
        print(f"{role}: {message.content}\n")
display_chat_history()


User: What is the revenue of Accenture in the fourth quarter of 2024?

User: What is the revenue of Accenture in the fourth quarter of 2024?

AI: In the fourth quarter of 2024, Accenture's revenue was $65 billion.

User: How does this compare to the third quarter in terms of revenue?

User: How does this compare to the third quarter in terms of revenue?

AI: In the third quarter, Accenture's revenue was $16.4 billion, showing a 5% increase in local currency.

User: what was the trend over the last 3 quarters on EPS.. give me the numbers?

User: what was the trend over the last 3 quarters on EPS.. give me the numbers?

AI: The trend for Accenture's EPS over the last 3 quarters has shown consistent growth, with adjusted EPS of $3.19 in the most recent quarter, $11.67 in the quarter before that, and positive growth percentages.

User: how big is the GenAI project in 2024?

User: how big is the GenAI project in 2024?

AI: The GenAI project in 2024 is expected to be over $450 million in gen

In [None]:
# GRADIO only with TEXT - REFINED! Works Perfect!

import json
import openai
import gradio as gr

def gradio_chatbot(user_query):
    # Determine the source using the routing function
    datasource = route_query(user_query)
    
    # Tag source based on the datasource returned
    source_label = "VectorDB" if datasource == "vectorstore" else "Google Search" if datasource == "websearch" else "Unknown Source"
    
    try:
        # Get response from the ReAct agent
        response = react_agent({"input": user_query})

        # Check if the response is not empty and parse the response JSON if possible
        if response and "output" in response:
            output_content = response["output"]
            
            # Attempt to parse JSON response if available
            try:
                parsed_response = json.loads(output_content) if isinstance(output_content, str) else output_content
                answer = parsed_response.get("answer", output_content) if isinstance(parsed_response, dict) else output_content
            except json.JSONDecodeError:
                answer = output_content  # Fallback to raw output if JSON parsing fails

            return f"Answer (from {source_label}):\n{answer}"

        else:
            return "Error: Received an empty or unexpected response format from the agent."

    except Exception as e:
        # Capture any other errors
        return f"Error encountered: {str(e)}"

# Create the Gradio Interface
iface = gr.Interface(
    fn=gradio_chatbot,  # Function to call
    inputs="text",      # Input type: text
    outputs="text",     # Output type: text
    title="Earnings Q&A Chatbot",
    description="Ask any question about Accenture and get answers with sourced information."
)

# Launch the Gradio app
iface.launch(share=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://d50010d4b9a96855ac.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)






[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to find information about Accenture's revenue for the year 2024.
Action: VectorStore Retrieval
Action Input: revenue 2024[0m
Observation: [36;1m[1;3mFor fiscal year 2024, revenue is expected to be in the range of 1 to 3 percent growth in local currency, with an inorganic contribution approaching 3 percent. This growth is expected to lead to adjusted earnings per share in the range of $11.97 to $12.20, representing a 3 to 5 percent increase over fiscal 23 results. The company also plans to return at least $7.7 billion to shareholders through dividends and share repurchases.[0m
Thought:[32;1m[1;3mI need to calculate the total revenue based on the provided information.
Action: Calculate
Action Input: $11.97 billion (lower end) to $12.20 billion (upper end)[0m
Observation: Calculate is not a valid tool, try one of [VectorStore Retrieval, Google Search].
Thought:[32;1m[1;3mI can calculate the total revenue by takin

In [17]:
# GISKARD EVALUATION BELOW !!!

!pip install --default-timeout=100 "giskard[llm]"
from giskard.rag import KnowledgeBase, generate_testset, evaluate


Collecting giskard[llm]
  Downloading giskard-2.15.3-py3-none-any.whl.metadata (15 kB)
Collecting zstandard>=0.10.0 (from giskard[llm])
  Downloading zstandard-0.23.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting mlflow-skinny>=2 (from giskard[llm])
  Downloading mlflow_skinny-2.17.2-py3-none-any.whl.metadata (30 kB)
Collecting mixpanel>=4.4.0 (from giskard[llm])
  Downloading mixpanel-4.10.1-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting langdetect>=1.0.9 (from giskard[llm])
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting num2words>=0.5.13 (from giskard[llm])
  Downloading num2words-0.5.13-py3-none-any.whl.metadata (12 kB)
Collecting griffe<0.49.0,>=0.36.9 (from giskard[llm])
  Downloading griffe-0.48.0-py3-none-any.whl.metadata (5.1 kB)
Collecting evaluate>=0

2024-11-08 08:37:28.560682: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-08 08:37:28.560785: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-08 08:37:28.650193: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-08 08:37:28.829628: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
import pandas as pd

# Define the sample questions to test with `handle_query`
# Define the list of queries to retrieve answers for
queries = [
    "What is the revenue of Accenture in third quarter of 2024?",
    "What is the EPS of Accenture in fourth quarter of 2024?",
    "Who are the competitors of Accenture?",
    "What is the size of Genai project deals for Accenture in year 2024?",
    "what is the total employee count in Accenture today??"    
]

# Collect answers by running each query through the handle_query function
answers = [handle_query(query) for query in queries]

# Create a DataFrame with only the answers
df_giskard = pd.DataFrame(answers, columns=["text"])

# Display the DataFrame to verify
df_giskard.head()

Routing to Vectorstore...
Routing to Vectorstore...
Routing to Web Search...
Routing to Web Search...
Routing to Web Search...


Unnamed: 0,text
0,The revenue of Accenture in the third quarter ...
1,"Based on the documents provided, the EPS of Ac..."
2,"The competitors of Accenture include IBM, PwC,..."
3,Accenture booked $900 million in generative AI...
4,The total employee count in Accenture today is...


In [19]:
# Build KNOWLEDGEBASE for Giskard

kb_giskard = KnowledgeBase(df_giskard)

In [21]:
# Test Question using Giskard

test_questions = generate_testset(
    kb_giskard,
    num_questions=10,
    agent_description="Accenture Q&A chat assistant related to Quarterly Earnings, and general information like competition",
)

Generating questions:   0%|          | 0/10 [00:00<?, ?it/s]

In [22]:
df_test_questions = test_questions.to_pandas()
df_test_questions.head()

Unnamed: 0_level_0,question,reference_answer,reference_context,conversation_history,metadata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
c4dbc5b9-1645-488b-872f-05689893cf7e,What was the total amount of business Accentur...,Accenture booked $2 billion in generative AI b...,Document 3: Accenture booked $900 million in g...,[],"{'question_type': 'simple', 'seed_document_id'..."
20170301-7d0b-43b0-90a1-04a92746885d,What is the expected revenue range for Accentu...,The revenue of Accenture in the third quarter ...,"Document 1: Based on the documents provided, t...",[],"{'question_type': 'simple', 'seed_document_id'..."
9da1ef48-bc5a-4fed-9059-12874e742006,Could you specify the projected revenue range ...,The revenue of Accenture in the third quarter ...,"Document 1: Based on the documents provided, t...",[],"{'question_type': 'complex', 'seed_document_id..."
01ab52e1-73c8-4554-814e-8883f249177c,What is the present total number of employees ...,The total employee count in Accenture today is...,Document 4: The total employee count in Accent...,[],"{'question_type': 'complex', 'seed_document_id..."
bbfe65de-bb3f-4d3e-b922-e90d87c6df69,Considering the $900 million booked in generat...,The revenue of Accenture in the third quarter ...,"Document 1: Based on the documents provided, t...",[],"{'question_type': 'distracting element', 'seed..."


In [23]:
def use_agent(question, history=None):
    # Call the react_agent with question input
    try:
        response = react_agent.invoke({"input": question})  # `invoke` ensures it’s a direct LLM call
        return response["output"]
    except Exception as e:
        print(f"Error during agent invocation: {e}")
        return "Unable to process the question at this time."

In [24]:
report = evaluate(use_agent, testset=test_questions, knowledge_base=kb_giskard)

Asking questions to the agent:   0%|          | 0/10 [00:00<?, ?it/s]



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use VectorStore Retrieval to find information about Accenture's business in generative AI during the fiscal year 2024.
Action: VectorStore Retrieval
Action Input: Accenture fiscal year 2024 generative AI business[0m
Observation: [36;1m[1;3mAccenture is investing heavily in generative AI technology for FY24, with significant sales growth and a focus on helping clients realize value at scale. They have sold over 100 generative AI projects in the last four months and are working with various industries to improve customer service, simplify operations, and drive audience engagement. Additionally, Accenture is expanding their workforce and investing in AI to accelerate their clients' reinvention journey.[0m
Thought:[32;1m[1;3mThere is no specific mention of the total amount booked in generative AI for fiscal year 2024 in the retrieved information. I should try to rephrase the question and search for more specific de

CorrectnessMetric evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

In [25]:
# Summary with the results.
report.correctness_by_question_type()

Unnamed: 0_level_0,correctness
question_type,Unnamed: 1_level_1
complex,0.0
conversational,0.0
distracting element,0.0
double,1.0
simple,0.0
situational,0.5


In [26]:
# Obtaining the incorrect answers
failures = report.get_failures()[:15]
failures

Unnamed: 0_level_0,question,reference_answer,reference_context,conversation_history,metadata,agent_answer,correctness,correctness_reason
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
c4dbc5b9-1645-488b-872f-05689893cf7e,What was the total amount of business Accentur...,Accenture booked $2 billion in generative AI b...,Document 3: Accenture booked $900 million in g...,[],"{'question_type': 'simple', 'seed_document_id'...",$900 million,False,The agent stated that Accenture booked $900 mi...
20170301-7d0b-43b0-90a1-04a92746885d,What is the expected revenue range for Accentu...,The revenue of Accenture in the third quarter ...,"Document 1: Based on the documents provided, t...",[],"{'question_type': 'simple', 'seed_document_id'...",Accenture's expected revenue range for the thi...,False,The agent stated that Accenture's expected rev...
9da1ef48-bc5a-4fed-9059-12874e742006,Could you specify the projected revenue range ...,The revenue of Accenture in the third quarter ...,"Document 1: Based on the documents provided, t...",[],"{'question_type': 'complex', 'seed_document_id...",The projected revenue range for Accenture duri...,False,The agent stated that the projected revenue ra...
01ab52e1-73c8-4554-814e-8883f249177c,What is the present total number of employees ...,The total employee count in Accenture today is...,Document 4: The total employee count in Accent...,[],"{'question_type': 'complex', 'seed_document_id...","Accenture has approximately 733,000 employees ...",False,The agent stated that Accenture has approximat...
bbfe65de-bb3f-4d3e-b922-e90d87c6df69,Considering the $900 million booked in generat...,The revenue of Accenture in the third quarter ...,"Document 1: Based on the documents provided, t...",[],"{'question_type': 'distracting element', 'seed...",The expected revenue range for Accenture in Q3...,False,The agent stated that the expected revenue ran...
8e415d4f-310a-4710-9b4f-f03d9ebafe61,Given that the documents only provide informat...,The revenue of Accenture in the third quarter ...,Document 0: The revenue of Accenture in the th...,[],"{'question_type': 'distracting element', 'seed...","For Q3 2024, Accenture is expected to have rev...",False,The agent stated that the expected revenue ran...
78868c2e-eaac-4d69-82a9-1a6d7fedede9,As an investor looking to allocate my funds wi...,"The competitors of Accenture include IBM, PwC,...",Document 2: The competitors of Accenture inclu...,[],"{'question_type': 'situational', 'seed_documen...",Some of Accenture's competitors in the market ...,False,The agent missed out on mentioning KPMG and Er...
63296cbe-b38a-45f5-8967-f1bb571062eb,What is it?,The revenue of Accenture in the third quarter ...,"Document 1: Based on the documents provided, t...","[{'role': 'user', 'content': 'I'm interested i...","{'question_type': 'conversational', 'seed_docu...",Accenture is a global professional services co...,False,The agent provided a description of Accenture ...


In [27]:
# Print all values in 'question' and 'agent_answer' columns
for i, (question, agent_answer) in enumerate(zip(failures['question'], failures['agent_answer'])):
    print(f"Question {i+1}: {question}")
    print(f"Agent Answer {i+1}: {agent_answer}\n")

Question 1: What was the total amount of business Accenture booked in generative AI during the fiscal year 2024?
Agent Answer 1: $900 million

Question 2: What is the expected revenue range for Accenture in the third quarter of 2024?
Agent Answer 2: Accenture's expected revenue range for the third quarter of 2024 is between 15.75 billion to 16.35 billion.

Question 3: Could you specify the projected revenue range for Accenture during the third quarter of the fiscal year 2024?
Agent Answer 3: The projected revenue range for Accenture during the third quarter of the fiscal year 2024 is estimated to be between 16.1 billion to 16.7 billion, with an expected growth of 3 to 7 percent in local currency.

Question 4: What is the present total number of employees working for Accenture globally?
Agent Answer 4: Accenture has approximately 733,000 employees globally as of the end of FY23.

Question 5: Considering the $900 million booked in generative AI business during Q3 2024, what is the expect

In [None]:
# Giskard explains the reasons why it considers the answers to be incorrect.
#failures['correctness_reason'].iloc[1]
for i, reason in enumerate(failures['correctness_reason']):
    print(f"Failure {i+1}: {reason}\n")

Failure 1: The agent stated that Accenture booked $900 million in generative AI business during the fiscal year 2024, but should have said $2 billion.

Failure 2: The agent stated that Accenture's expected revenue range for the third quarter of 2024 is between 15.75 billion to 16.35 billion, but should have said it is between 15.85 billion to 16.45 billion.

Failure 3: The agent stated that the projected revenue range for Accenture during the third quarter of the fiscal year 2024 is estimated to be between 16.1 billion to 16.7 billion, but the reference answer indicates that it should be between 15.85 to 16.45 billion.

Failure 4: The agent stated that Accenture has approximately 733,000 employees globally, but the correct number is 774,000.

Failure 5: The agent stated that the expected revenue range for Accenture in Q3 2024 is between $16.1 billion and $16.7 billion, but the correct range should be between $15.85 billion and $16.45 billion.

Failure 6: The agent stated that the expec



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use VectorStore Retrieval to find information about Accenture's revenue in the third quarter of 2024.
Action: VectorStore Retrieval
Action Input: Accenture revenue Q3 2024[0m
Observation: [36;1m[1;3mThe revenue for Accenture in Q3 2024 is expected to be in the range of 16.1 billion to 16.7 billion, reflecting an estimated 3 to 7 percent growth in local currency.[0m
Thought:[32;1m[1;3mI have the information about Accenture's revenue in the third quarter of 2024.
Final Answer: The revenue for Accenture in Q3 2024 is estimated to be between 16.1 billion to 16.7 billion, with a growth of 3 to 7 percent in local currency.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to find specific revenue information for Accenture in the third and second quarters of 2024.
Action: VectorStore Retrieval
Action Input: revenue increase in third quarter compared to second quarter in