In [2]:
""" !pip install pytube
!pip install openai-whisper
!pip install langchain
!pip install openai
!pip install python-dotenv """

' !pip install pytube\n!pip install openai-whisper\n!pip install langchain\n!pip install openai\n!pip install python-dotenv '

In [4]:
""" !pip install -U langchain-community
!pip install pinecone-client
!pip install --upgrade pytube
!pip install yt-dlp
!pip install imageio[ffmpeg] """

' !pip install -U langchain-community\n!pip install pinecone-client\n!pip install --upgrade pytube\n!pip install yt-dlp\n!pip install imageio[ffmpeg] '

In [5]:
from torch import cuda, torch
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

In [None]:
from dotenv import load_dotenv, find_dotenv
import getpass
import os

# Load Environment Variable
_ = load_dotenv(find_dotenv())

# Set the environment variable directly in the script
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OpenAI API Key: ")

In [8]:
# Access the environment variable with os.environ
# print(os.environ["OPENAI_API_KEY"])

In [10]:
""" !pip install langsmith """

' !pip install langsmith '

In [11]:
# Import Langsmith (make sure it's installed first)
import langsmith  # Importing Langsmith's library
from langsmith import traceable
from langsmith.wrappers import wrap_openai
from langsmith import Client
from langsmith.evaluation import evaluate
from langsmith.schemas import Example, Run
import getpass
import os

In [12]:
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "prj-yt-chatbot-qa"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("Enter LANGCHAIN_API_KEY")        # Authenticate with Langsmith

In [13]:
# Sunday 3-Nov - FINAL!
# STEP 1: Transcription and Consolidation Tasks (without agent to improve the process and reduce the time)

import os
import yt_dlp
from whisper import load_model
from langchain.agents import initialize_agent, Tool
from langchain.schema import HumanMessage
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI

# Default configurations
VIDEO_LINKS_FILE = "video_links.txt"
AUDIO_OUTPUT_TEMPLATE = "audio_{}"
WHISPER_MODEL = "base"
TRANSCRIPT_DIR = "transcriptions"

# Ensure directory exists for transcriptions
os.makedirs(TRANSCRIPT_DIR, exist_ok=True)

# Load YouTube links from the file
def load_video_links(filename=VIDEO_LINKS_FILE):
    with open(filename, 'r') as file:
        return [link.strip() for link in file.readlines() if link.strip()]

# Download audio from YouTube video
def download_audio(url, output_name):
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3', 'preferredquality': '192'}],
        'outtmpl': f"{output_name}",
        'ffmpeg_location': '/usr/bin/ffmpeg'
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return f"{output_name}.mp3"

# Transcribe audio using Whisper
def transcribe_audio_with_whisper(audio_path, model_name=WHISPER_MODEL):
    model = load_model(model_name)
    result = model.transcribe(audio_path)
    return result["text"]

# Process each video, saving individual transcriptions
def transcribe_all_videos():
    video_links = load_video_links(VIDEO_LINKS_FILE)
    all_transcriptions = []

    for i, video_url in enumerate(video_links):
        print(f"Processing video {i+1}/{len(video_links)}: {video_url}")
        audio_path = download_audio(video_url, AUDIO_OUTPUT_TEMPLATE.format(i))
        
        # Transcribe in chunks to prevent reprocessing, saving each transcription file
        transcription = transcribe_audio_with_whisper(audio_path)
        transcript_file = os.path.join(TRANSCRIPT_DIR, f"transcription_{i+1}.txt")
        
        with open(transcript_file, 'w') as f:
            f.write(transcription)
        
        all_transcriptions.append({"title": f"Video {i+1}", "transcription": transcription})
        print(f"Completed transcription for video {i+1}")

    return all_transcriptions

# Consolidate transcriptions for LLM analysis
def consolidate_transcriptions():
    consolidated_text = ""
    for file in sorted(os.listdir(TRANSCRIPT_DIR)):
        file_path = os.path.join(TRANSCRIPT_DIR, file)
        
        # Check if the file_path is a file (not a directory)
        if os.path.isfile(file_path):
            with open(file_path, 'r') as f:
                consolidated_text += f.read() + "\n"
    return consolidated_text


# Execute process
if __name__ == "__main__":
    transcribe_all_videos()
    consolidated_text = consolidate_transcriptions()
    print("Consolidation Complete. Transcriptions are ready for vectorization and further processing.")

Processing video 1/8: https://www.youtube.com/watch?v=-RIfno4Sp0g
[youtube] Extracting URL: https://www.youtube.com/watch?v=-RIfno4Sp0g
[youtube] -RIfno4Sp0g: Downloading webpage
[youtube] -RIfno4Sp0g: Downloading ios player API JSON
[youtube] -RIfno4Sp0g: Downloading mweb player API JSON
[youtube] -RIfno4Sp0g: Downloading player dad5a960
[youtube] -RIfno4Sp0g: Downloading m3u8 information
[info] -RIfno4Sp0g: Downloading 1 format(s): 251
[download] Destination: audio_0
[download] 100% of   48.70MiB in 00:00:02 at 19.60MiB/s  
[ExtractAudio] Destination: audio_0.mp3
Deleting original file audio_0 (pass -k to keep)


100%|███████████████████████████████████████| 139M/139M [00:02<00:00, 62.8MiB/s]


Completed transcription for video 1
Processing video 2/8: https://www.youtube.com/watch?v=3SCOVB1KDFc
[youtube] Extracting URL: https://www.youtube.com/watch?v=3SCOVB1KDFc
[youtube] 3SCOVB1KDFc: Downloading webpage
[youtube] 3SCOVB1KDFc: Downloading ios player API JSON
[youtube] 3SCOVB1KDFc: Downloading mweb player API JSON
[youtube] 3SCOVB1KDFc: Downloading player 81ef9024
[youtube] 3SCOVB1KDFc: Downloading m3u8 information
[info] 3SCOVB1KDFc: Downloading 1 format(s): 251
[download] Destination: audio_1
[download] 100% of   54.16MiB in 00:00:05 at 9.54MiB/s     
[ExtractAudio] Destination: audio_1.mp3
Deleting original file audio_1 (pass -k to keep)
Completed transcription for video 2
Processing video 3/8: https://www.youtube.com/watch?v=qMqP2IwhSnA
[youtube] Extracting URL: https://www.youtube.com/watch?v=qMqP2IwhSnA
[youtube] qMqP2IwhSnA: Downloading webpage
[youtube] qMqP2IwhSnA: Downloading ios player API JSON
[youtube] qMqP2IwhSnA: Downloading mweb player API JSON
[youtube] qMqP2

In [14]:
# Step 2: Chunk the consolidated text and Vectorization!

from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

# Split consolidated text into manageable chunks
def split_text_for_vectorization(consolidated_text, max_tokens=1500):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=max_tokens, chunk_overlap=200)
    return text_splitter.split_text(consolidated_text)

# Function to perform vectorization on each chunk
def perform_vectorization_task(consolidated_text):
    chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.getenv("OPENAI_API_KEY"))
    memory = ConversationBufferMemory()
    
    # Initialize the embedding model and the agent with vectorization tool
    embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")
    chunks = split_text_for_vectorization(consolidated_text)
    
    vectorized_segments = []
    for i, chunk in enumerate(chunks):
        # print(f"Vectorizing chunk {i+1}/{len(chunks)}")
        
        # Generate embeddings for this chunk
        embeddings = embedding_model.embed_documents([chunk])
        
        # Store each embedding
        vectorized_segments.append({
            "id": f"segment_{i}",
            "text": chunk,
            "embedding": embeddings[0]  # embedding_model.embed_documents returns a list
        })

    print("Vectorization complete. Ready for storage in a vector database.")
    return vectorized_segments

# Run vectorization task after consolidation
if __name__ == "__main__":    
    vectorized_segments = perform_vectorization_task(consolidated_text)


  chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.getenv("OPENAI_API_KEY"))
  memory = ConversationBufferMemory()
  embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")


Vectorization complete. Ready for storage in a vector database.


In [15]:
# STEP 3 - define Pinecone as Vector DB & Define Index & Embeddings.

from pinecone import Pinecone
import os
from getpass import getpass

# Securely fetch Pinecone API keys                    
os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY") or getpass("Enter Pinecone API key: ")

In [17]:
#print(os.environ["PINECONE_API_KEY"])

In [18]:
from pinecone import ServerlessSpec

spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

In [20]:
# STEP 4: Create INDEX using EMBEDDINGS OpenAI-ada-002 - creating the Index for 'Accenture Earnings'

import os
import time
from getpass import getpass
from tqdm.auto import tqdm
from langchain.embeddings import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec

# Initialize OpenAI embeddings model
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=os.environ["OPENAI_API_KEY"])

# Initialize Pinecone client with the updated API
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
#spec = ServerlessSpec(cloud="aws", region="us-west-1")  # Specify Pinecone environment
index_name = "transcription-accnt-earn-index-2years"             # Create New Index

# Check if the index exists; create if it doesn’t
if index_name not in [index_info["name"] for index_info in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=1536,  # OpenAI embedding dimension
        metric="dotproduct",
        spec=spec
    )
    # Wait for the index to be ready
    while not pc.describe_index(index_name).status["ready"]:
        print("Waiting for index to be ready...")
        time.sleep(1)

# Connect to the index for upsertion
index = pc.Index(index_name)
time.sleep(1)
# view index stats
print("Index statistics:", index.describe_index_stats())

Index statistics: {'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [21]:
# STEP 5 - UPSERT embeddings to Pinecone in batches

batch_size = 50
for i in tqdm(range(0, len(vectorized_segments), batch_size)):
    batch = vectorized_segments[i:i + batch_size]
    ids = [segment["id"] for segment in batch]
    embeds = [segment["embedding"] for segment in batch]
    metadatas = [{"text": segment["text"]} for segment in batch]

    # Upsert each batch to Pinecone
    try:
        index.upsert(vectors=list(zip(ids, embeds, metadatas)))
    except Exception as e:
        print(f"Error upserting batch {i // batch_size + 1}: {e}")

print("All segments upserted into Pinecone.")

  0%|          | 0/7 [00:00<?, ?it/s]

All segments upserted into Pinecone.


In [22]:
print("Index statistics:", index.describe_index_stats())

Index statistics: {'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 349}},
 'total_vector_count': 349}


In [24]:
#!pip install langchain-community

In [None]:
# Define ROUTER for query to be passed to VectorDB or Websearch (google search) -  Example to demonstrate

""" import json
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage

# Initialize LLM in JSON mode (replace 'gpt-3.5-turbo' with any model configured for JSON output)
llm_json_mode = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])

# Define the routing instructions
router_instructions = """You are an expert at routing a user question to a vectorstore or web search.

The vectorstore contains documents related to Accenture company Earnings results, Revenue, Earnings for last four quarters, sales, cloud projects, Genai projects and employee details.

Use the vectorstore for questions on these topics. For all else, and especially for current events, use web-search.

Return JSON with a single key, "datasource", that is either 'websearch' or 'vectorstore' depending on the question."""

# Function to route based on the instructions
def route_query(query):
    routing_response = llm_json_mode.invoke(
        [SystemMessage(content=router_instructions), HumanMessage(content=query)]
    )
    route_decision = json.loads(routing_response.content)
    return route_decision["datasource"]

# Example queries to test the router
test_queries = [
    "What is the revenue of Accenture last quarter?",
    "What is the EPS for last quarter?",
    "What is the total no of employee count in Accenture company?",
    "Who are the competitors of Accenture?",
]

# Execute the router on test queries
for query in test_queries:
    datasource = route_query(query)
    print(f"Query: {query}\nRouted to: {datasource}\n") """


Query: What is the revenue of Accenture last quarter?
Routed to: vectorstore

Query: What is the EPS for last quarter?
Routed to: vectorstore

Query: What is the total no of employee count in Accenture company?
Routed to: vectorstore

Query: Who are the competitors of Accenture?
Routed to: websearch



In [26]:
#!pip install google-search-results

## !pip install tavily-python (not needed)

In [27]:
import os
import getpass

#os.environ["TAVILY_API_KEY"] = os.getenv("TAVILY_API_KEY") or getpass.getpass("Enter Tavily API key: ")

# Fetch the SerpAPI key securely
os.environ["SERPAPI_API_KEY"] = os.getenv("SERPAPI_API_KEY") or getpass.getpass("Enter SerpAPI key: ")

# Example usage in SerpAPI tool setup
serp_api_key = os.environ["SERPAPI_API_KEY"]

In [29]:
#print(os.environ["SERPAPI_API_KEY"])

In [30]:
# DEFINE ROUTER, RETRIEVE from VectorstoreDB and Retrieve from Google search functions!

import json
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
from serpapi import GoogleSearch

import os

# Initialize the embedding model and vector store
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=os.environ["OPENAI_API_KEY"])
chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])

#from langchain_community.tools.tavily_search import TavilySearchResults
# web_search_tool = TavilySearchResults(k=3)

# Assuming Pinecone index is already set up and embeddings are upserted
vector_store = Pinecone(index=index, embedding=embedding_model, text_key="text")

# Initialize the router instructions
router_instructions = """You are an expert at routing a user question to a vectorstore or web search.

The vectorstore contains documents related to Accenture company Quarterly results, Revenue, Earnings for the last four quarters, sales, digital projects, cloud projects and employee details.

Use the vectorstore for questions on these topics. For all else, and especially for current events, use web-search.

Return JSON with a single key, "datasource", that is either 'websearch' or 'vectorstore' depending on the question."""

# Function to route query based on instructions
def route_query(query):
    routing_response = chat_model.invoke(
        [SystemMessage(content=router_instructions), HumanMessage(content=query)]
    )
    route_decision = json.loads(routing_response.content)
    return route_decision["datasource"]

# Function to retrieve from vector database
def retrieve_from_vectorstore(query):
    retriever = vector_store.as_retriever()
    results = retriever.get_relevant_documents(query)
    document_text = "\n\n".join([doc.page_content for doc in results])

    # Summarize the retrieved documents using LLM 
    prompt = (
        "Based on the following documents, provide a concise answer to the question:\n\n"
        f"Question: {query}\n\n"
        "Documents:\n"
        f"{document_text}\n\n"
        "Answer:"
    )
    
    answer_response = chat_model.invoke([HumanMessage(content=prompt)])
    return answer_response.content
    # return "\n\n".join([doc.page_content for doc in results]) # can be removed if this modules functions right!

# Function to retrieve from Google search using SerpaAPI
def web_search(query):
    search = GoogleSearch({
        "q": query,
        "api_key": serp_api_key, 
        "num": 3  # Limit to top 3 results for conciseness
    })
    results = search.get_dict()

    # Extract relevant text from search results
    combined_text = "\n".join(
        result.get("snippet", "No snippet available")
        for result in results.get("organic_results", [])
    )
    
    # Summarize results using the LLM
    prompt = (
        f"Summarize the following search results to answer the question:\n\n"
        f"Question: {query}\n\n"
        f"Results:\n{combined_text}\n\n"
        "Answer:"
    )
    summary_response = chat_model.invoke([HumanMessage(content=prompt)])
    return summary_response.content


# Function to perform web search (e.g., using Tavily - NOT USING IT)
""" def web_search(query):
    # Fetch results using Tavily
    results = web_search_tool.run(query)
    print("Results:", results)  # Inspect the structure
    combined_text = "\n".join(result.get('text', 'No text available') for result in results if 'text' in result)

    # Summarize results if multiple documents are returned
    prompt = (
        f"Summarize the following web search results to answer the question:\n\n"
        f"Question: {query}\n\n"
        f"Results:\n{combined_text}\n\n"
        "Answer:"
    )
    summary_response = chat_model.invoke([HumanMessage(content=prompt)])
    return summary_response.content """
    

# Main function to handle the query routing and retrieval
def handle_query(query):
    datasource = route_query(query)
    
    if datasource == "vectorstore":
        print("Routing to Vectorstore...")
        return retrieve_from_vectorstore(query)
    elif datasource == "websearch":
        print("Routing to Web Search...")
        return web_search(query)
    else:
        return "Unknown data source. Please check the router configuration."

# Example usage
query = "What is the revenue of Accenture last quarter??"
response = handle_query(query)
print("Response:", response)

query = "What is the EPS of Accenture last quarter?"
response = handle_query(query)
print("Response:", response)

query = "What is the change in Revenue between Fourth and Third quarters?"
response = handle_query(query)
print("Response:", response)

query = "Who is the main competitor of Accenture?"
response = handle_query(query)
print("Response:", response)


  vector_store = Pinecone(index=index, embedding=embedding_model, text_key="text")


Routing to Vectorstore...


  results = retriever.get_relevant_documents(query)


Response: Accenture's revenue last quarter was $16.4 billion.
Routing to Vectorstore...
Response: The EPS of Accenture last quarter was $3.19.
Routing to Vectorstore...
Response: The change in Revenue between Fourth and Third quarters was an increase.
Routing to Web Search...
Response: The main competitors of Accenture in the consulting industry include IBM, PwC, Tata Consultancy Services (TCS), Capgemini, Deloitte, Cognizant, SAP, DXC Technology, McKinsey & Co., Boston Consulting Group, Inc., and Bain & Co.


In [37]:
""" # Control the recursive depth to avoid Web_search error

import sys
sys.setrecursionlimit(2000)  # Limit to a reasonable depth """


' # Control the recursive depth to avoid Web_search error\n\nimport sys\nsys.setrecursionlimit(2000)  # Limit to a reasonable depth '

In [31]:
# DEFINE ALL THE TOOLS for ReACT agent! No CHATBOT YET!

from langchain.agents import initialize_agent
from langchain.agents import Tool
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
import os
from langchain.agents import initialize_agent
from serpapi import GoogleSearch

# Define the Web search tool using SerpAPI
web_search_tool = Tool(
    name="Google Search",
    func=web_search,
    description="Retrieve relevant information using google web search for general or current event questions."
)

# Define tool for vector store retrieval
vector_store_tool = Tool(
    name="VectorStore Retrieval",
    func=retrieve_from_vectorstore,
    description="Retrieve relevant documents from the vector store for questions about Accenture's earnings, revenue, sales, projects, and employee details.",
)

""" # Define tool for web search (Tavily - not Using it!)
web_search_tool = Tool(
    name="Web Search Retrieval",
    func=web_search,
    description="Retrieve information using Tavily web search for general or current event questions not covered in the vector store, particularly about competitors or industry updates.",
)
 """
# Initialize the ReAct agent with both tools
react_agent = initialize_agent(
    tools=[vector_store_tool, web_search_tool],
    agent="zero-shot-react-description",
    llm=chat_model,
    verbose=True,
    max_iterations=3  # Restrict maximum depth to avoid infinite loops
)

# Main function to handle the query using the ReAct agent
def handle_query_with_agent(query):
    # Call the ReAct agent with the query; it will decide which tool to use
    response = react_agent({"input": query})
    return response["output"]

# Example usage
query = "What is the revenue of Accenture fourth quarter of 2024?"
response = handle_query_with_agent(query)
print("Response:", response)

query = "What is the size of Genai project deals for Accenture in year 2024??"
response = handle_query_with_agent(query)
print("Response:", response)

query = "What is the EPS of Accenture in third quarter 2023?"
response = handle_query_with_agent(query)
print("Response:", response)


  react_agent = initialize_agent(
  response = react_agent({"input": query})




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should start by using the VectorStore Retrieval tool to see if there is specific information on Accenture's revenue for the fourth quarter of 2024. If I don't find anything, I can use Google Search to look for recent financial reports or news articles.
Action: VectorStore Retrieval
Action Input: Accenture revenue Q4 2024[0m
Observation: [36;1m[1;3mThe revenue for Accenture in Q4 2024 is $65 billion, representing 2% growth in local currency.[0m
Thought:[32;1m[1;3mI should also verify this information by using Google Search to see if there are any recent news articles or official reports confirming this revenue figure.
Action: Google Search
Action Input: Accenture Q4 2024 revenue[0m
Observation: [33;1m[1;3mAccenture's Q4 2024 revenue was $16.41 billion.[0m
Thought:[32;1m[1;3mThe information from the VectorStore Retrieval seems to be inaccurate. I should rely on the Google Search result for the most up-to-date reve

In [32]:
# INTERACTIVE CHAT FUNCTION using ReACT Agent - REFINED 3! Works perfect! TUESDAY

import json
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from langchain.memory import ConversationBufferMemory

# Initialize chat model and memory
chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Interactive chat function with datasource-based source tagging
def chatbot_interaction():
    print("Welcome to the Accenture Q&A Chatbot! Ask any question about Accenture.")
    while True:
        user_query = input("You: ")
        if user_query.lower() in ["exit", "quit", "no"]:
            print("Agent: Thank you for using the Q&A chatbot. Goodbye!")
            break
        
        # Determine the datasource
        datasource = route_query(user_query)
        
        # Tag source based on datasource
        source_label = "VectorDB" if datasource == "vectorstore" else "Google Search" if datasource == "websearch" else "Unknown Source"
        
        try:
            # Get response from ReAct agent
            response = react_agent({"input": user_query})
            
            # Parse and display response with source tag
            if "output" in response and isinstance(response["output"], str):
                try:
                    parsed_response = json.loads(response["output"])
                    answer = parsed_response.get("answer", response["output"])
                    print(f"\nAgent (using {source_label}):\nQuestion: {user_query}\nAnswer: {answer}")
                except json.JSONDecodeError:
                    print("Agent: Error parsing response JSON, displaying raw answer instead.")
                    print(f"Agent (using {source_label}): {response['output']}")
            else:
                print("Agent: Unable to process the request in the expected format. Please try again.")
        
        except Exception as e:
            print(f"Error encountered: {e}")
            print("Agent: Unable to process the request. Please try again.")
        
        print("\nAgent: Do you have more questions?")

# Start the chatbot interaction
chatbot_interaction()



Welcome to the Accenture Q&A Chatbot! Ask any question about Accenture.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use the VectorStore Retrieval tool to find information on Accenture's earnings.
Action: VectorStore Retrieval
Action Input: Accenture EPS second quarter 2024[0m
Observation: [36;1m[1;3mIn the second quarter of fiscal 2024, Accenture reported adjusted earnings per share of $11.67, reflecting a 9% growth over the previous year. The company also exceeded its original guidance for capital allocation by returning $7.2 billion of cash to shareholders and investing approximately $2.5 billion across 25 acquisitions. Additionally, Accenture remains committed to delivering shareholder value and creating value for all stakeholders.[0m
Thought:[32;1m[1;3mI now know the final answer
Final Answer: In the second quarter of 2024, Accenture reported adjusted earnings per share of $11.67.[0m

[1m> Finished chain.[0m
Agent: Error parsing response JSON, dis

In [35]:
#!pip install gradio

In [None]:
# GRADIO only with TEXT - REFINED! Works Perfect!

import json
import openai
import gradio as gr

def gradio_chatbot(user_query):
    # Determine the source using the routing function
    datasource = route_query(user_query)
    
    # Tag source based on the datasource returned
    source_label = "VectorDB" if datasource == "vectorstore" else "Google Search" if datasource == "websearch" else "Unknown Source"
    
    try:
        # Get response from the ReAct agent
        response = react_agent({"input": user_query})

        # Check if the response is not empty and parse the response JSON if possible
        if response and "output" in response:
            output_content = response["output"]
            
            # Attempt to parse JSON response if available
            try:
                parsed_response = json.loads(output_content) if isinstance(output_content, str) else output_content
                answer = parsed_response.get("answer", output_content) if isinstance(parsed_response, dict) else output_content
            except json.JSONDecodeError:
                answer = output_content  # Fallback to raw output if JSON parsing fails

            return f"Answer (from {source_label}):\n{answer}"

        else:
            return "Error: Received an empty or unexpected response format from the agent."

    except Exception as e:
        # Capture any other errors
        return f"Error encountered: {str(e)}"

# Create the Gradio Interface
iface = gr.Interface(
    fn=gradio_chatbot,  # Function to call
    inputs="text",      # Input type: text
    outputs="text",     # Output type: text
    title="Accenture Q&A Chatbot",
    description="Ask any question about Accenture and get answers with sourced information."
)

# Launch the Gradio app
iface.launch(share=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://25b2cc2a541bd76cc9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)






[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to find specific financial information for Accenture's revenue in the second quarter of 2023.
Action: VectorStore Retrieval
Action Input: "Accenture revenue Q2 2023"[0m
Observation: [36;1m[1;3mIn Q2 2023, Accenture expects revenues to be in the range of 16.1 billion to 16.7 billion, with an estimated 3 to 7% growth in local currency. For the full fiscal year 23, Accenture expects revenue to be in the range of 8 to 10% growth in local currency, with an inorganic contribution of 2%.[0m
Thought:[32;1m[1;3mThe revenue in the second quarter of 2023 for Accenture is estimated to be in the range of 16.1 billion to 16.7 billion, with an estimated growth of 3 to 7% in local currency.
Final Answer: The revenue in the second quarter of 2023 for Accenture is estimated to be in the range of 16.1 billion to 16.7 billion.[0m

[1m> Finished chain.[0m


In [None]:
# GISKARD EVALUATION BELOW !!!

#!pip install --default-timeout=100 "giskard[llm]"
from giskard.rag import KnowledgeBase, generate_testset, evaluate


2024-11-05 21:32:12.617819: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-05 21:32:12.617915: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-05 21:32:12.619119: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-05 21:32:12.626330: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [40]:
import pandas as pd

# Define the sample questions to test with `handle_query`
# Define the list of queries to retrieve answers for
queries = [
    "What is the revenue of Accenture in fourth quarter of 2024?",
    "What is the EPS of Accenture in fourth quarter of 2024?",
    "Who is the main competitor of Accenture?",
    "What is the size of Genai project deals for Accenture in year 2024?",
    "How many employees does Accenture have worldwide?"
]

# Collect answers by running each query through the handle_query function
answers = [handle_query(query) for query in queries]

# Create a DataFrame with only the answers
df_giskard = pd.DataFrame(answers, columns=["text"])

# Display the DataFrame to verify
df_giskard.head()


Routing to Vectorstore...
Routing to Web Search...
Routing to Web Search...
Routing to Web Search...
Routing to Web Search...


Unnamed: 0,text
0,The revenue of Accenture in the fourth quarter...
1,The EPS of Accenture in the fourth quarter of ...
2,The main competitors of Accenture in the consu...
3,Accenture booked $900 million in generative AI...
4,"Accenture has approximately 774,000 employees ..."


In [41]:
# Build KNOWLEDGEBASE for Giskard

kb_giskard = KnowledgeBase(df_giskard)

In [43]:
# Test Question using Giskard

test_questions = generate_testset(
    kb_giskard,
    num_questions=15,
    agent_description="Accenture Q&A chat assistant related to Quarterly Earnings, and general information like competition.",
)

Generating questions:   0%|          | 0/15 [00:00<?, ?it/s]

In [44]:
df_test_questions = test_questions.to_pandas()
df_test_questions.head()

Unnamed: 0_level_0,question,reference_answer,reference_context,conversation_history,metadata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
d4a08da1-e913-47b1-9bf4-f6ba5ca90504,Who are the main competitors of Accenture in t...,The main competitors of Accenture in the consu...,Document 2: The main competitors of Accenture ...,[],"{'question_type': 'simple', 'seed_document_id'..."
ad126300-2c0e-4fef-bcdd-27a539d7f4f6,What was the EPS of Accenture in the fourth qu...,The EPS of Accenture in the fourth quarter of ...,Document 1: The EPS of Accenture in the fourth...,[],"{'question_type': 'simple', 'seed_document_id'..."
faeb8cd3-8366-4673-81d7-c8bea9b4804c,How much did Accenture book in generative AI b...,Accenture booked $900 million in generative AI...,Document 3: Accenture booked $900 million in g...,[],"{'question_type': 'simple', 'seed_document_id'..."
05cdb12f-d525-4922-a74b-7e9fe7a2b2b2,Could you provide me with the Earnings Per Sha...,The EPS of Accenture in the fourth quarter of ...,Document 0: The revenue of Accenture in the fo...,[],"{'question_type': 'complex', 'seed_document_id..."
732e0592-6400-4d39-b611-812f7de43aa2,Could you provide me with the anticipated rang...,The expected revenue of Accenture in the fourt...,Document 0: The revenue of Accenture in the fo...,[],"{'question_type': 'complex', 'seed_document_id..."


In [45]:
def use_agent(question, history=None):
    response = react_agent({"input": question}) # Call the react_agent with question input
    return response["output"]

In [46]:
report = evaluate(use_agent, testset=test_questions, knowledge_base=kb_giskard)

Asking questions to the agent:   0%|          | 0/15 [00:00<?, ?it/s]



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use Google Search to find this information.
Action: Google Search
Action Input: "Accenture main competitors in consulting industry"[0m
Observation: [33;1m[1;3mAccenture's main competitors in the consulting industry include Infosys, IBM, PwC, Tata Consultancy Services (TCS), Capgemini, Deloitte, Cognizant, SAP, DXC Technology, McKinsey & Co., Boston Consulting Group, Inc., and Bain & Co.[0m
Thought:[32;1m[1;3mI now have the list of Accenture's main competitors in the consulting industry.
Final Answer: Accenture's main competitors in the consulting industry include Infosys, IBM, PwC, Tata Consultancy Services (TCS), Capgemini, Deloitte, Cognizant, SAP, DXC Technology, McKinsey & Co., Boston Consulting Group, Inc., and Bain & Co.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use VectorStore Retrieval to find information about Accenture's earnings.
Action: Vect

CorrectnessMetric evaluation:   0%|          | 0/15 [00:00<?, ?it/s]

In [47]:
# Summary with the results.
report.correctness_by_question_type()

Unnamed: 0_level_0,correctness
question_type,Unnamed: 1_level_1
complex,0.0
conversational,0.5
distracting element,0.0
double,0.0
simple,0.333333
situational,0.5


In [54]:
# Obtaining the incorrect answers
failures = report.get_failures()[:15]
failures

Unnamed: 0_level_0,question,reference_answer,reference_context,conversation_history,metadata,agent_answer,correctness,correctness_reason
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ad126300-2c0e-4fef-bcdd-27a539d7f4f6,What was the EPS of Accenture in the fourth qu...,The EPS of Accenture in the fourth quarter of ...,Document 1: The EPS of Accenture in the fourth...,[],"{'question_type': 'simple', 'seed_document_id'...",Agent stopped due to iteration limit or time l...,False,The agent did not provide any answer to the us...
faeb8cd3-8366-4673-81d7-c8bea9b4804c,How much did Accenture book in generative AI b...,Accenture booked $900 million in generative AI...,Document 3: Accenture booked $900 million in g...,[],"{'question_type': 'simple', 'seed_document_id'...",Accenture booked $3 billion in generative AI b...,False,The agent stated that Accenture booked $3 bill...
05cdb12f-d525-4922-a74b-7e9fe7a2b2b2,Could you provide me with the Earnings Per Sha...,The EPS of Accenture in the fourth quarter of ...,Document 0: The revenue of Accenture in the fo...,[],"{'question_type': 'complex', 'seed_document_id...",The EPS for Accenture during the final quarter...,False,The agent stated that the EPS for Accenture du...
732e0592-6400-4d39-b611-812f7de43aa2,Could you provide me with the anticipated rang...,The expected revenue of Accenture in the fourt...,Document 0: The revenue of Accenture in the fo...,[],"{'question_type': 'complex', 'seed_document_id...",Accenture is projected to generate revenue gro...,False,The agent provided a percentage growth in reve...
69b3e5c3-3b53-493c-bc76-47ab6c509dc2,What was the recorded revenue for Accenture's ...,Accenture booked $900 million in generative AI...,Document 3: Accenture booked $900 million in g...,[],"{'question_type': 'complex', 'seed_document_id...",The recorded revenue for Accenture's generativ...,False,The agent stated that the recorded revenue for...
7a48e5e2-d9ee-4f4a-8318-d4a9dc9689bd,What was the EPS of Accenture in the fourth qu...,The EPS of Accenture in the fourth quarter of ...,Document 0: The revenue of Accenture in the fo...,[],"{'question_type': 'distracting element', 'seed...",Accenture's EPS in the fourth quarter of 2024 ...,False,The agent stated that Accenture's EPS in the f...
3cca6140-0f19-40ee-8f36-258ad5275253,What was the EPS of Accenture in the fourth qu...,The EPS of Accenture in the fourth quarter of ...,Document 1: The EPS of Accenture in the fourth...,[],"{'question_type': 'distracting element', 'seed...",Agent stopped due to iteration limit or time l...,False,The agent did not provide any answer to the us...
84bbf30a-6729-4d13-8dbe-85b48d56939c,Considering the expected revenue range of $15....,The EPS of Accenture in the fourth quarter of ...,Document 1: The EPS of Accenture in the fourth...,[],"{'question_type': 'distracting element', 'seed...",Agent stopped due to iteration limit or time l...,False,The agent did not provide any answer to the us...
71d20f12-27b2-4080-a549-de474ea3755f,"As an investor, I'm trying to understand Accen...",The EPS of Accenture in the fourth quarter of ...,Document 1: The EPS of Accenture in the fourth...,[],"{'question_type': 'situational', 'seed_documen...",Accenture's EPS in the fourth quarter of 2024 ...,False,The agent stated that Accenture's EPS in the f...
c7d4310e-1030-41cc-8404-dd79cf7964c3,What is the expected revenue range and the EPS...,The revenue of Accenture in the fourth quarter...,Document 0: The revenue of Accenture in the fo...,[],"{'question_type': 'double', 'original_question...",The expected revenue range is $16.1 billion to...,False,The agent stated that the expected revenue ran...


In [59]:
# Giskard explains the reasons why it considers the answers to be incorrect.
failures['correctness_reason'].iloc[8]

"The agent stated that Accenture's EPS in the fourth quarter of 2024 was $11.67, but the correct answer is $2.66."

In [None]:
# !pip install openai==0.27.0
# !pip install whisper # Install Whisper without OpenAI requirement!

Collecting whisper
  Downloading whisper-1.1.10.tar.gz (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: whisper
  Building wheel for whisper (setup.py) ... [?25ldone
[?25h  Created wheel for whisper: filename=whisper-1.1.10-py3-none-any.whl size=41121 sha256=bf6e3521a837bc3321020c65c120c8e8561aa32931ec8168c97d741b37779bd8
  Stored in directory: /root/.cache/pip/wheels/21/65/ee/4e6672aabfa486d3341a39a04f8f87c77e5156149299b5a7d0
Successfully built whisper
Installing collected packages: whisper
Successfully installed whisper-1.1.10
[0m

In [None]:
# Working with local Whisper (/*** NOT using OpenAPI -  Works fine ***/

import whisper
import gradio as gr

# Load Whisper model
model = whisper.load_model("base")  # You can choose "small", "medium", etc.

# Function to transcribe audio locally using Whisper
def transcribe_audio_with_whisper(audio_path):
    result = model.transcribe(audio_path)
    return result["text"]

# Gradio Interface for Audio Transcription
iface = gr.Interface(
    fn=transcribe_audio_with_whisper,
    inputs=gr.Audio(type="filepath", label="Record your question here"),
    outputs="text",
    title="Audio to Text Transcription",
    description="Record an audio query and get the transcribed text using Whisper."
)

iface.launch(share=True)


* Running on local URL:  http://127.0.0.1:7869
* Running on public URL: https://b306e0e2f8f7467908.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [49]:
# Transcribe Audio ONLY!

import openai
import gradio as gr

# Function to transcribe audio using Whisper API
def transcribe_audio(audio_path):
    try:
        with open(audio_path, "rb") as audio_file:
            # Replace "whisper-1" with the appropriate Whisper model if needed
            response = openai.Audio.transcribe("whisper-1", audio_file)
            transcription = response.get("text", "Error: No transcription available.")
            return transcription
    except Exception as e:
        return f"Error during transcription: {e}"

# Gradio Interface for Audio Transcription Only
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath", label="Record your question here"),
    outputs="text",
    title="Audio to Text Transcription",
    description="Record an audio query and get the transcribed text using Whisper."
)

iface.launch(share=True)


* Running on local URL:  http://127.0.0.1:7868
* Running on public URL: https://1392029650c18d30ad.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# Gradio with Audio input/output

import gradio as gr
import openai
import json
import pyttsx3
from langchain.chat_models import ChatOpenAI

# Transcription function for audio input using Whisper API
def transcribe_audio_with_whisper(audio_path):
    try:
        with open(audio_path, "rb") as audio_file:
            response = openai.Audio.transcribe("whisper-1", audio_file)
            transcription = response["text"]
            return transcription
    except Exception as e:
        print(f"Error during transcription: {e}")
        return "Transcription Error"

# Text-to-speech function for generating audio output
def text_to_speech(text):
    tts = pyttsx3.init()
    tts.save_to_file(text, 'response_audio.mp3')
    return 'response_audio.mp3'

# Main function to handle both text and audio input
def gradio_chatbot(user_query=None, audio_query=None):
    if audio_query is not None:
        user_query = transcribe_audio_with_whisper(audio_query)
        
    if not user_query:
        return "Error: No valid input provided. Please either type or speak your question."
    
    # Use the chatbot with LangChain’s ReAct agent setup
    datasource = route_query(user_query)
    source_label = "VectorDB" if datasource == "vectorstore" else "Google Search" if datasource == "websearch" else "Unknown Source"
    response = handle_query_with_agent(user_query)

    # Generate audio response and return both text and audio
    audio_output_path = text_to_speech(response)
    return f"Answer (from {source_label}):\n{response}", audio_output_path

# Gradio interface with both text and audio inputs and outputs
iface = gr.Interface(
    fn=gradio_chatbot,
    inputs=[
        gr.Textbox(label="Type your question here"),          # Text input
        gr.Audio(type="filepath", label="Or speak your question here")  # Audio input
    ],
    outputs=[
        "text",                      # Text output for answer
        gr.Audio(label="Audio Answer")   # Audio output for TTS response
    ],
    title="Accenture Q&A Chatbot",
    description="Ask any question about Accenture and get answers with sourced information, through text or voice."
)

iface.launch(share=True)


* Running on local URL:  http://127.0.0.1:7864
* Running on public URL: https://529a52f742182e5eec.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)






[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use VectorStore Retrieval to find the revenue information for Accenture's fourth quarter of 2024.
Action: VectorStore Retrieval
Action Input: Accenture revenue Q4 2024[0m
Observation: [36;1m[1;3mAccenture reported revenues of $16.4 billion in Q4 2024, representing a 3% increase in US dollars and a 5% increase in local currency.[0m
Thought:[32;1m[1;3mI now have the final answer.
Final Answer: Accenture reported revenues of $16.4 billion in the fourth quarter of 2024.[0m

[1m> Finished chain.[0m


Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/pyttsx3/__init__.py", line 20, in init
    eng = _activeEngines[driverName]
          ~~~~~~~~~~~~~~^^^^^^^^^^^^
  File "/usr/lib/python3.11/weakref.py", line 136, in __getitem__
    o = self.data[key]()
        ~~~~~~~~~^^^^^
KeyError: None

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/gradio/queueing.py", line 624, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/route_utils.py", line 323, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/blocks.py", line 2015, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^


Error during transcription: 

You tried to access openai.Audio, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use VectorStore Retrieval to find information about transcription errors in Accenture's earnings reports.
Action: VectorStore Retrieval
Action Input: Transcription errors in Accenture's earnings reports[0m
Observation: [36;1m[1;3mAccenture experienced transcription errors in its earnings reports, with discrepancies in the number of employees mentioned in the documents provided.[0m
Thought:[32;1m[1;3mI should find out more details abou

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/pyttsx3/__init__.py", line 20, in init
    eng = _activeEngines[driverName]
          ~~~~~~~~~~~~~~^^^^^^^^^^^^
  File "/usr/lib/python3.11/weakref.py", line 136, in __getitem__
    o = self.data[key]()
        ~~~~~~~~~^^^^^
KeyError: None

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/gradio/queueing.py", line 624, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/route_utils.py", line 323, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/blocks.py", line 2015, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^


In [None]:
# New Code 11PM SATURDAY # THIS CODE worked using ReAct agent for Transcription, but went in Infinite LOOOOP

""" import os
import yt_dlp
from whisper import load_model
from langchain.agents import initialize_agent, Tool
from langchain.schema import HumanMessage
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

# Default configurations
VIDEO_LINKS_FILE = "video_links.txt"  # File with YouTube links
AUDIO_OUTPUT_TEMPLATE = "audio_{}"  # Template for audio file names without .mp3 extension
WHISPER_MODEL = "base"  # Whisper model for transcription

# Load YouTube links from the file
def load_video_links(filename=VIDEO_LINKS_FILE):
    with open(filename, 'r') as file:
        video_links = [link.strip() for link in file.readlines() if link.strip()]
    return video_links

# Download audio from YouTube video
def download_audio(url, output_name):
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [
            {
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }
        ],
        'outtmpl': output_name,  # Filename with .mp3 extension
        'ffmpeg_location': '/usr/bin/ffmpeg'
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return f"{output_name}.mp3"

# Transcribe audio using Whisper
def transcribe_audio_with_whisper(audio_path, model_name=WHISPER_MODEL):
    model = load_model(model_name)
    result = model.transcribe(audio_path)
    return result["text"]

# ReAct Tool for downloading and transcribing all videos
# Split transcriptions into chunks below the token limit - Process in Batches!
def chunk_text(text, max_length=1500):
    words = text.split()
    chunks = []
    chunk = []
    for word in words:
        chunk.append(word)
        if len(" ".join(chunk)) >= max_length:
            chunks.append(" ".join(chunk))
            chunk = []
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks

# Process each transcription in chunks
def transcribe_all_videos(input_data=None): # Accept an optional argument
    video_links = load_video_links(VIDEO_LINKS_FILE)
    all_transcriptions = []
    
    for i, video_url in enumerate(video_links):
        print(f"Processing video {i+1}/{len(video_links)}: {video_url}")
        audio_path = download_audio(video_url, AUDIO_OUTPUT_TEMPLATE.format(i))
        full_transcription = transcribe_audio_with_whisper(audio_path)
        transcription_chunks = chunk_text(full_transcription)  # Divide transcription
        
        for j, chunk in enumerate(transcription_chunks):
            # print(f"Processing chunk {j+1} of video {i+1}")
            # Send chunk to the ReAct agent
            # response = react_agent({"input": chunk}) /* Don't call Agent each time */
            all_transcriptions.append({"title": f"Video {i+1} - Chunk {j+1}", "transcription": chunk})
            print(f"Completed chunk {j+1} for video {i+1}")
        
    return all_transcriptions


def transcribe_all_videos(input_data=None):  # Accept an optional argument
    video_links = load_video_links(VIDEO_LINKS_FILE)
    all_transcriptions = []
    for i, video_url in enumerate(video_links):
        print(f"Processing video {i+1}/{len(video_links)}: {video_url}")
        audio_path = download_audio(video_url, AUDIO_OUTPUT_TEMPLATE.format(i))
        transcription = transcribe_audio_with_whisper(audio_path)
        all_transcriptions.append({"title": f"Video {i+1}", "transcription": transcription})
        print(f"Completed transcription for video {i+1}")
    return all_transcriptions

# Define structured output for better organization
response_schema = [
    ResponseSchema(name="transcription", description="Transcribed text from each video in structured format.")
]
output_parser = StructuredOutputParser.from_response_schemas(response_schema)

# Define tool for the agent
transcription_tool = Tool(
    name="TranscribeAllVideos",
    func=transcribe_all_videos,
    description="Transcribes all videos in the provided list and consolidates results."
)

# Initialize the ReAct agent with transcription tool
chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])
memory = ConversationBufferMemory()

react_agent = initialize_agent(
    tools=[transcription_tool],
    agent="zero-shot-react-description",
    llm=chat_model,
    memory=memory,
    verbose=True
)

# Run agent to perform transcription
def perform_transcription_task():
    user_query = "Transcribe all earnings podcasts from the provided YouTube links."
    transcriptions = transcribe_all_videos()  # Get transcriptions directly
    response = react_agent({"input": user_query, "transcriptions": transcriptions})  # Send entire list to agent once
    print("Agent's Transcription:", response["output"])

def perform_transcription_task():
    user_query = "Transcribe all earnings podcasts from the provided YouTube links."
    response = react_agent({"input": user_query})
    print("Agent's Transcription:", response["output"])

# Start the transcription task
perform_transcription_task()
 """

  chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])
  memory = ConversationBufferMemory()
  react_agent = initialize_agent(


Processing video 1/4: https://www.youtube.com/watch?v=089JqnNmJq0
[youtube] Extracting URL: https://www.youtube.com/watch?v=089JqnNmJq0
[youtube] 089JqnNmJq0: Downloading webpage
[youtube] 089JqnNmJq0: Downloading ios player API JSON
[youtube] 089JqnNmJq0: Downloading mweb player API JSON
[youtube] 089JqnNmJq0: Downloading player 4e23410d
[youtube] 089JqnNmJq0: Downloading m3u8 information
[info] 089JqnNmJq0: Downloading 1 format(s): 251
[download] Destination: audio_0
[download] 100% of   49.02MiB in 00:00:01 at 43.39MiB/s    
[ExtractAudio] Destination: audio_0.mp3
Deleting original file audio_0 (pass -k to keep)


100%|████████████████████████████████████████| 139M/139M [00:00<00:00, 181MiB/s]


Processing chunk 1 of video 1
Completed chunk 1 for video 1
Processing chunk 2 of video 1
Completed chunk 2 for video 1
Processing chunk 3 of video 1
Completed chunk 3 for video 1
Processing chunk 4 of video 1
Completed chunk 4 for video 1
Processing chunk 5 of video 1
Completed chunk 5 for video 1
Processing chunk 6 of video 1
Completed chunk 6 for video 1
Processing chunk 7 of video 1
Completed chunk 7 for video 1
Processing chunk 8 of video 1
Completed chunk 8 for video 1
Processing chunk 9 of video 1
Completed chunk 9 for video 1
Processing chunk 10 of video 1
Completed chunk 10 for video 1
Processing chunk 11 of video 1
Completed chunk 11 for video 1
Processing chunk 12 of video 1
Completed chunk 12 for video 1
Processing chunk 13 of video 1
Completed chunk 13 for video 1
Processing chunk 14 of video 1
Completed chunk 14 for video 1
Processing chunk 15 of video 1
Completed chunk 15 for video 1
Processing chunk 16 of video 1
Completed chunk 16 for video 1
Processing chunk 17 of vid

  response = react_agent({"input": user_query, "transcriptions": transcriptions})  # Send entire list to agent once


[32;1m[1;3mI should transcribe all the earnings podcasts from the YouTube links provided to consolidate the results.
Action: TranscribeAllVideos
Action Input: List of YouTube links to earnings podcasts[0mProcessing video 1/4: https://www.youtube.com/watch?v=089JqnNmJq0
[youtube] Extracting URL: https://www.youtube.com/watch?v=089JqnNmJq0
[youtube] 089JqnNmJq0: Downloading webpage
[youtube] 089JqnNmJq0: Downloading ios player API JSON
[youtube] 089JqnNmJq0: Downloading mweb player API JSON
[youtube] 089JqnNmJq0: Downloading m3u8 information
[info] 089JqnNmJq0: Downloading 1 format(s): 251
[download] Destination: audio_0
[download] 100% of   49.02MiB in 00:00:01 at 43.23MiB/s    
[ExtractAudio] Destination: audio_0.mp3
Deleting original file audio_0 (pass -k to keep)
Processing chunk 1 of video 1
Completed chunk 1 for video 1
Processing chunk 2 of video 1
Completed chunk 2 for video 1
Processing chunk 3 of video 1
Completed chunk 3 for video 1
Processing chunk 4 of video 1
Completed c

KeyboardInterrupt: 