In [2]:
""" !pip install pytube
!pip install openai-whisper
!pip install langchain
!pip install openai
!pip install python-dotenv """

' !pip install pytube\n!pip install openai-whisper\n!pip install langchain\n!pip install openai\n!pip install python-dotenv '

In [4]:
""" !pip install -U langchain-community
!pip install pinecone-client
!pip install --upgrade pytube
!pip install yt-dlp
!pip install imageio[ffmpeg] """

' !pip install -U langchain-community\n!pip install pinecone-client\n!pip install --upgrade pytube\n!pip install yt-dlp\n!pip install imageio[ffmpeg] '

In [5]:
from torch import cuda, torch
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

In [10]:
from dotenv import load_dotenv, find_dotenv
import getpass
import os

# Load Environment Variable
_ = load_dotenv(find_dotenv())

# Set the environment variable directly in the script
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OpenAI API Key: ")

#OPENAI_API_KEY  = os.getenv('OPENAI_API_KEY')
# initialize connection to pinecone (get API key at app.pinecone.io)
#PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") # or "YOUR_API_KEY"

In [12]:
# Access the environment variable with os.environ
# print(os.environ["OPENAI_API_KEY"])


In [15]:
# Sunday 3-Nov
# STEP 1: Transcription and Consolidation Tasks (without agent to improve the process and reduce the time)

import os
import yt_dlp
from whisper import load_model
from langchain.agents import initialize_agent, Tool
from langchain.schema import HumanMessage
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI

# Default configurations
VIDEO_LINKS_FILE = "video_links.txt"
AUDIO_OUTPUT_TEMPLATE = "audio_{}"
WHISPER_MODEL = "base"
TRANSCRIPT_DIR = "transcriptions"

# Ensure directory exists for transcriptions
os.makedirs(TRANSCRIPT_DIR, exist_ok=True)

# Load YouTube links from the file
def load_video_links(filename=VIDEO_LINKS_FILE):
    with open(filename, 'r') as file:
        return [link.strip() for link in file.readlines() if link.strip()]

# Download audio from YouTube video
def download_audio(url, output_name):
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3', 'preferredquality': '192'}],
        'outtmpl': f"{output_name}",
        'ffmpeg_location': '/usr/bin/ffmpeg'
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return f"{output_name}.mp3"

# Transcribe audio using Whisper
def transcribe_audio_with_whisper(audio_path, model_name=WHISPER_MODEL):
    model = load_model(model_name)
    result = model.transcribe(audio_path)
    return result["text"]

# Process each video, saving individual transcriptions
def transcribe_all_videos():
    video_links = load_video_links(VIDEO_LINKS_FILE)
    all_transcriptions = []

    for i, video_url in enumerate(video_links):
        print(f"Processing video {i+1}/{len(video_links)}: {video_url}")
        audio_path = download_audio(video_url, AUDIO_OUTPUT_TEMPLATE.format(i))
        
        # Transcribe in chunks to prevent reprocessing, saving each transcription file
        transcription = transcribe_audio_with_whisper(audio_path)
        transcript_file = os.path.join(TRANSCRIPT_DIR, f"transcription_{i+1}.txt")
        
        with open(transcript_file, 'w') as f:
            f.write(transcription)
        
        all_transcriptions.append({"title": f"Video {i+1}", "transcription": transcription})
        print(f"Completed transcription for video {i+1}")

    return all_transcriptions

# Consolidate transcriptions for LLM analysis
def consolidate_transcriptions():
    consolidated_text = ""
    for file in sorted(os.listdir(TRANSCRIPT_DIR)):
        file_path = os.path.join(TRANSCRIPT_DIR, file)
        
        # Check if the file_path is a file (not a directory)
        if os.path.isfile(file_path):
            with open(file_path, 'r') as f:
                consolidated_text += f.read() + "\n"
    return consolidated_text


# Execute process
if __name__ == "__main__":
    transcribe_all_videos()
    consolidated_text = consolidate_transcriptions()
    print("Consolidation Complete. Transcriptions are ready for vectorization and further processing.")

Processing video 1/4: https://www.youtube.com/watch?v=089JqnNmJq0
[youtube] Extracting URL: https://www.youtube.com/watch?v=089JqnNmJq0
[youtube] 089JqnNmJq0: Downloading webpage
[youtube] 089JqnNmJq0: Downloading ios player API JSON
[youtube] 089JqnNmJq0: Downloading mweb player API JSON
[youtube] 089JqnNmJq0: Downloading m3u8 information
[info] 089JqnNmJq0: Downloading 1 format(s): 251
[download] Destination: audio_0
[download] 100% of   49.02MiB in 00:00:01 at 41.95MiB/s    
[ExtractAudio] Destination: audio_0.mp3
Deleting original file audio_0 (pass -k to keep)
Completed transcription for video 1
Processing video 2/4: https://www.youtube.com/watch?v=IskUO7MoV2s
[youtube] Extracting URL: https://www.youtube.com/watch?v=IskUO7MoV2s
[youtube] IskUO7MoV2s: Downloading webpage
[youtube] IskUO7MoV2s: Downloading ios player API JSON
[youtube] IskUO7MoV2s: Downloading mweb player API JSON
[youtube] IskUO7MoV2s: Downloading m3u8 information
[info] IskUO7MoV2s: Downloading 1 format(s): 251
[

In [None]:
# Step 2: Chunk the consolidated text and Vectorization!

from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

# Split consolidated text into manageable chunks
def split_text_for_vectorization(consolidated_text, max_tokens=1500):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=max_tokens, chunk_overlap=200)
    return text_splitter.split_text(consolidated_text)

# Function to perform vectorization on each chunk
def perform_vectorization_task(consolidated_text):
    chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.getenv("OPENAI_API_KEY"))
    memory = ConversationBufferMemory()
    
    # Initialize the embedding model and the agent with vectorization tool
    embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")
    chunks = split_text_for_vectorization(consolidated_text)
    
    vectorized_segments = []
    for i, chunk in enumerate(chunks):
        # print(f"Vectorizing chunk {i+1}/{len(chunks)}")
        
        # Generate embeddings for this chunk
        embeddings = embedding_model.embed_documents([chunk])
        
        # Store each embedding
        vectorized_segments.append({
            "id": f"segment_{i}",
            "text": chunk,
            "embedding": embeddings[0]  # embedding_model.embed_documents returns a list
        })

    print("Vectorization complete. Ready for storage in a vector database.")
    return vectorized_segments

# Run vectorization task after consolidation
if __name__ == "__main__":    
    vectorized_segments = perform_vectorization_task(consolidated_text)


  chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.getenv("OPENAI_API_KEY"))
  memory = ConversationBufferMemory()
  embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")


Vectorization complete. Ready for storage in a vector database.


In [17]:
# STEP 3 - define Pinecone as Vector DB & Define Index & Embeddings.

from pinecone import Pinecone
import os
from getpass import getpass

# Securely fetch OpenAI and Pinecone API keys                    
os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY") or getpass("Enter Pinecone API key: ")
#os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY") or getpass("Enter OpenAI API key: ")

In [19]:
#print(os.environ["PINECONE_API_KEY"])

In [21]:
from pinecone import ServerlessSpec

spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

In [22]:
# STEP 4: Create INDEX using EMBEDDINGS OpenAI-ada-002 - creating the Index for 'Accenture Earnings'

import os
import time
from getpass import getpass
from tqdm.auto import tqdm
from langchain.embeddings import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec

# Initialize OpenAI embeddings model
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=os.environ["OPENAI_API_KEY"])

# Initialize Pinecone client with the updated API
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
#spec = ServerlessSpec(cloud="aws", region="us-west-1")  # Specify Pinecone environment
index_name = "transcription-accnt-earn-index-2"             # Create New Index

# Check if the index exists; create if it doesn’t
if index_name not in [index_info["name"] for index_info in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=1536,  # OpenAI embedding dimension
        metric="dotproduct",
        spec=spec
    )
    # Wait for the index to be ready
    while not pc.describe_index(index_name).status["ready"]:
        print("Waiting for index to be ready...")
        time.sleep(1)

# Connect to the index for upsertion
index = pc.Index(index_name)
time.sleep(1)
# view index stats
print("Index statistics:", index.describe_index_stats())

Index statistics: {'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [23]:
# STEP 5 - UPSERT embeddings to Pinecone in batches

batch_size = 50
for i in tqdm(range(0, len(vectorized_segments), batch_size)):
    batch = vectorized_segments[i:i + batch_size]
    ids = [segment["id"] for segment in batch]
    embeds = [segment["embedding"] for segment in batch]
    metadatas = [{"text": segment["text"]} for segment in batch]

    # Upsert each batch to Pinecone
    try:
        index.upsert(vectors=list(zip(ids, embeds, metadatas)))
    except Exception as e:
        print(f"Error upserting batch {i // batch_size + 1}: {e}")

print("All segments upserted into Pinecone.")


  0%|          | 0/4 [00:00<?, ?it/s]

All segments upserted into Pinecone.


In [25]:
print("Index statistics:", index.describe_index_stats())

Index statistics: {'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 175}},
 'total_vector_count': 175}


In [27]:
#!pip install langchain-community

In [None]:
# Define ROUTER for query to be passed to VectorDB or Websearch (google search) -  Example to demonstrate

import json
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage

# Initialize LLM in JSON mode (replace 'gpt-3.5-turbo' with any model configured for JSON output)
llm_json_mode = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])

# Define the routing instructions
router_instructions = """You are an expert at routing a user question to a vectorstore or web search.

The vectorstore contains documents related to Accenture company Earnings results, Revenue, Earnings for last four quarters, sales, cloud projects, Genai projects and employee details.

Use the vectorstore for questions on these topics. For all else, and especially for current events, use web-search.

Return JSON with a single key, "datasource", that is either 'websearch' or 'vectorstore' depending on the question."""

# Function to route based on the instructions
def route_query(query):
    routing_response = llm_json_mode.invoke(
        [SystemMessage(content=router_instructions), HumanMessage(content=query)]
    )
    route_decision = json.loads(routing_response.content)
    return route_decision["datasource"]

# Example queries to test the router
test_queries = [
    "What is the revenue of Accenture last quarter?",
    "What is the EPS for last quarter?",
    "What is the total no of employee count in Accenture company?",
    "What is the size of Genai project deals for Accenture this year?",
]

# Execute the router on test queries
for query in test_queries:
    datasource = route_query(query)
    print(f"Query: {query}\nRouted to: {datasource}\n")


Query: What is the revenue of Accenture last quarter?
Routed to: vectorstore

Query: What is the EPS for last quarter?
Routed to: vectorstore

Query: What is the total no of employee count in Accenture company?
Routed to: vectorstore

Query: What is the size of Genai project deals for Accenture this year?
Routed to: vectorstore



In [30]:
## !pip install tavily-python (not needed)
#!pip install google-search-results

In [31]:
import os
import getpass

#os.environ["TAVILY_API_KEY"] = os.getenv("TAVILY_API_KEY") or getpass.getpass("Enter Tavily API key: ")

# Fetch the SerpAPI key securely
os.environ["SERPAPI_API_KEY"] = os.getenv("SERPAPI_API_KEY") or getpass.getpass("Enter SerpAPI key: ")

# Example usage in SerpAPI tool setup
serp_api_key = os.environ["SERPAPI_API_KEY"]

In [33]:
#print(os.environ["SERPAPI_API_KEY"])

In [36]:
# DEFINE ROUTER, RETRIEVE from VectorstoreDB and Retrieve from Google search functions!

import json
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
from serpapi import GoogleSearch

import os

# Initialize the embedding model and vector store
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=os.environ["OPENAI_API_KEY"])
chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])

#from langchain_community.tools.tavily_search import TavilySearchResults
# web_search_tool = TavilySearchResults(k=3)

# Assuming Pinecone index is already set up and embeddings are upserted
vector_store = Pinecone(index=index, embedding=embedding_model, text_key="text")

# Initialize the router instructions
router_instructions = """You are an expert at routing a user question to a vectorstore or web search.

The vectorstore contains documents related to Accenture company Quarterly results, Revenue, Earnings for the last four quarters, sales, digital projects, cloud projects and employee details.

Use the vectorstore for questions on these topics. For all else, and especially for current events, use web-search.

Return JSON with a single key, "datasource", that is either 'websearch' or 'vectorstore' depending on the question."""

# Function to route query based on instructions
def route_query(query):
    routing_response = chat_model.invoke(
        [SystemMessage(content=router_instructions), HumanMessage(content=query)]
    )
    route_decision = json.loads(routing_response.content)
    return route_decision["datasource"]

# Function to retrieve from vector database
def retrieve_from_vectorstore(query):
    retriever = vector_store.as_retriever()
    results = retriever.get_relevant_documents(query)
    document_text = "\n\n".join([doc.page_content for doc in results])

    # Summarize the retrieved documents using LLM 
    prompt = (
        "Based on the following documents, provide a concise answer to the question:\n\n"
        f"Question: {query}\n\n"
        "Documents:\n"
        f"{document_text}\n\n"
        "Answer:"
    )
    
    answer_response = chat_model.invoke([HumanMessage(content=prompt)])
    return answer_response.content
    # return "\n\n".join([doc.page_content for doc in results]) # can be removed if this modules functions right!

# Function to retrieve from Google search using SerpaAPI
def web_search(query):
    search = GoogleSearch({
        "q": query,
        "api_key": serp_api_key, 
        "num": 3  # Limit to top 3 results for conciseness
    })
    results = search.get_dict()

    # Extract relevant text from search results
    combined_text = "\n".join(
        result.get("snippet", "No snippet available")
        for result in results.get("organic_results", [])
    )
    
    # Summarize results using the LLM
    prompt = (
        f"Summarize the following search results to answer the question:\n\n"
        f"Question: {query}\n\n"
        f"Results:\n{combined_text}\n\n"
        "Answer:"
    )
    summary_response = chat_model.invoke([HumanMessage(content=prompt)])
    return summary_response.content


# Function to perform web search (e.g., using Tavily - NOT USING IT)
""" def web_search(query):
    # Fetch results using Tavily
    results = web_search_tool.run(query)
    print("Results:", results)  # Inspect the structure
    combined_text = "\n".join(result.get('text', 'No text available') for result in results if 'text' in result)

    # Summarize results if multiple documents are returned
    prompt = (
        f"Summarize the following web search results to answer the question:\n\n"
        f"Question: {query}\n\n"
        f"Results:\n{combined_text}\n\n"
        "Answer:"
    )
    summary_response = chat_model.invoke([HumanMessage(content=prompt)])
    return summary_response.content """
    

# Main function to handle the query routing and retrieval
def handle_query(query):
    datasource = route_query(query)
    
    if datasource == "vectorstore":
        print("Routing to Vectorstore...")
        return retrieve_from_vectorstore(query)
    elif datasource == "websearch":
        print("Routing to Web Search...")
        return web_search(query)
    else:
        return "Unknown data source. Please check the router configuration."

# Example usage
query = "What is the revenue of Accenture last quarter??"
response = handle_query(query)
print("Response:", response)

query = "What is the EPS of Accenture last quarter?"
response = handle_query(query)
print("Response:", response)

query = "What is the change in Revenue between Fourth and Third quarters?"
response = handle_query(query)
print("Response:", response)

query = "Who is the main competitor of Accenture?"
response = handle_query(query)
print("Response:", response)


Routing to Vectorstore...
Response: The revenue of Accenture last quarter was $16 billion.
Routing to Vectorstore...
Response: The EPS of Accenture last quarter was $2.77.
Routing to Vectorstore...
Response: The change in revenue between the fourth and third quarters was an increase of 6% in North America, 2% in Amia, and 9% in growth markets.
Routing to Web Search...
Response: The main competitor of Accenture is IBM.


In [37]:
""" # Control the recursive depth to avoid Web_search error

import sys
sys.setrecursionlimit(2000)  # Limit to a reasonable depth """


' # Control the recursive depth to avoid Web_search error\n\nimport sys\nsys.setrecursionlimit(2000)  # Limit to a reasonable depth '

In [None]:
# DEFINE ALL THE TOOLS for ReACT agent! No CHATBOT YET!

from langchain.agents import initialize_agent
from langchain.agents import Tool
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
import os
from langchain.agents import initialize_agent
from serpapi import GoogleSearch

# Define the SerpAPI tool
web_search_tool = Tool(
    name="Google Search",
    func=web_search,
    description="Retrieve relevant information using google web search for general or current event questions."
)

# Define tool for vector store retrieval
vector_store_tool = Tool(
    name="VectorStore Retrieval",
    func=retrieve_from_vectorstore,
    description="Retrieve relevant documents from the vector store for questions about Accenture's earnings, revenue, sales, projects, and employee details.",
)

""" # Define tool for web search (Tavily - not Using it!)
web_search_tool = Tool(
    name="Web Search Retrieval",
    func=web_search,
    description="Retrieve information using Tavily web search for general or current event questions not covered in the vector store, particularly about competitors or industry updates.",
)
 """
# Initialize the ReAct agent with both tools
react_agent = initialize_agent(
    tools=[vector_store_tool, web_search_tool],
    agent="zero-shot-react-description",
    llm=chat_model,
    verbose=True,
    max_iterations=3  # Restrict maximum depth to avoid infinite loops
)

# Main function to handle the query using the ReAct agent
def handle_query_with_agent(query):
    # Call the ReAct agent with the query; it will decide which tool to use
    response = react_agent({"input": query})
    return response["output"]

# Example usage
query = "What is the revenue of Accenture last quarter?"
response = handle_query_with_agent(query)
print("Response:", response)

query = "What is the size of Genai project deals for Accenture this year??"
response = handle_query_with_agent(query)
print("Response:", response)

query = "Who are the competitors of Accenture?"
response = handle_query_with_agent(query)
print("Response:", response)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use VectorStore Retrieval to get information about Accenture's revenue last quarter.
Action: VectorStore Retrieval
Action Input: Accenture revenue last quarter[0m
Observation: [36;1m[1;3mAccenture's revenue last quarter was $15.8 billion, representing flat growth compared to the previous year. The company continues to take market share and had a record $21.6 billion in bookings, with strong performance in North America.[0m
Thought:[32;1m[1;3mI now have the answer to the revenue of Accenture last quarter.
Final Answer: Accenture's revenue last quarter was $15.8 billion.[0m

[1m> Finished chain.[0m
Response: Accenture's revenue last quarter was $15.8 billion.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to find information about Accenture's Genai project deals for this year.
Action: VectorStore Retrieval
Action Input: Genai project deals Accenture 2021[0m
Observation: [36;1m[1;3mThe G

In [39]:
# INTERACTIVE CHAT FUNCTION using ReACT Agent - REFINED 2!

import json
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.schema import HumanMessage

# Initialize chat model and memory if not already set up
chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Define chatbot interaction with enhanced error handling and source tracking
def chatbot_interaction():
    print("Welcome to the Accenture Q&A Chatbot! Ask any question about Accenture.")
    while True:
        user_query = input("You: ")
        if user_query.lower() in ["exit", "quit", "no"]:
            print("Agent: Thank you for using the Q&A chatbot. Goodbye!")
            break
        
        try:
            # Attempt to call the ReAct agent and capture response
            response = react_agent({"input": user_query})
            
            # Determine the source tool based on response content
            source_tool = "VectorDB" if "VectorStore Retrieval" in response.get("log", "") else "Websearch"
            
            # Parse the response JSON if possible
            if "output" in response and isinstance(response["output"], str):
                try:
                    parsed_response = json.loads(response["output"])
                    # Display question, answer, and source tool
                    if isinstance(parsed_response, dict) and "answer" in parsed_response:
                        print(f"\nAgent (using {source_tool}):\nQuestion: {user_query}\nAnswer: {parsed_response['answer']}")
                    else:
                        print(f"\nAgent (using {source_tool}):\nQuestion: {user_query}\nAnswer: {response['output']}")  # Fallback to raw output
                except json.JSONDecodeError:
                    print("Agent: Error parsing response JSON, displaying raw answer instead.")
                    print(f"Agent (using {source_tool}): {response['output']}")
            else:
                # Fallback for unparsed or unexpected response format
                print("Agent: Unable to process the request in the expected format. Please try again.")
                
        except Exception as e:
            print("Error encountered, displaying default error message.")
            print("Agent: Unable to process the request. Please try again.")
        
        print("\nAgent: Do you have more questions?")

# Start the chatbot interaction
chatbot_interaction()


Welcome to the Accenture Q&A Chatbot! Ask any question about Accenture.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use the VectorStore Retrieval tool to look for information on Accenture's revenue.
Action: VectorStore Retrieval
Action Input: Accenture revenue last quarter[0m
Observation: [36;1m[1;3mAccenture's revenue last quarter was $15.8 billion, representing a flat growth compared to the previous year.[0m
Thought:[32;1m[1;3mI now know the final answer
Final Answer: Accenture's revenue last quarter was $15.8 billion.[0m

[1m> Finished chain.[0m
Agent: Error parsing response JSON, displaying raw answer instead.
Agent (using Websearch): Accenture's revenue last quarter was $15.8 billion.

Agent: Do you have more questions?


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use VectorStore Retrieval to find information on Accenture's projects.
Action: VectorStore Retrieval
Action Input: Genai project deals size Accenture 2021[0m
O

In [None]:
# Gradio placeholder

#!pip install gradio

In [49]:
import json

def gradio_chatbot(user_query):
    # Function to interact with the chatbot using Gradio
    try:
        # Get response from the ReAct agent
        response = react_agent({"input": user_query})

        # Debug: Print log content to verify tool identification
        log_content = response.get("log", "")
        print("Debug - Log Content:", log_content)

        # Check if the response is not empty and parse the response JSON if possible
        if response and "output" in response:
            output_content = response["output"]
            
            # Attempt to parse JSON response if available
            try:
                parsed_response = json.loads(output_content) if isinstance(output_content, str) else output_content
                answer = parsed_response.get("answer", output_content) if isinstance(parsed_response, dict) else output_content
            except json.JSONDecodeError:
                answer = output_content  # Fallback to raw output if JSON parsing fails
            
            # Determine the source based on the actual tool used, with more robust checking
            if "Google Search" in log_content:
                source_tool = "Websearch"
            elif "VectorStore Retrieval" in log_content or "VectorDB" in log_content:
                source_tool = "VectorDB"
            else:
                source_tool = "Unknown Source"  # In case it fails to detect any known tool

            return f"Answer (from {source_tool}):\n{answer}"

        else:
            return "Error: Received an empty or unexpected response format from the agent."

    except Exception as e:
        # Capture any other errors
        return f"Error encountered: {str(e)}"

# Create the Gradio Interface
iface = gr.Interface(
    fn=gradio_chatbot,  # Function to call
    inputs="text",      # Input type: text
    outputs="text",     # Output type: text
    title="Accenture Q&A Chatbot",
    description="Ask any question about Accenture and get answers with sourced information."
)

# Launch the Gradio app
iface.launch(share=True)


* Running on local URL:  http://127.0.0.1:7865
* Running on public URL: https://e01fcb2c32b9ae246f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# !pip install openai 0.27.0
# !pip install -U openai


Collecting openai
  Using cached openai-1.53.1-py3-none-any.whl.metadata (24 kB)
Using cached openai-1.53.1-py3-none-any.whl (387 kB)
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 0.27.0
    Uninstalling openai-0.27.0:
      Successfully uninstalled openai-0.27.0
Successfully installed openai-1.53.1
[0m

In [65]:
# GRADIO WITH AUDIO! 

import openai
import gradio as gr

# Transcription function for audio input using the latest Whisper API
def transcribe_audio(audio_path):
    try:
        with open(audio_path, "rb") as audio_file:
            # Correct method for transcribing audio in latest OpenAI version
            response = openai.Audio.transcribe("whisper-1", audio_file)
            transcription = response["text"]
            print(f"Transcription Result: {transcription}")
            return transcription
    except Exception as e:
        print(f"Error during transcription: {e}")
        return None

def gradio_chatbot(user_query=None, audio_query=None):
    # Transcribe audio if provided
    if audio_query is not None:
        user_query = transcribe_audio(audio_query)
    
    # Check if there's valid input from either text or audio
    if not user_query:
        return "Error: No valid input provided. Please either type or speak your question."

    # Process the query with the ReAct agent (replace 'react_agent' with your actual agent)
    try:
        response = react_agent({"input": user_query})
        # Extract the output and source from the response
        if "output" in response:
            source_tool = "VectorDB" if "VectorStore Retrieval" in response.get("log", "") else "Websearch"
            answer = response["output"]
            return f"Answer (from {source_tool}): {answer}"
        else:
            return "Error: Unable to process the request. Please try again."
    except Exception as e:
        print(f"Error encountered: {e}")
        return f"Error encountered: {str(e)}"

# Gradio interface with audio and text inputs
gr.Interface(
    fn=gradio_chatbot,
    inputs=[
        gr.Textbox(label="Type your question here"),
        gr.Audio(type="filepath", label="Or speak your question here")
    ],
    outputs="text",
    title="Accenture Q&A Chatbot",
    description="Ask any question about Accenture and get answers with sourced information."
).launch(share=True)


* Running on local URL:  http://127.0.0.1:7874
* Running on public URL: https://2a694c9c7d9ca5ea05.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)






[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use VectorStore Retrieval to find information about Accenture's revenue.
Action: VectorStore Retrieval
Action Input: "Accenture revenue last quarter"[0m
Observation: [36;1m[1;3mAccenture's revenue last quarter was $15.8 billion, which was flat compared to the previous year.[0m
Thought:[32;1m[1;3mI should also double-check this information with a Google search to ensure accuracy.
Action: Google Search
Action Input: Accenture revenue last quarter[0m
Observation: [33;1m[1;3mAccenture reported a revenue of $12.8 billion for the last quarter.[0m
Thought:[32;1m[1;3mThere seems to be a discrepancy in the revenue information between the vector store and Google search results. I should try to verify this information from a reliable source.
Action: Google Search
Action Input: Accenture financial report last quarter[0m
Observation: [33;1m[1;3mThe search results provide information about Accenture's financial perf

In [None]:
# Monday - Finetune Retrieval and QA and Chatbot!

# Check Next code, separated for finetuning.


In [None]:
# New Code 11PM SATURDAY # THIS CODE worked using ReAct agent for Transcription, but went in Infinite LOOOOP

""" import os
import yt_dlp
from whisper import load_model
from langchain.agents import initialize_agent, Tool
from langchain.schema import HumanMessage
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

# Default configurations
VIDEO_LINKS_FILE = "video_links.txt"  # File with YouTube links
AUDIO_OUTPUT_TEMPLATE = "audio_{}"  # Template for audio file names without .mp3 extension
WHISPER_MODEL = "base"  # Whisper model for transcription

# Load YouTube links from the file
def load_video_links(filename=VIDEO_LINKS_FILE):
    with open(filename, 'r') as file:
        video_links = [link.strip() for link in file.readlines() if link.strip()]
    return video_links

# Download audio from YouTube video
def download_audio(url, output_name):
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [
            {
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }
        ],
        'outtmpl': output_name,  # Filename with .mp3 extension
        'ffmpeg_location': '/usr/bin/ffmpeg'
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return f"{output_name}.mp3"

# Transcribe audio using Whisper
def transcribe_audio_with_whisper(audio_path, model_name=WHISPER_MODEL):
    model = load_model(model_name)
    result = model.transcribe(audio_path)
    return result["text"]

# ReAct Tool for downloading and transcribing all videos
# Split transcriptions into chunks below the token limit - Process in Batches!
def chunk_text(text, max_length=1500):
    words = text.split()
    chunks = []
    chunk = []
    for word in words:
        chunk.append(word)
        if len(" ".join(chunk)) >= max_length:
            chunks.append(" ".join(chunk))
            chunk = []
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks

# Process each transcription in chunks
def transcribe_all_videos(input_data=None): # Accept an optional argument
    video_links = load_video_links(VIDEO_LINKS_FILE)
    all_transcriptions = []
    
    for i, video_url in enumerate(video_links):
        print(f"Processing video {i+1}/{len(video_links)}: {video_url}")
        audio_path = download_audio(video_url, AUDIO_OUTPUT_TEMPLATE.format(i))
        full_transcription = transcribe_audio_with_whisper(audio_path)
        transcription_chunks = chunk_text(full_transcription)  # Divide transcription
        
        for j, chunk in enumerate(transcription_chunks):
            # print(f"Processing chunk {j+1} of video {i+1}")
            # Send chunk to the ReAct agent
            # response = react_agent({"input": chunk}) /* Don't call Agent each time */
            all_transcriptions.append({"title": f"Video {i+1} - Chunk {j+1}", "transcription": chunk})
            print(f"Completed chunk {j+1} for video {i+1}")
        
    return all_transcriptions


def transcribe_all_videos(input_data=None):  # Accept an optional argument
    video_links = load_video_links(VIDEO_LINKS_FILE)
    all_transcriptions = []
    for i, video_url in enumerate(video_links):
        print(f"Processing video {i+1}/{len(video_links)}: {video_url}")
        audio_path = download_audio(video_url, AUDIO_OUTPUT_TEMPLATE.format(i))
        transcription = transcribe_audio_with_whisper(audio_path)
        all_transcriptions.append({"title": f"Video {i+1}", "transcription": transcription})
        print(f"Completed transcription for video {i+1}")
    return all_transcriptions

# Define structured output for better organization
response_schema = [
    ResponseSchema(name="transcription", description="Transcribed text from each video in structured format.")
]
output_parser = StructuredOutputParser.from_response_schemas(response_schema)

# Define tool for the agent
transcription_tool = Tool(
    name="TranscribeAllVideos",
    func=transcribe_all_videos,
    description="Transcribes all videos in the provided list and consolidates results."
)

# Initialize the ReAct agent with transcription tool
chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])
memory = ConversationBufferMemory()

react_agent = initialize_agent(
    tools=[transcription_tool],
    agent="zero-shot-react-description",
    llm=chat_model,
    memory=memory,
    verbose=True
)

# Run agent to perform transcription
def perform_transcription_task():
    user_query = "Transcribe all earnings podcasts from the provided YouTube links."
    transcriptions = transcribe_all_videos()  # Get transcriptions directly
    response = react_agent({"input": user_query, "transcriptions": transcriptions})  # Send entire list to agent once
    print("Agent's Transcription:", response["output"])

def perform_transcription_task():
    user_query = "Transcribe all earnings podcasts from the provided YouTube links."
    response = react_agent({"input": user_query})
    print("Agent's Transcription:", response["output"])

# Start the transcription task
perform_transcription_task()
 """

  chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])
  memory = ConversationBufferMemory()
  react_agent = initialize_agent(


Processing video 1/4: https://www.youtube.com/watch?v=089JqnNmJq0
[youtube] Extracting URL: https://www.youtube.com/watch?v=089JqnNmJq0
[youtube] 089JqnNmJq0: Downloading webpage
[youtube] 089JqnNmJq0: Downloading ios player API JSON
[youtube] 089JqnNmJq0: Downloading mweb player API JSON
[youtube] 089JqnNmJq0: Downloading player 4e23410d
[youtube] 089JqnNmJq0: Downloading m3u8 information
[info] 089JqnNmJq0: Downloading 1 format(s): 251
[download] Destination: audio_0
[download] 100% of   49.02MiB in 00:00:01 at 43.39MiB/s    
[ExtractAudio] Destination: audio_0.mp3
Deleting original file audio_0 (pass -k to keep)


100%|████████████████████████████████████████| 139M/139M [00:00<00:00, 181MiB/s]


Processing chunk 1 of video 1
Completed chunk 1 for video 1
Processing chunk 2 of video 1
Completed chunk 2 for video 1
Processing chunk 3 of video 1
Completed chunk 3 for video 1
Processing chunk 4 of video 1
Completed chunk 4 for video 1
Processing chunk 5 of video 1
Completed chunk 5 for video 1
Processing chunk 6 of video 1
Completed chunk 6 for video 1
Processing chunk 7 of video 1
Completed chunk 7 for video 1
Processing chunk 8 of video 1
Completed chunk 8 for video 1
Processing chunk 9 of video 1
Completed chunk 9 for video 1
Processing chunk 10 of video 1
Completed chunk 10 for video 1
Processing chunk 11 of video 1
Completed chunk 11 for video 1
Processing chunk 12 of video 1
Completed chunk 12 for video 1
Processing chunk 13 of video 1
Completed chunk 13 for video 1
Processing chunk 14 of video 1
Completed chunk 14 for video 1
Processing chunk 15 of video 1
Completed chunk 15 for video 1
Processing chunk 16 of video 1
Completed chunk 16 for video 1
Processing chunk 17 of vid

  response = react_agent({"input": user_query, "transcriptions": transcriptions})  # Send entire list to agent once


[32;1m[1;3mI should transcribe all the earnings podcasts from the YouTube links provided to consolidate the results.
Action: TranscribeAllVideos
Action Input: List of YouTube links to earnings podcasts[0mProcessing video 1/4: https://www.youtube.com/watch?v=089JqnNmJq0
[youtube] Extracting URL: https://www.youtube.com/watch?v=089JqnNmJq0
[youtube] 089JqnNmJq0: Downloading webpage
[youtube] 089JqnNmJq0: Downloading ios player API JSON
[youtube] 089JqnNmJq0: Downloading mweb player API JSON
[youtube] 089JqnNmJq0: Downloading m3u8 information
[info] 089JqnNmJq0: Downloading 1 format(s): 251
[download] Destination: audio_0
[download] 100% of   49.02MiB in 00:00:01 at 43.23MiB/s    
[ExtractAudio] Destination: audio_0.mp3
Deleting original file audio_0 (pass -k to keep)
Processing chunk 1 of video 1
Completed chunk 1 for video 1
Processing chunk 2 of video 1
Completed chunk 2 for video 1
Processing chunk 3 of video 1
Completed chunk 3 for video 1
Processing chunk 4 of video 1
Completed c

KeyboardInterrupt: 