In [3]:
""" !pip install pytube
!pip install openai-whisper
!pip install langchain
!pip install openai
!pip install python-dotenv """

' !pip install pytube\n!pip install openai-whisper\n!pip install langchain\n!pip install openai\n!pip install python-dotenv '

In [4]:
""" !pip install -U langchain-community
!pip install pinecone-client
!pip install --upgrade pytube
!pip install yt-dlp
!pip install imageio[ffmpeg] """

' !pip install -U langchain-community\n!pip install pinecone-client\n!pip install --upgrade pytube\n!pip install yt-dlp\n!pip install imageio[ffmpeg] '

In [5]:
from torch import cuda, torch
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

In [6]:
from dotenv import load_dotenv, find_dotenv
import getpass
import os

# Load Environment Variable
_ = load_dotenv(find_dotenv())

# Set the environment variable directly in the script
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OpenAI API Key: ")

In [12]:
# Import Langsmith (make sure it's installed first)
import langsmith  # Importing Langsmith's library
from langsmith import traceable
from langsmith.wrappers import wrap_openai
from langsmith import Client
from langsmith.evaluation import evaluate
from langsmith.schemas import Example, Run
import getpass
import os

In [13]:
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "prj-yt-chatbot-qa"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("Enter LANGCHAIN_API_KEY")        # Authenticate with Langsmith

In [16]:
# STEP 1: Load VIDEO, Download Audio, Transcribe into different text files!!!

import os
import yt_dlp
from whisper import load_model
from langchain.agents import initialize_agent, Tool
from langchain.schema import HumanMessage
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI

# Default configurations
VIDEO_LINKS_FILE = "video_links.txt"
AUDIO_OUTPUT_TEMPLATE = "audio_{}"
WHISPER_MODEL = "base"
TRANSCRIPT_DIR = "transcriptions"

# Ensure directory exists for transcriptions
os.makedirs(TRANSCRIPT_DIR, exist_ok=True)

# Load YouTube links from the file
def load_video_links(filename=VIDEO_LINKS_FILE):
    with open(filename, 'r') as file:
        return [link.strip() for link in file.readlines() if link.strip()]

# Download audio from YouTube video
def download_audio(url, output_name):
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3', 'preferredquality': '192'}],
        'outtmpl': f"{output_name}",
        'ffmpeg_location': '/usr/bin/ffmpeg'
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return f"{output_name}.mp3"

# Transcribe audio using Whisper
def transcribe_audio_with_whisper(audio_path, model_name=WHISPER_MODEL):
    model = load_model(model_name)
    result = model.transcribe(audio_path)
    return result["text"]

# Process each video and save individual transcriptions
def transcribe_all_videos():
    video_links = load_video_links(VIDEO_LINKS_FILE)

    for i, video_url in enumerate(video_links):
        print(f"Processing video {i+1}/{len(video_links)}: {video_url}")
        audio_path = download_audio(video_url, AUDIO_OUTPUT_TEMPLATE.format(i))
        
        # Transcribe the audio and save each transcription to a separate file
        transcription = transcribe_audio_with_whisper(audio_path)
        transcript_file = os.path.join(TRANSCRIPT_DIR, f"transcription_{i+1}.txt")
        
        with open(transcript_file, 'w') as f:
            f.write(transcription)
        
        print(f"Completed transcription for video {i+1}")

# Execute transcription process only, without consolidation
if __name__ == "__main__":
    transcribe_all_videos()
    print("Individual transcription files are ready in the transcriptions folder.")


Processing video 1/8: https://www.youtube.com/watch?v=-RIfno4Sp0g
[youtube] Extracting URL: https://www.youtube.com/watch?v=-RIfno4Sp0g
[youtube] -RIfno4Sp0g: Downloading webpage
[youtube] -RIfno4Sp0g: Downloading ios player API JSON
[youtube] -RIfno4Sp0g: Downloading mweb player API JSON
[youtube] -RIfno4Sp0g: Downloading player 81ef9024
[youtube] -RIfno4Sp0g: Downloading m3u8 information
[info] -RIfno4Sp0g: Downloading 1 format(s): 251
[download] Destination: audio_0
[download] 100% of   48.70MiB in 00:00:00 at 51.13MiB/s    
[ExtractAudio] Destination: audio_0.mp3
Deleting original file audio_0 (pass -k to keep)


100%|███████████████████████████████████████| 139M/139M [00:02<00:00, 57.6MiB/s]


Completed transcription for video 1
Processing video 2/8: https://www.youtube.com/watch?v=3SCOVB1KDFc
[youtube] Extracting URL: https://www.youtube.com/watch?v=3SCOVB1KDFc
[youtube] 3SCOVB1KDFc: Downloading webpage
[youtube] 3SCOVB1KDFc: Downloading ios player API JSON
[youtube] 3SCOVB1KDFc: Downloading mweb player API JSON
[youtube] 3SCOVB1KDFc: Downloading m3u8 information
[info] 3SCOVB1KDFc: Downloading 1 format(s): 251
[download] Destination: audio_1
[download] 100% of   54.16MiB in 00:00:02 at 18.42MiB/s    
[ExtractAudio] Destination: audio_1.mp3
Deleting original file audio_1 (pass -k to keep)
Completed transcription for video 2
Processing video 3/8: https://www.youtube.com/watch?v=qMqP2IwhSnA
[youtube] Extracting URL: https://www.youtube.com/watch?v=qMqP2IwhSnA
[youtube] qMqP2IwhSnA: Downloading webpage
[youtube] qMqP2IwhSnA: Downloading ios player API JSON
[youtube] qMqP2IwhSnA: Downloading mweb player API JSON
[youtube] qMqP2IwhSnA: Downloading m3u8 information
[info] qMqP2Iw

In [17]:
# STEP 2: Define Pinecone as Vector DB & Define Index

from pinecone import Pinecone
import os
from getpass import getpass

# Securely fetch Pinecone API keys                    
os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY") or getpass("Enter Pinecone API key: ")

In [22]:
#print(os.environ["PINECONE_API_KEY"])
from pinecone import ServerlessSpec

spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

In [24]:
# Create INDEX 

import os
import time
from tqdm.auto import tqdm
from pinecone import Pinecone, ServerlessSpec

# Initialize Pinecone client with the updated API
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
#spec = ServerlessSpec(cloud="aws", region="us-west-1")          # Specify Pinecone environment
index_name = "transcription-accnt-earn-index-no-consol"             # Create New Index

# Check if the index exists; create if it doesn’t
if index_name not in [index_info["name"] for index_info in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=1536,  # OpenAI embedding dimension
        metric="dotproduct",
        spec=spec
    )
    # Wait for the index to be ready
    while not pc.describe_index(index_name).status["ready"]:
        print("Waiting for index to be ready...")
        time.sleep(1)

# Connect to the index for upsertion
index = pc.Index(index_name)
time.sleep(1)
# view index stats
print("Index statistics:", index.describe_index_stats())

Index statistics: {'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [28]:
# STEP 3: Vectorize and Embed each Transcription file

from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

# Initialize embedding model
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")

# Function to split and vectorize text from each transcription file
def vectorize_individual_transcriptions(transcription_dir="transcriptions"):
    # Initialize list to hold vectorized segments for all files
    all_vectorized_segments = []
    
    # Iterate through each transcription file in the directory
    for filename in sorted(os.listdir(transcription_dir)):
        file_path = os.path.join(transcription_dir, filename)
        
        # Skip directories and hidden files
        if not os.path.isfile(file_path) or filename.startswith('.'):
            continue

        # Read transcription text
        with open(file_path, 'r') as f:
            transcription_text = f.read()
        
        # Split text into chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
        chunks = text_splitter.split_text(transcription_text)
        
        # Vectorize each chunk with metadata
        for chunk_idx, chunk in enumerate(chunks):
            embedding = embedding_model.embed_documents([chunk])[0]  # Get embedding
            
            # Metadata includes file identifier and chunk index for reference
            metadata = {
                "file_name": filename,
                "file_id": filename.replace("transcription_", "").replace(".txt", ""),
                "chunk_idx": chunk_idx,
            }
            all_vectorized_segments.append({
                "embedding": embedding,
                "text": chunk,
                "metadata": metadata
            })
        
        print(f"Vectorization complete for {filename}.")

    return all_vectorized_segments

# Execute the vectorization task
if __name__ == "__main__":
    vectorized_segments = vectorize_individual_transcriptions()
    print("All transcription files have been vectorized. Ready for storage in Pinecone.")


Vectorization complete for transcription_1.txt.
Vectorization complete for transcription_2.txt.
Vectorization complete for transcription_3.txt.
Vectorization complete for transcription_4.txt.
Vectorization complete for transcription_5.txt.
Vectorization complete for transcription_6.txt.
Vectorization complete for transcription_7.txt.
Vectorization complete for transcription_8.txt.
All transcription files have been vectorized. Ready for storage in Pinecone.


In [47]:
# STEP 4 - UPSERT embeddings to Pinecone

from tqdm.auto import tqdm

# Function to upsert vectorized segments to Pinecone
def upsert_embeddings_to_pinecone(vectorized_segments, index):
    for segment in tqdm(vectorized_segments, desc="Upserting segments to Pinecone"):
        # Prepare data for upsert
        segment_id = segment["metadata"]["file_id"] + "_" + str(segment["metadata"]["chunk_idx"])  # Unique ID for each chunk
        embedding = segment["embedding"]
        metadata = segment["metadata"]
        metadata["text"] = segment["text"]  # Adds the chunk's text content

        # Upsert the segment into Pinecone
        try:
            index.upsert(vectors=[(segment_id, embedding, metadata)])
        except Exception as e:
            print(f"Error upserting segment {segment_id}: {e}")

    print("All segments upserted into Pinecone.")

# Example usage:
upsert_embeddings_to_pinecone(vectorized_segments, index)


Upserting segments to Pinecone:   0%|          | 0/349 [00:00<?, ?it/s]

All segments upserted into Pinecone.


In [None]:
import os
import getpass


# Fetch the SerpAPI key securely
os.environ["SERPAPI_API_KEY"] = os.getenv("SERPAPI_API_KEY") or getpass.getpass("Enter SerpAPI key: ")

# Example usage in SerpAPI tool setup
serp_api_key = os.environ["SERPAPI_API_KEY"]

In [50]:
# DEFINE ROUTER, RETRIEVE from VectorstoreDB and Retrieve from Google search functions!

import json
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage, AIMessage
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
from serpapi import GoogleSearch
import os

# Initialize the embedding model and vector store
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=os.environ["OPENAI_API_KEY"])
chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])

# Pinecone index is already set up and embeddings are upserted
vector_store = Pinecone(index=index, embedding=embedding_model, text_key="text")

# Initialize the router instructions
router_instructions = """You are an expert at routing a user question to a vectorstore or web search.

The vectorstore contains documents related to Accenture company Quarterly results, Revenue, Earnings for the last four quarters, sales, digital projects, cloud projects and employee details.

Use the vectorstore for questions on these topics. For all else, and especially for current events, use web-search.

Return JSON with a single key, "datasource", that is either 'websearch' or 'vectorstore' depending on the question."""

# Function to route query based on instructions
def route_query(query):
    routing_response = chat_model.invoke(
        [SystemMessage(content=router_instructions), HumanMessage(content=query)]
    )
    route_decision = json.loads(routing_response.content)
    return route_decision["datasource"]

# Function to retrieve from vector database
def retrieve_from_vectorstore(query):
    retriever = vector_store.as_retriever()
    results = retriever.get_relevant_documents(query)
    document_text = "\n\n".join([doc.page_content for doc in results])

    # Summarize the retrieved documents using LLM 
    prompt = (
        "Based on the following documents, provide a concise answer to the question:\n\n"
        f"Question: {query}\n\n"
        "Documents:\n"
        f"{document_text}\n\n"
        "Answer:"
    )    
    answer_response = chat_model.invoke([HumanMessage(content=prompt)])
    return answer_response.content
    
# Function to retrieve from Google search
def web_search(query):
    search = GoogleSearch({
        "q": query,
        "api_key": serp_api_key, 
        "num": 3  # Limit to top 3 results for conciseness
    })
    results = search.get_dict()

    # Extract relevant text from search results
    combined_text = "\n".join(
        result.get("snippet", "No snippet available")
        for result in results.get("organic_results", [])
    )
    
    # Summarize results using the LLM
    prompt = (
        f"Summarize the following search results to answer the question:\n\n"
        f"Question: {query}\n\n"
        f"Results:\n{combined_text}\n\n"
        "Answer:"
    )
    summary_response = chat_model.invoke([HumanMessage(content=prompt)])
    return summary_response.content  

# Main function to handle the query routing and retrieval
def handle_query(query):
    datasource = route_query(query)
    
    if datasource == "vectorstore":
        print("Routing to Vectorstore...")
        return retrieve_from_vectorstore(query)
    elif datasource == "websearch":
        print("Routing to Web Search...")
        return web_search(query)
    else:
        return "Unknown data source. Please check the router configuration."

In [None]:
# DEFINE ALL THE TOOLS for ReACT agent! 

from langchain.agents import initialize_agent
from langchain.agents import Tool
from langchain.chat_models import ChatOpenAI
from serpapi import GoogleSearch
from langchain.memory import ConversationBufferMemory
import os

# Initialize memory
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


# Define the Web search tool using SerpAPI
web_search_tool = Tool(
    name="Google Search",
    func=web_search,
    description="Retrieve relevant information using google web search for general or current event questions."
)

# Define tool for vector store retrieval
vector_store_tool = Tool(
    name="VectorStore Retrieval",
    func=retrieve_from_vectorstore,
    description="Retrieve relevant documents from the vector store for questions about Accenture's earnings, revenue, sales, projects, and employee details.",
)

# Revised prompt for the agent with improved answer formatting
agent_prompt = """
You are an expert assistant specializing in Accenture's financial data and competitor analysis.

For each user query:
1. Answer strictly based on provided context or vector data.
2. Be concise and use up to three sentences.
3. If the answer is found, respond directly by starting with "Final Answer:" without additional steps or thoughts.

Now, respond to the question:
{user_query}
"""


# Define the ReAct agent with the custom prompt in each tool
react_agent = initialize_agent(
    tools=[vector_store_tool, web_search_tool],
    agent="zero-shot-react-description",
    llm=chat_model,
    memory=memory,  # Integrate conversation memory
    verbose=True,
    max_iterations=5,  # Restrict maximum depth to avoid infinite loops
    handle_parsing_errors=True  # Retry on output parsing errors
)

# Main function to handle the query using the ReAct agent with the custom prompt
def handle_query_with_agent(query):
    # Format the query with the agent's prompt
    formatted_prompt = agent_prompt.format(user_query=query)
    
    # Use the prompt in the agent invocation
    try:
        response = react_agent({"input": formatted_prompt})
        # If a valid answer is provided, return it
        if "output" in response:
            return response["output"]
        else:
            # Fallback message if no answer found within the max iterations
            return "I don't know. Unable to find the answer within the provided context."
    except Exception as e:
        # In case of unexpected errors, provide a helpful message
        return f"Error encountered: {str(e)}"

In [58]:
# INTERACTIVE CHAT FUNCTION using ReACT Agent - REFINED 3! Works perfect! TUESDAY

import json
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, AIMessage
from langchain.memory import ConversationBufferMemory

# Initialize chat model and memory
chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Function to clear memory
def reset_memory():
    memory.clear()
    print("Memory has been reset.")

# Interactive chat function with datasource-based source tagging
def chatbot_interaction():
    print("Welcome to the Accenture Q&A Chatbot! Ask any question about Accenture.")
    while True:
        user_query = input("You: ")
        if user_query.lower() in ["exit", "quit", "no"]:
            print("Agent: Thank you for using the Q&A chatbot. Goodbye!")
            break
        
        # Determine the datasource
        datasource = route_query(user_query)
        
        # Tag source based on datasource
        source_label = "VectorDB" if datasource == "vectorstore" else "Google Search" if datasource == "websearch" else "Unknown Source"        
        
        try:
            # Get response from the handle_query_with_agent function instead of calling react_agent directly
            answer = handle_query_with_agent(user_query)
            print(f"\nAgent (using {source_label}):\nQuestion: {user_query}\nAnswer: {answer}")
        
        except Exception as e:
            print(f"Error encountered: {e}")
            print("Agent: Unable to process the request. Please try again.")
        
        print("\nAgent: Do you have more questions?")

# Start the chatbot interaction
chatbot_interaction()

Welcome to the Accenture Q&A Chatbot! Ask any question about Accenture.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should retrieve the revenue information for Q3 2024 from the vector store.
Action: VectorStore Retrieval
Action Input: Q3 2024 revenue Accenture[0m
Observation: [36;1m[1;3mThe projected revenue for Accenture in Q3 2024 is expected to be in the range of 15.85 to 16.45 billion, with an estimated 2 to 5 percent growth in local currency compared to the previous year.[0m
Thought:[32;1m[1;3mI have found the revenue information for Q3 2024.
Final Answer: The projected revenue for Accenture in Q3 2024 is expected to be in the range of 15.85 to 16.45 billion.[0m

[1m> Finished chain.[0m

Agent (using VectorDB):
Question: revenue of Q3 2024?
Answer: The projected revenue for Accenture in Q3 2024 is expected to be in the range of 15.85 to 16.45 billion.

Agent: Do you have more questions?
Agent: Thank you for using the Q&A chatbot. Goodbye!


In [None]:
# GRADIO only with TEXT - REFINED! Works Perfect!

import json
import openai
import gradio as gr

def gradio_chatbot(user_query):
    # Determine the source using the routing function
    datasource = route_query(user_query)
    
    # Tag source based on the datasource returned
    source_label = "VectorDB" if datasource == "vectorstore" else "Google Search" if datasource == "websearch" else "Unknown Source"
    
    try:
        # Get response from the ReAct agent
        response = react_agent({"input": user_query})

        # Check if the response is not empty and parse the response JSON if possible
        if response and "output" in response:
            output_content = response["output"]
            
            # Attempt to parse JSON response if available
            try:
                parsed_response = json.loads(output_content) if isinstance(output_content, str) else output_content
                answer = parsed_response.get("answer", output_content) if isinstance(parsed_response, dict) else output_content
            except json.JSONDecodeError:
                answer = output_content  # Fallback to raw output if JSON parsing fails

            return f"Answer (from {source_label}):\n{answer}"

        else:
            return "Error: Received an empty or unexpected response format from the agent."

    except Exception as e:
        # Capture any other errors
        return f"Error encountered: {str(e)}"

# Create the Gradio Interface
iface = gr.Interface(
    fn=gradio_chatbot,  # Function to call
    inputs="text",      # Input type: text
    outputs="text",     # Output type: text
    title="Accenture Q&A Chatbot",
    description="Ask any question about Accenture and get answers with sourced information."
)

# Launch the Gradio app
iface.launch(share=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://05e3852bb6023b35a3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)






[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use VectorStore Retrieval to find this specific financial information.
Action: VectorStore Retrieval
Action Input: "Accenture EPS fourth quarter 2024"[0m
Observation: [36;1m[1;3mThe Accenture EPS for the fourth quarter of 2024 is not provided in the documents.[0m
Thought:[32;1m[1;3mI should try using Google Search to find this specific financial information.
Action: Google Search
Action Input: "Accenture EPS fourth quarter 2024"[0m
Observation: [33;1m[1;3mAccenture reported an adjusted EPS of $2.79 for the fourth quarter of fiscal 2024, which was a 3% increase over the previous quarter.[0m
Thought:[32;1m[1;3mI now have the final answer.
Final Answer: The EPS of Accenture in the fourth quarter of 2024 was $2.79.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use VectorStore Retrieval to find information about Accenture's earnings.
Action: VectorStore Re