In [1]:
!pip install pytube
!pip install openai-whisper
!pip install langchain
!pip install openai
!pip install python-dotenv

Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl.metadata (5.0 kB)
Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-15.0.0
[0mCollecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting numba (from openai-whisper)
  Downloading numba-0.60.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.

In [2]:
!pip install -U langchain-community
!pip install pinecone-client
!pip install --upgrade pytube
!pip install yt-dlp
!pip install imageio[ffmpeg]

Collecting langchain-community
  Downloading langchain_community-0.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-non

In [3]:
from torch import cuda, torch
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

In [4]:
from dotenv import load_dotenv, find_dotenv
import getpass
import os

# Load Environment Variable
_ = load_dotenv(find_dotenv())

# Set the environment variable directly in the script
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OpenAI API Key: ")

#OPENAI_API_KEY  = os.getenv('OPENAI_API_KEY')
# initialize connection to pinecone (get API key at app.pinecone.io)
#PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") # or "YOUR_API_KEY"

In [5]:
# Access the environment variable with os.environ
#print(os.environ["OPENAI_API_KEY"])

In [6]:
# Sunday 3-Nov
# STEP 1: Transcription and Consolidation Tasks (without agent to improve the process and reduce the time)

import os
import yt_dlp
from whisper import load_model
from langchain.agents import initialize_agent, Tool
from langchain.schema import HumanMessage
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI

# Default configurations
VIDEO_LINKS_FILE = "video_links.txt"
AUDIO_OUTPUT_TEMPLATE = "audio_{}"
WHISPER_MODEL = "base"
TRANSCRIPT_DIR = "transcriptions"

# Ensure directory exists for transcriptions
os.makedirs(TRANSCRIPT_DIR, exist_ok=True)

# Load YouTube links from the file
def load_video_links(filename=VIDEO_LINKS_FILE):
    with open(filename, 'r') as file:
        return [link.strip() for link in file.readlines() if link.strip()]

# Download audio from YouTube video
def download_audio(url, output_name):
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3', 'preferredquality': '192'}],
        'outtmpl': f"{output_name}.mp3",
        'ffmpeg_location': '/usr/bin/ffmpeg'
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return f"{output_name}.mp3"

# Transcribe audio using Whisper
def transcribe_audio_with_whisper(audio_path, model_name=WHISPER_MODEL):
    model = load_model(model_name)
    result = model.transcribe(audio_path)
    return result["text"]

# Process each video, saving individual transcriptions
def transcribe_all_videos():
    video_links = load_video_links(VIDEO_LINKS_FILE)
    all_transcriptions = []

    for i, video_url in enumerate(video_links):
        print(f"Processing video {i+1}/{len(video_links)}: {video_url}")
        audio_path = download_audio(video_url, AUDIO_OUTPUT_TEMPLATE.format(i))
        
        # Transcribe in chunks to prevent reprocessing, saving each transcription file
        transcription = transcribe_audio_with_whisper(audio_path)
        transcript_file = os.path.join(TRANSCRIPT_DIR, f"transcription_{i+1}.txt")
        
        with open(transcript_file, 'w') as f:
            f.write(transcription)
        
        all_transcriptions.append({"title": f"Video {i+1}", "transcription": transcription})
        print(f"Completed transcription for video {i+1}")

    return all_transcriptions

# Consolidate transcriptions for LLM analysis
def consolidate_transcriptions():
    consolidated_text = ""
    for file in sorted(os.listdir(TRANSCRIPT_DIR)):
        file_path = os.path.join(TRANSCRIPT_DIR, file)
        
        # Check if the file_path is a file (not a directory)
        if os.path.isfile(file_path):
            with open(file_path, 'r') as f:
                consolidated_text += f.read() + "\n"
    return consolidated_text


# Agent analysis task with consolidated data
""" def perform_analysis_task(consolidated_text):
    chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.getenv("OPENAI_API_KEY"))
    memory = ConversationBufferMemory()

    react_agent = initialize_agent(
        tools=[],
        agent="zero-shot-react-description",
        llm=chat_model,
        memory=memory,
        verbose=True
    )

    user_query = "Consolidate and analyze the transcriptions of the earnings podcasts."
    response = react_agent({"input": user_query, "transcriptions": consolidated_text})
    print("Agent's Final Output:", response["output"]) """

# Execute process
if __name__ == "__main__":
    transcribe_all_videos()
    consolidated_text = consolidate_transcriptions()
    """ perform_analysis_task(consolidated_text) """
    print("Consolidation Complete. Transcriptions are ready for vectorization and further processing.")

Processing video 1/4: https://www.youtube.com/watch?v=089JqnNmJq0
[youtube] Extracting URL: https://www.youtube.com/watch?v=089JqnNmJq0
[youtube] 089JqnNmJq0: Downloading webpage
[youtube] 089JqnNmJq0: Downloading ios player API JSON
[youtube] 089JqnNmJq0: Downloading mweb player API JSON
[youtube] 089JqnNmJq0: Downloading player 4e23410d
[youtube] 089JqnNmJq0: Downloading m3u8 information
[info] 089JqnNmJq0: Downloading 1 format(s): 251
[download] audio_0.mp3 has already been downloaded
[download] 100% of   83.94MiB
[ExtractAudio] Not converting audio audio_0.mp3; file is already in target format mp3


100%|███████████████████████████████████████| 139M/139M [00:05<00:00, 26.8MiB/s]


Completed transcription for video 1
Processing video 2/4: https://www.youtube.com/watch?v=IskUO7MoV2s
[youtube] Extracting URL: https://www.youtube.com/watch?v=IskUO7MoV2s
[youtube] IskUO7MoV2s: Downloading webpage
[youtube] IskUO7MoV2s: Downloading ios player API JSON
[youtube] IskUO7MoV2s: Downloading mweb player API JSON
[youtube] IskUO7MoV2s: Downloading m3u8 information
[info] IskUO7MoV2s: Downloading 1 format(s): 251
[download] audio_1.mp3 has already been downloaded
[download] 100% of   81.06MiB
[ExtractAudio] Not converting audio audio_1.mp3; file is already in target format mp3
Completed transcription for video 2
Processing video 3/4: https://www.youtube.com/watch?v=BbqwBMct8-0
[youtube] Extracting URL: https://www.youtube.com/watch?v=BbqwBMct8-0
[youtube] BbqwBMct8-0: Downloading webpage
[youtube] BbqwBMct8-0: Downloading ios player API JSON
[youtube] BbqwBMct8-0: Downloading mweb player API JSON
[youtube] BbqwBMct8-0: Downloading m3u8 information
[info] BbqwBMct8-0: Download

In [7]:
# Step 2: new with chunk before vectorization no ReAct Agent!

from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

# Split consolidated text into manageable chunks
def split_text_for_vectorization(consolidated_text, max_tokens=1500):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=max_tokens, chunk_overlap=200)
    return text_splitter.split_text(consolidated_text)

# Function to perform vectorization on each chunk
def perform_vectorization_task(consolidated_text):
    chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.getenv("OPENAI_API_KEY"))
    memory = ConversationBufferMemory()
    
    # Initialize the embedding model and the agent with vectorization tool
    embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")
    chunks = split_text_for_vectorization(consolidated_text)
    
    vectorized_segments = []
    for i, chunk in enumerate(chunks):
        # print(f"Vectorizing chunk {i+1}/{len(chunks)}")
        
        # Generate embeddings for this chunk
        embeddings = embedding_model.embed_documents([chunk])
        
        # Store each embedding
        vectorized_segments.append({
            "id": f"segment_{i}",
            "text": chunk,
            "embedding": embeddings[0]  # embedding_model.embed_documents returns a list
        })

    print("Vectorization complete. Ready for storage in a vector database.")
    return vectorized_segments

# Run vectorization task after consolidation
if __name__ == "__main__":
    # consolidated_text = consolidate_transcriptions()
    vectorized_segments = perform_vectorization_task(consolidated_text)


  chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.getenv("OPENAI_API_KEY"))
  memory = ConversationBufferMemory()
  embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")


Vectorization complete. Ready for storage in a vector database.


In [10]:
# STEP 3 - define Pinecone as Vector DB & Define Index & Embeddings.

from pinecone import Pinecone
import os
from getpass import getpass

# Securely fetch OpenAI and Pinecone API keys                    
os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY") or getpass("Enter Pinecone API key: ")
#os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY") or getpass("Enter OpenAI API key: ")

In [None]:
#print(os.environ["PINECONE_API_KEY"])

In [11]:
from pinecone import ServerlessSpec

spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

In [12]:
# STEP 4: Create INDEX using EMBEDDINGS OpenAI-ada-002 - creating the Index for 'Accenture Earnings'

import os
import time
from getpass import getpass
from tqdm.auto import tqdm
from langchain.embeddings import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec

# Securely fetch OpenAI and Pinecone API keys
#os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY") or getpass("Enter OpenAI API key: ")
#os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY") or getpass("Enter Pinecone API key: ")
#os.environ["PINECONE_ENV"] = os.getenv("PINECONE_ENV") or "us-west1-gcp"  # Example environment

# Initialize OpenAI embeddings model
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=os.environ["OPENAI_API_KEY"])

# Initialize Pinecone client with the updated API
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
#spec = ServerlessSpec(cloud="aws", region="us-west-1")  # Specify Pinecone environment
index_name = "transcription-accnt-earn-index-2"             # Create New Index

# Check if the index exists; create if it doesn’t
if index_name not in [index_info["name"] for index_info in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=1536,  # OpenAI embedding dimension
        metric="dotproduct",
        spec=spec
    )
    # Wait for the index to be ready
    while not pc.describe_index(index_name).status["ready"]:
        print("Waiting for index to be ready...")
        time.sleep(1)

# Connect to the index for upsertion
index = pc.Index(index_name)
time.sleep(1)
# view index stats
print("Index statistics:", index.describe_index_stats())

Index statistics: {'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [13]:
# STEP 5 - UPSERT embeddings to Pinecone in batches
batch_size = 50
for i in tqdm(range(0, len(vectorized_segments), batch_size)):
    batch = vectorized_segments[i:i + batch_size]
    ids = [segment["id"] for segment in batch]
    embeds = [segment["embedding"] for segment in batch]
    metadatas = [{"text": segment["text"]} for segment in batch]

    # Upsert each batch to Pinecone
    try:
        index.upsert(vectors=list(zip(ids, embeds, metadatas)))
    except Exception as e:
        print(f"Error upserting batch {i // batch_size + 1}: {e}")

print("All segments upserted into Pinecone.")


  0%|          | 0/4 [00:00<?, ?it/s]

All segments upserted into Pinecone.


In [14]:
print("Index statistics:", index.describe_index_stats())

Index statistics: {'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 174}},
 'total_vector_count': 174}


In [15]:
!pip install langchain-community

[0m

In [16]:
# Define ROUTER for query to be passed to VectorDB or Websearch (tavily)

import json
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage

# Initialize LLM in JSON mode (replace 'gpt-3.5-turbo' with any model configured for JSON output)
llm_json_mode = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])

# Define the routing instructions
router_instructions = """You are an expert at routing a user question to a vectorstore or web search.

The vectorstore contains documents related to Accenture company Earnings results, Revenue, Earnings for last four quarters, sales, digital (cloud and Genai) projects and employee details.

Use the vectorstore for questions on these topics. For all else, and especially for current events, use web-search.

Return JSON with a single key, "datasource", that is either 'websearch' or 'vectorstore' depending on the question."""

# Function to route based on the instructions
def route_query(query):
    routing_response = llm_json_mode.invoke(
        [SystemMessage(content=router_instructions), HumanMessage(content=query)]
    )
    route_decision = json.loads(routing_response.content)
    return route_decision["datasource"]

# Example queries to test the router
test_queries = [
    "What is the revenue of Accenture last quarter?",
    "What is the EPS for last quarter?",
    "What is the total no of employee count in Accenture company?",
    "What is the size of Genai project deals for Accenture this year?",
]

# Execute the router on test queries
for query in test_queries:
    datasource = route_query(query)
    print(f"Query: {query}\nRouted to: {datasource}\n")


Query: What is the revenue of Accenture last quarter?
Routed to: vectorstore

Query: What is the EPS for last quarter?
Routed to: vectorstore

Query: What is the total no of employee count in Accenture company?
Routed to: vectorstore

Query: What is the size of Genai project deals for Accenture this year?
Routed to: websearch



In [None]:
#!pip install tavily-python

Collecting tavily-python
  Downloading tavily_python-0.5.0-py3-none-any.whl.metadata (11 kB)
Downloading tavily_python-0.5.0-py3-none-any.whl (14 kB)
Installing collected packages: tavily-python
Successfully installed tavily-python-0.5.0
[0m

In [18]:
import os
import getpass

os.environ["TAVILY_API_KEY"] = os.getenv("TAVILY_API_KEY") or getpass.getpass("Enter Tavily API key: ")

In [19]:
import json
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.tools.tavily_search import TavilySearchResults

import os

# Initialize the embedding model and vector store
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=os.environ["OPENAI_API_KEY"])
chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])
web_search_tool = TavilySearchResults(k=3)

# Assuming Pinecone index is already set up and embeddings are upserted
vector_store = Pinecone(index=index, embedding=embedding_model, text_key="text")

# Initialize the router instructions
router_instructions = """You are an expert at routing a user question to a vectorstore or web search.

The vectorstore contains documents related to Accenture company Earnings results, Revenue, Earnings for last four quarters, sales, digital (cloud and Genai) projects and employee details.

Use the vectorstore for questions on these topics. For all else, and especially for current events, use web-search.

Return JSON with a single key, "datasource", that is either 'websearch' or 'vectorstore' depending on the question."""

# Function to route query based on instructions
def route_query(query):
    routing_response = chat_model.invoke(
        [SystemMessage(content=router_instructions), HumanMessage(content=query)]
    )
    route_decision = json.loads(routing_response.content)
    return route_decision["datasource"]

# Function to retrieve from vector database
def retrieve_from_vectorstore(query):
    retriever = vector_store.as_retriever()
    results = retriever.get_relevant_documents(query)
    document_text = "\n\n".join([doc.page_content for doc in results])

    # Summarize the retrieved documents using LLM 
    prompt = (
        "Based on the following documents, provide a concise answer to the question:\n\n"
        f"Question: {query}\n\n"
        "Documents:\n"
        f"{document_text}\n\n"
        "Answer:"
    )
    
    answer_response = chat_model.invoke([HumanMessage(content=prompt)])
    return answer_response.content
    # return "\n\n".join([doc.page_content for doc in results]) # can be removed if this modules functions right!

# Function to perform web search (e.g., using Tavily)
def web_search(query):
    # Fetch results using Tavily
    results = web_search_tool.run(query)
    print("Results:", results)  # Inspect the structure
    combined_text = "\n".join(result.get('text', 'No text available') for result in results if 'text' in result)

    # Summarize results if multiple documents are returned
    prompt = (
        f"Summarize the following web search results to answer the question:\n\n"
        f"Question: {query}\n\n"
        f"Results:\n{combined_text}\n\n"
        "Answer:"
    )
    summary_response = chat_model.invoke([HumanMessage(content=prompt)])
    return summary_response.content
    

# Main function to handle the query routing and retrieval
def handle_query(query):
    datasource = route_query(query)
    
    if datasource == "vectorstore":
        print("Routing to Vectorstore...")
        return retrieve_from_vectorstore(query)
    elif datasource == "websearch":
        print("Routing to Web Search...")
        return web_search(query)
    else:
        return "Unknown data source. Please check the router configuration."

# Example usage
query = "What is the revenue of Accenture last quarter??"
response = handle_query(query)
print("Response:", response)

query = "What is the EPS of Accenture last quarter?"
response = handle_query(query)
print("Response:", response)

query = "Who is the main competitor of Accenture?"
response = handle_query(query)
print("Response:", response)


  vector_store = Pinecone(index=index, embedding=embedding_model, text_key="text")


Routing to Vectorstore...


  results = retriever.get_relevant_documents(query)


Response: The revenue of Accenture last quarter was $16 billion.
Routing to Vectorstore...
Response: The EPS of Accenture last quarter was $2.77.
Routing to Web Search...
Results: [{'url': 'https://businesschronicler.com/competitors/accenture-competitors-analysis/', 'content': "List of Accenture's Main Competitors. PwC; Deloitte; EY (Ernst & Young) KPMG; Accenture Business Strategy. Accenture's business strategy focuses on delivering 360-degree value for clients by embracing technological innovations to deliver the best consulting service possible across 19 industries.. The company was founded in 1989 when it was called Andersen Consulting."}, {'url': 'https://www.investopedia.com/articles/markets/101615/who-are-accentures-main-competitors.asp', 'content': 'While the company does not divulge the names of its clients, it claims to serve roughly 90 of the top 100 global corporations and more than 80 of the 100 largest U.S.-based companies.\n Accenture got its start in the 1950s as the bu

In [None]:
# STEP 6: SETUP Conversational Agent using Langchain using ConversationalRetrievalChain, QA Tool, LLM (gpt 3.5-turbo) and ReAct Agent

from langchain.agents import initialize_agent, Tool
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
#from langchain.retrievers import VectorStoreRetriever
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate
from langchain.schema import HumanMessage
import json
from langchain.agents import initialize_agent, Tool
from getpass import getpass
import pinecone

# Initialize the OpenAI embedding model and Pinecone vector store
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=os.environ["OPENAI_API_KEY"])
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])

# Wrap Pinecone index with LangChain's Pinecone vector store
vector_store = Pinecone(index=index, embedding=embedding_model, text_key="text")

# Define the response schema for structured output
response_schema = [
    ResponseSchema(name="answer", description="Main response text in a complete paragraph")
]
output_parser = StructuredOutputParser.from_response_schemas(response_schema)

# Prompt template for ensuring JSON-formatted responses
response_template = PromptTemplate(
    template=(
        "Answer the question below in a complete paragraph and respond in *strict JSON* format:\n\n"
        "{format_instructions}\n\n"
        "Question: {question}"
    ),
    input_variables=["question"],
    partial_variables={"format_instructions": output_parser.get_format_instructions()}
)

# Define the retrieval tool for the ReAct agent
def retrieval_tool(query):
    retriever = vector_store.as_retriever()
    results = retriever.get_relevant_documents(query)
    return "\n\n".join([doc.page_content for doc in results])

# Define the tool for the ReAct agent
qa_tool = Tool(
    name="Answer Questions",
    func=lambda query: chat_model.invoke([HumanMessage(content=response_template.format(question=query))]),
    description="Use this tool to answer questions based on the retrieved documents."
)

# Initialize ReAct agent with the QA tool
react_agent = initialize_agent(
    tools=[qa_tool],
    agent="zero-shot-react-description",
    llm=chat_model,
    memory=memory,
    verbose=True,
    handle_parsing_errors=True
)

# Interactive chat function with improved parsing logic
def interactive_chat():
    print("Welcome to the Q&A Chatbot! Ask me anything.")
    while True:
        user_query = input("You: ")
        if user_query.lower() == "no":
            print("Agent: Thank you for using the Q&A chatbot. Goodbye!")
            break
        
        # Generate response with JSON format instructions
        response = react_agent({"input": user_query})
        
        # Attempt to parse and format the response as JSON
        try:
            parsed_response = output_parser.parse(response["output"])
            print("Agent's Response:\n", parsed_response["answer"])
        except Exception:
            print("Error parsing response, displaying raw answer instead.")
            print("Raw Response:\n", response["output"])
        
        print("\nAgent: Do you have more questions?")

# Start the interactive chat
interactive_chat()

Welcome to the Q&A Chatbot! Ask me anything.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use the Answer Questions tool to find the total employee count in Accenture.
Action: Answer Questions
Action Input: total employee count in Accenture[0m
Observation: [36;1m[1;3mcontent='```json\n{\n\t"answer": "As of the latest data available, the total employee count in Accenture is approximately 624,000 worldwide. Accenture is a global professional services company that operates in more than 200 cities across 50 countries, offering a wide range of services in consulting, technology, and outsourcing."\n}\n```' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 68, 'prompt_tokens': 81, 'total_tokens': 149, 'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': 

In [None]:
# Monday - Finetune Retrieval and QA and Chatbot!

# Check Next code, separated for finetuning.


In [None]:
# DON'T WORK! Error! 2: Define the ReAct agent for vectorization as a tool

""" from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.agents import initialize_agent, Tool
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI

# Function to perform vectorization
def vectorize_text(consolidated_text):
    embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=400)
    segments = text_splitter.split_text(consolidated_text)

    vectorized_segments = []
    embeddings = embedding_model.embed_documents(segments)

    for i, (segment, vector) in enumerate(zip(segments, embeddings)):
        vectorized_segments.append({
            "id": f"segment_{i}",
            "text": segment,
            "embedding": vector
        })

    return vectorized_segments

# Define the vectorization tool
vectorization_tool = Tool(
    name="VectorizeText",
    func=vectorize_text,
    description="Segments and vectorizes consolidated transcriptions for storage in a vector database."
)

# Initialize the agent with the vectorization tool
def perform_vectorization_task(consolidated_text):
    chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.getenv("OPENAI_API_KEY"))
    memory = ConversationBufferMemory()

    react_agent = initialize_agent(
        tools=[vectorization_tool],
        agent="zero-shot-react-description",
        llm=chat_model,
        memory=memory,
        verbose=True
    )

    # User query for vectorization
    user_query = "Vectorize the consolidated transcriptions for storage in a vector database."
    input_content = {"input": f"{user_query}\n\n{consolidated_text}"}               # single input to the React Agent
    response = react_agent(input_content)
    #response = react_agent({"input": user_query, "consolidated_text": consolidated_text})
    print("Agent's Vectorization Output:", response["output"])

# Run vectorization task after consolidation
if __name__ == "__main__":
    # transcribe_all_videos() # already part of Step1 above
    consolidated_text = consolidate_transcriptions()
    perform_vectorization_task(consolidated_text)
 """



[1m> Entering new AgentExecutor chain...[0m


BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 47049 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

In [None]:
# New Code 11PM SATURDAY # THIS CODE worked using ReAct agent for Transcription, but went in Infinite LOOOOP

""" import os
import yt_dlp
from whisper import load_model
from langchain.agents import initialize_agent, Tool
from langchain.schema import HumanMessage
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

# Default configurations
VIDEO_LINKS_FILE = "video_links.txt"  # File with YouTube links
AUDIO_OUTPUT_TEMPLATE = "audio_{}"  # Template for audio file names without .mp3 extension
WHISPER_MODEL = "base"  # Whisper model for transcription

# Load YouTube links from the file
def load_video_links(filename=VIDEO_LINKS_FILE):
    with open(filename, 'r') as file:
        video_links = [link.strip() for link in file.readlines() if link.strip()]
    return video_links

# Download audio from YouTube video
def download_audio(url, output_name):
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [
            {
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }
        ],
        'outtmpl': output_name,  # Filename with .mp3 extension
        'ffmpeg_location': '/usr/bin/ffmpeg'
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return f"{output_name}.mp3"

# Transcribe audio using Whisper
def transcribe_audio_with_whisper(audio_path, model_name=WHISPER_MODEL):
    model = load_model(model_name)
    result = model.transcribe(audio_path)
    return result["text"]

# ReAct Tool for downloading and transcribing all videos
# Split transcriptions into chunks below the token limit - Process in Batches!
def chunk_text(text, max_length=1500):
    words = text.split()
    chunks = []
    chunk = []
    for word in words:
        chunk.append(word)
        if len(" ".join(chunk)) >= max_length:
            chunks.append(" ".join(chunk))
            chunk = []
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks

# Process each transcription in chunks
def transcribe_all_videos(input_data=None): # Accept an optional argument
    video_links = load_video_links(VIDEO_LINKS_FILE)
    all_transcriptions = []
    
    for i, video_url in enumerate(video_links):
        print(f"Processing video {i+1}/{len(video_links)}: {video_url}")
        audio_path = download_audio(video_url, AUDIO_OUTPUT_TEMPLATE.format(i))
        full_transcription = transcribe_audio_with_whisper(audio_path)
        transcription_chunks = chunk_text(full_transcription)  # Divide transcription
        
        for j, chunk in enumerate(transcription_chunks):
            # print(f"Processing chunk {j+1} of video {i+1}")
            # Send chunk to the ReAct agent
            # response = react_agent({"input": chunk}) /* Don't call Agent each time */
            all_transcriptions.append({"title": f"Video {i+1} - Chunk {j+1}", "transcription": chunk})
            print(f"Completed chunk {j+1} for video {i+1}")
        
    return all_transcriptions


def transcribe_all_videos(input_data=None):  # Accept an optional argument
    video_links = load_video_links(VIDEO_LINKS_FILE)
    all_transcriptions = []
    for i, video_url in enumerate(video_links):
        print(f"Processing video {i+1}/{len(video_links)}: {video_url}")
        audio_path = download_audio(video_url, AUDIO_OUTPUT_TEMPLATE.format(i))
        transcription = transcribe_audio_with_whisper(audio_path)
        all_transcriptions.append({"title": f"Video {i+1}", "transcription": transcription})
        print(f"Completed transcription for video {i+1}")
    return all_transcriptions

# Define structured output for better organization
response_schema = [
    ResponseSchema(name="transcription", description="Transcribed text from each video in structured format.")
]
output_parser = StructuredOutputParser.from_response_schemas(response_schema)

# Define tool for the agent
transcription_tool = Tool(
    name="TranscribeAllVideos",
    func=transcribe_all_videos,
    description="Transcribes all videos in the provided list and consolidates results."
)

# Initialize the ReAct agent with transcription tool
chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])
memory = ConversationBufferMemory()

react_agent = initialize_agent(
    tools=[transcription_tool],
    agent="zero-shot-react-description",
    llm=chat_model,
    memory=memory,
    verbose=True
)

# Run agent to perform transcription
def perform_transcription_task():
    user_query = "Transcribe all earnings podcasts from the provided YouTube links."
    transcriptions = transcribe_all_videos()  # Get transcriptions directly
    response = react_agent({"input": user_query, "transcriptions": transcriptions})  # Send entire list to agent once
    print("Agent's Transcription:", response["output"])

def perform_transcription_task():
    user_query = "Transcribe all earnings podcasts from the provided YouTube links."
    response = react_agent({"input": user_query})
    print("Agent's Transcription:", response["output"])

# Start the transcription task
perform_transcription_task()
 """

  chat_model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])
  memory = ConversationBufferMemory()
  react_agent = initialize_agent(


Processing video 1/4: https://www.youtube.com/watch?v=089JqnNmJq0
[youtube] Extracting URL: https://www.youtube.com/watch?v=089JqnNmJq0
[youtube] 089JqnNmJq0: Downloading webpage
[youtube] 089JqnNmJq0: Downloading ios player API JSON
[youtube] 089JqnNmJq0: Downloading mweb player API JSON
[youtube] 089JqnNmJq0: Downloading player 4e23410d
[youtube] 089JqnNmJq0: Downloading m3u8 information
[info] 089JqnNmJq0: Downloading 1 format(s): 251
[download] Destination: audio_0
[download] 100% of   49.02MiB in 00:00:01 at 43.39MiB/s    
[ExtractAudio] Destination: audio_0.mp3
Deleting original file audio_0 (pass -k to keep)


100%|████████████████████████████████████████| 139M/139M [00:00<00:00, 181MiB/s]


Processing chunk 1 of video 1
Completed chunk 1 for video 1
Processing chunk 2 of video 1
Completed chunk 2 for video 1
Processing chunk 3 of video 1
Completed chunk 3 for video 1
Processing chunk 4 of video 1
Completed chunk 4 for video 1
Processing chunk 5 of video 1
Completed chunk 5 for video 1
Processing chunk 6 of video 1
Completed chunk 6 for video 1
Processing chunk 7 of video 1
Completed chunk 7 for video 1
Processing chunk 8 of video 1
Completed chunk 8 for video 1
Processing chunk 9 of video 1
Completed chunk 9 for video 1
Processing chunk 10 of video 1
Completed chunk 10 for video 1
Processing chunk 11 of video 1
Completed chunk 11 for video 1
Processing chunk 12 of video 1
Completed chunk 12 for video 1
Processing chunk 13 of video 1
Completed chunk 13 for video 1
Processing chunk 14 of video 1
Completed chunk 14 for video 1
Processing chunk 15 of video 1
Completed chunk 15 for video 1
Processing chunk 16 of video 1
Completed chunk 16 for video 1
Processing chunk 17 of vid

  response = react_agent({"input": user_query, "transcriptions": transcriptions})  # Send entire list to agent once


[32;1m[1;3mI should transcribe all the earnings podcasts from the YouTube links provided to consolidate the results.
Action: TranscribeAllVideos
Action Input: List of YouTube links to earnings podcasts[0mProcessing video 1/4: https://www.youtube.com/watch?v=089JqnNmJq0
[youtube] Extracting URL: https://www.youtube.com/watch?v=089JqnNmJq0
[youtube] 089JqnNmJq0: Downloading webpage
[youtube] 089JqnNmJq0: Downloading ios player API JSON
[youtube] 089JqnNmJq0: Downloading mweb player API JSON
[youtube] 089JqnNmJq0: Downloading m3u8 information
[info] 089JqnNmJq0: Downloading 1 format(s): 251
[download] Destination: audio_0
[download] 100% of   49.02MiB in 00:00:01 at 43.23MiB/s    
[ExtractAudio] Destination: audio_0.mp3
Deleting original file audio_0 (pass -k to keep)
Processing chunk 1 of video 1
Completed chunk 1 for video 1
Processing chunk 2 of video 1
Completed chunk 2 for video 1
Processing chunk 3 of video 1
Completed chunk 3 for video 1
Processing chunk 4 of video 1
Completed c

KeyboardInterrupt: 