In [1]:
import os
from typing import Any

import pymongo
from dotenv import load_dotenv

from langchain.prompts import PromptTemplate
# from langchain.output_parsers import PydanticOutputParser
from langchain_community.chat_models import ChatAnthropic
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser, PydanticOutputParser
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI, OpenAI

from utils.preprocess_transcription import remove_ads, identify_host, insert_marker_before_host
from utils.extract_comp_guest import re_extract_comp_guest
# Read variables from a local .env file into the process environment; the
# MongoDB URI and the API keys used below are looked up with os.getenv.
load_dotenv()

# Switch on LangChain's V2 tracing for the runs in this notebook.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'

In [2]:
# MongoDB connection and collection handles shared by every cell below.
# Fail fast when the connection string is missing: MongoClient(None) would
# otherwise fall back to its default host and the vector search would fail
# later with a far less obvious error.
MONGODB_URI = os.getenv('mdb_uri')
if not MONGODB_URI:
    raise RuntimeError(
        "Environment variable 'mdb_uri' is not set; add it to .env before running this notebook."
    )

client = pymongo.MongoClient(MONGODB_URI)
DB_NAME = "hibt_prod_db"
COLLECTION_NAME = "hibt_prod_collection"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"
# Transcript segments with their embeddings (queried via $vectorSearch).
MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]
# Separate collection in the same database for answers.
MONGODB_ANSWER_COLLECTION = client[DB_NAME]["hibt_answer_collection"]

In [3]:
# Prompt used to grade whether a retrieved transcript segment contains a
# host question / guest answer pair matching the question of interest.
# {transcript} and {question} are template variables; the literal braces of
# the JSON example are doubled so the template formatter leaves them alone.
doc_grading_prompt = """ You are evaluating a podcast transcript segment to determine if it contains a discussion relevant to a specific question. Your task is to assess whether the segment includes both a question from the host and an answer from a guest that relate closely to the provided relevant question.

Transcript segment:
{transcript}

Relevant question to compare:
{question}

Your job is to decide if the segment discusses a question and answer similar to the relevant question provided. You need to give a binary score indicating the relevance.

- If the segment contains a question and answer that closely match the relevant question, grade it as 'yes'.
- If the segment does not contain a question and answer that are relevant, grade it as 'no'.

Provide your assessment in the following JSON format, with only the 'score' key and your binary decision as the value. No preamble or explanation needed.

{{"score": "yes"}} or {{"score": "no"}}

"""

In [4]:
# Embedding model used to turn a question into the query vector for $vectorSearch.
# NOTE(review): presumably the same model that produced the stored "embedding" field — confirm.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [5]:
# Manual check of the grading chain against one company's segments.
# Backends previously tried in this cell (left disabled): ChatOpenAI /
# OpenAI "gpt-3.5-turbo-0125" in JSON mode and ChatGoogleGenerativeAI "gemini-pro".
llm = ChatAnthropic(model="claude-instant-1.2")
prompt = PromptTemplate(
    input_variables=["transcript", "question"],
    template=doc_grading_prompt,
)
# Prompt -> model -> JSON parser, so each invocation yields a parsed object.
chain = prompt | llm | JsonOutputParser()

question = "How much of your success do you think is because of luck and, and how much because of Either a hardworking skill."

# Top-3 nearest segments to the question, restricted to a single company.
vector_search_stage = {
    "$vectorSearch": {
        "index": ATLAS_VECTOR_SEARCH_INDEX_NAME,
        "path": "embedding",
        "queryVector": embeddings.embed_query(question),
        "numCandidates": 100,
        "limit": 3,
        "filter": {"company": "ActOne Group"},
    }
}
results = MONGODB_COLLECTION.aggregate([vector_search_stage])

# Show the grade, the graded segment, and a separator for each hit.
for doc in results:
    score = chain.invoke({"question": question, "transcript": doc["transcript"]})
    print(score, doc["transcript"], "-------------------" * 10, sep="\n")


{'score': 'yes'}

0 (7m 25s):
When you were, when you were little, did you, would you dream about what you would be when you grew up or did you already have big ambitions as a kid or, or were they kind of like, yeah. You know, I'll, I'll see what happens

1 (7m 40s):
Now. See, print media was really the thing that taught me so much and books, print, period. My mom bought encyclopedias and then she paid them off over time. She also had periodicals that came to our house. There was a magazine called sippy. There was Ebony was popular. Jet magazine was popular back then. And so I knew a lot about the world and I knew a lot about what black people were doing. And so I had role models who, a fulfilled thoughts about what I might do when I grew up. But as a very young child, I remember admiring my mom so much, which I continue to do to this day.

1 (8m 21s):
She, even then I could sense, even though I couldn't put label too, it was quite an efficient person. And she ran our home like a busin

## Graph State

In [6]:
from typing import Annotated, Dict, TypedDict

from langchain_core.messages import BaseMessage

class GraphState(TypedDict):
    """
    Represents the state of our graph.

    Attributes:
        keys: A dictionary of named values passed between nodes. The nodes in
            this notebook read and write entries such as "question",
            "company", "documents", "research" and "generation".
    """
    # `Any` (the typing construct), not the builtin function `any`.
    keys: Dict[str, Any]
    

In [7]:
def retrieve(state):
    """
    Retrieve the transcript segments closest to the question for one company.

    Args:
        state (dict): The state of the graph. state["keys"] must contain
            "question" and "company".

    Returns:
        state (dict): The incoming keys with "documents" set to the list of
        retrieved documents. Existing keys (notably "company") are carried
        forward so the node can run again after transform_query.

    Raises:
        KeyError: If "question" or "company" is missing from state["keys"].
    """
    print("---RETRIEVE---")
    state_dict = state["keys"]
    question = state_dict["question"]
    company = state_dict["company"]

    pipeline = [
        {"$vectorSearch": {
            "index": ATLAS_VECTOR_SEARCH_INDEX_NAME,
            "path": "embedding",
            "queryVector": embeddings.embed_query(question),
            "numCandidates": 100,
            "limit": 3,
            "filter": {"company": str(company)}
        }}
    ]
    # aggregate() returns a cursor that can only be consumed once; store a
    # list so later nodes can iterate the documents as often as they need.
    documents = list(MONGODB_COLLECTION.aggregate(pipeline))

    # Spread the previous keys first: returning only documents/question would
    # discard "company" and break the next retrieve after a query rewrite.
    return {"keys": {**state_dict, "documents": documents, "question": question}}


In [8]:
from pydantic import BaseModel, Field

# Output schema for the generate step; handed to PydanticOutputParser, which
# derives the format instructions inserted into the generation prompt.
# Documented with comments rather than a docstring so the schema is unchanged.
class QuestionAnswer(BaseModel):
    # Disabled field: the host's question is intentionally not requested back.
    # question: str = Field(description="Question that the host asked the guest")
    # The only field the model is asked to return.
    answer: str = Field(description="The guest's answer to the host's question")


def generate(state):
    """
    Extract the guest's answer to the question from the graded documents.

    Args:
        state (dict): The state of the graph. state["keys"] must contain
            "documents" (an iterable of dicts with a "transcript" field) and
            "question".

    Returns:
        state (dict): Keys "documents", "question" and "generation", where
        "generation" is a QuestionAnswer produced by the output parser.

    Raises:
        KeyError: If "documents" or "question" is missing from state["keys"],
            or a document has no "transcript" field.
    """
    print("---GENERATE---")
    state_dict = state["keys"]
    documents = state_dict["documents"]
    question = state_dict["question"]

    ### everything below this is a placeholder right now, need to come back to this after the grading process

    template = """
    You are tasked with analyzing podcast transcripts to extract discussions that are specifically related to a question posed by the host. Here is the question of interest:

    {question}

    Your objective is to find and extract the question that the host asked and the guest's answer that directly responds to this question. 

    Guidelines for the task:
    - Identify the guest's answer that pertains to the question above. The relevant answer is typically located in the same or subsequent paragraphs following the question.
    - Correct common transcription misinterpretations as you extract the guest's response.
    - Exclude the original question from your output. Provide only the guest's answer, ensuring it is word-for-word, with necessary corrections for transcription errors.
    - If a relevant question and answer pair is not found within the provided transcript segment, simply state "Question/Answer not found."

    Please concentrate on extracting the answer from the provided transcript segment below, with attention to detail and accuracy:

    {transcript}

    Respond in the following JSON format.

    {response_template}

    """
    parser = PydanticOutputParser(pydantic_object=QuestionAnswer)

    prompt = PromptTemplate(
        template=template,
        input_variables=["transcript", "question"],
        partial_variables={"response_template": parser.get_format_instructions()},
    )

    # Send only the transcript text to the model. Passing the raw documents
    # would render the whole Mongo records into the prompt instead.
    transcript = "\n\n".join(doc["transcript"] for doc in documents)
    chain_inputs = {"transcript": transcript, "question": question}

    # Primary model: Mixtral served through Perplexity's OpenAI-compatible endpoint.
    llm = ChatOpenAI(api_key=os.getenv("PPLX_API_KEY"), base_url="https://api.perplexity.ai", model='mixtral-8x7b-instruct')
    generation = (prompt | llm | parser).invoke(chain_inputs)

    # The parser returns a QuestionAnswer, so the sentinel has to be compared
    # against its `answer` field; comparing the object itself never matches.
    if generation.answer.strip() == "Question/Answer not found.":
        # Second attempt with a different model, parsed the same way so that
        # "generation" has one consistent type for callers.
        fallback_llm = ChatAnthropic(model="claude-instant-1.2")
        generation = (prompt | fallback_llm | parser).invoke(chain_inputs)

    return {
        "keys": {"documents": documents, "question": question, "generation": generation}
    }


In [9]:
def grade_documents(state):
    """
    Determines whether the retrieved documents are relevant to the question.

    Args:
        state (dict): The state of the graph. state["keys"] must contain
            "question" and "documents" (an iterable of dicts with a
            "transcript" field).

    Returns:
        state (dict): The incoming keys with "documents" replaced by the
        documents graded relevant and "research" set to "Yes" when none were
        relevant, otherwise "No". Other keys (e.g. "company") are carried
        forward unchanged.

    Raises:
        KeyError: If "question" or "documents" is missing from state["keys"],
            or a document has no "transcript" field.
    """
    print("---CHECK RELEVANCE---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]

    # Braces of the JSON example are doubled so the template formatter does
    # not treat them as input variables.
    doc_grading_prompt = """ You are evaluating a podcast transcript segment to determine if it contains a discussion relevant to a specific question. Your task is to assess whether the segment includes both a question from the host and an answer from a guest that relate closely to the provided relevant question.
    Transcript segment:
    {transcript}
    Relevant question to compare:
    {question}
    Your job is to decide if the segment discusses a question and answer similar to the relevant question provided. You need to give a binary score indicating the relevance.
    - If the segment contains a question and answer that closely match the relevant question, grade it as 'yes'.
    - If the segment does not contain a question and answer that are relevant, grade it as 'no'.
    Provide your assessment in the following JSON format, with only the 'score' key and your binary decision as the value. No preamble or explanation needed.
    {{"score": "yes"}} or {{"score": "no"}}
    """

    prompt = PromptTemplate(
        template=doc_grading_prompt,
        input_variables=["transcript", "question"]
    )
    llm = ChatAnthropic(model="claude-instant-1.2")
    chain = prompt | llm | JsonOutputParser()

    relevant_documents = []
    for doc in documents:
        score = chain.invoke({"question": question, "transcript": doc["transcript"]})
        # Be tolerant of the model's formatting: accept "Yes"/" yes " and treat
        # a missing key or a non-object reply as "not relevant" instead of crashing.
        verdict = score.get("score", "") if isinstance(score, dict) else ""
        if str(verdict).strip().lower() == "yes":
            relevant_documents.append(doc)
            print("---GRADE: DOCUMENT RELEVANT---")
        else:
            print("---GRADE: DOCUMENT NOT RELEVANT---")

    research = "No"
    if not relevant_documents:
        research = "Yes"
        print("---GRADE: NO DOCUMENTS RELEVANT---")

    # Spread the previous keys first so "company" survives for a later retrieve.
    return {"keys": {**state_dict,
                     "documents": relevant_documents,
                     "question": question,
                     "research": research
                     }
            }

In [10]:
def transform_query(state):
    """
    Transform the query to produce a better question.

    Args:
        state (dict): The current graph state. state["keys"] must contain
            "question" and "documents".

    Returns:
        state (dict): The incoming keys with "question" replaced by the
        re-phrased question. Other keys (e.g. "company") are carried forward,
        because the graph routes from this node back to retrieve.

    Raises:
        KeyError: If "question" or "documents" is missing from state["keys"].
    """

    print("---TRANSFORM QUERY---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]

    # Create a prompt template with format instructions and the query
    prompt = PromptTemplate(
        template="""You are generating questions that is well optimized for retrieval. \n
        Look at the input and try to reason about the underlying sematic intent / meaning. \n
        Here is the initial question:
        \n ------- \n
        {question}
        \n ------- \n
        Formulate an improved question: """,
        input_variables=["question"],
    )

    # LLM used for the rewrite
    llm = ChatMistralAI(
        mistral_api_key=os.getenv('MISTRAL_API_KEY'), temperature=0, model="mistral-medium"
        )

    # Prompt -> model -> plain string
    chain = prompt | llm | StrOutputParser()
    better_question = chain.invoke({"question": question})

    # Spread the previous keys first: returning only documents/question would
    # drop "company", which retrieve requires on the next pass.
    return {
        "keys": {**state_dict, "documents": documents, "question": better_question}
    }

In [11]:
def decide_to_generate(state):
    """
    Determines whether to generate an answer or re-phrase the question and retrieve again.

    Args:
        state (dict): The current graph state. state["keys"] must contain
            "research", which is "Yes" when no document was graded relevant.

    Returns:
        str: Next node to call, either "transform_query" or "generate".

    Raises:
        KeyError: If "research" is missing from state["keys"].
    """

    print("---DECIDE TO GENERATE---")
    # Only the "research" flag drives the routing; the question and documents
    # are not needed here, so they are no longer looked up.
    research = state["keys"]["research"]

    if research == "Yes":
        # Every document was filtered out by the relevance check,
        # so re-generate the query and retrieve again.
        print("---DECISION: TRANSFORM QUERY and RESEARCH---")
        return "transform_query"

    # We have relevant documents, so generate answer
    print("---DECISION: GENERATE---")
    return "generate"

## Build Graph

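This wires together the nodes defined above: `retrieve` → `grade_documents` → `generate`. When no retrieved segment is graded relevant, `transform_query` rewrites the question and control loops back to `retrieve`.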

In [12]:
import pprint

from langgraph.graph import END, StateGraph

# Assemble the workflow over the shared GraphState.
workflow = StateGraph(GraphState)

# Register every node under the name the edges below refer to.
# (A "web_search" node was sketched at one point but is not part of the graph.)
for node_name, node_fn in (
    ("retrieve", retrieve),
    ("grade_documents", grade_documents),
    ("generate", generate),
    ("transform_query", transform_query),
):
    workflow.add_node(node_name, node_fn)

# Wiring: retrieval feeds grading, and the grader's verdict picks the next node.
workflow.set_entry_point("retrieve")
workflow.add_edge("retrieve", "grade_documents")

# Maps each value returned by decide_to_generate to the node to run next.
next_node_by_decision = {
    "transform_query": "transform_query",
    "generate": "generate",
}
workflow.add_conditional_edges("grade_documents", decide_to_generate, next_node_by_decision)

# A rewritten question goes back through retrieval; generation ends the run.
workflow.add_edge("transform_query", "retrieve")
workflow.add_edge("generate", END)

# Compile into a runnable app
app = workflow.compile()

In [13]:
# # Run
# inputs = {
#     "keys": {
#         "company": "ActOne Group",
#         "question": "In reflecting on your path to success, how do you view the contributions of diligence, intellect, and fortuitous events? Are there particular moments or decisions that highlight the importance of these elements?"
#     }
# }
# for output in app.stream(inputs):
#     for key, value in output.items():
#         # Node
#         pprint.pprint(f"Node '{key}':")
#         # Optional: print full state at each node
#         # pprint.pprint(value["keys"], indent=2, width=80, depth=None)
#     pprint.pprint("\n---\n")

# # Final generation
# pprint.pprint(value["keys"]["generation"])

In [15]:
# Initial graph state: the company to filter on and the question to look for.
initial_keys = {
    "company": "Airbnb",
    "question": "In reflecting on your path to success, how do you view the contributions of diligence, intellect, and fortuitous events? Are there particular moments or decisions that highlight the importance of these elements?",
}
inputs = {"keys": initial_keys}

# Run the compiled graph to completion and keep the final state.
answer = app.invoke(inputs)


---RETRIEVE---
---CHECK RELEVANCE---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: NO DOCUMENTS RELEVANT---
---DECIDE TO GENERATE---
---DECISION: TRANSFORM QUERY and RESEARCH---
---TRANSFORM QUERY---


AttributeError: 'ChatMessage' object has no attribute 'model_dump'

In [None]:
# Display the value stored under "generation" by the generate node in the final state.
answer['keys']['generation']

QuestionAnswer(answer="I don't think luck had anything to do with it. I do believe that I've been blessed and I have received those blessings by honoring them with hard work. All of the challenges, all of the people, all of the clients and applicants, my life has been a collitus topic opportunity.")

In [15]:
# Ad-hoc retrieval check for a different company.
# Reuses MONGODB_COLLECTION, ATLAS_VECTOR_SEARCH_INDEX_NAME and `embeddings`
# from the setup cells above instead of opening a second client.

# Defined here explicitly so the cell does not depend on whatever value
# `question` happened to hold in the kernel when it was run.
question = "When you think about the success of what you created, how much do you think it has to do with, with how hard you worked and your intelligence and how much do you think it had to do with luck?"

print(f'---question--- {question}')
documents = MONGODB_COLLECTION.aggregate([
    {"$vectorSearch": {
        "index": ATLAS_VECTOR_SEARCH_INDEX_NAME,
        "path": "embedding",
        "queryVector": embeddings.embed_query(question),
        "numCandidates": 100,
        "limit": 4,
        "filter": {"company": "Wikipedia"}
    }}
])

---question--- When you think about the success of what you created, how much do you think it has to do with, with how hard you worked and your intelligence and how much do you think it had to do with luck?


In [16]:
# Consume the aggregation result into a list so it can be indexed below.
documents = list(documents)

In [20]:
# Display the transcript text of the first returned segment.
documents[0]['transcript']

"\n2 (35m 30s):\nyears. When you think about this thing that you built and your role in the history of the internet how much of the success of Wikipedia do you think it's because of your Brilliance and your hard work? And how much do you think is simply X of luck\n\n1 (35m 48s):\na huge amount due to luck Brilliance and hard work? Okay, maybe not so much. Do you think a component of the success of Wikipedia is that I'm a very friendly and nice person and I'm very laid-back. And so therefore I was able to work in a community environment where people basically yell at you and you just have to kind of roll with it and you're in some sense a leader, but you can't tell anyone what to do their volunteers. So you have to work with love and reason to kind of move people along in a useful way. So I do think that I'm not irrelevant to the process. But I also think that you know, the community is amazing and the luck of the timing of really hitting that moment when it was possible to build\n\n"