In [6]:
import pymongo
from dotenv import load_dotenv
import os

from utils.preprocess_transcription import *
from utils.extract_comp_guest import *


In [14]:
# Load variables from the local .env file before reading them. Without this,
# os.getenv('mongodb_uri') returns None on a fresh kernel unless the variable
# is already exported in the shell that started Jupyter.
load_dotenv()

client = pymongo.MongoClient(os.getenv('mongodb_uri'))
DB_NAME = "hibt_transcripts"
COLLECTION_NAME = "hibt_test_collection"
# NOTE(review): the $vectorSearch queries in this notebook use the index name
# "hibt_qa_index", not this constant — confirm which one is current.
ATLAS_VECTOR_SEARCH_INDEX_NAME = "hibt_test_index"
# Collection that stores the embedded transcript chunks.
MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]

In [18]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Module-level embedding client; the vector-search cells in this notebook use it to embed the query text.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Example call, kept for reference (`transcript_chunks` is only a local inside transcript_pipeline):
# embedded_chunk = embeddings.embed_query(transcript_chunks[0])

In [19]:
def transcript_pipeline(path: str, filename: str):
    """Clean, chunk, embed and store one transcript file in MONGODB_COLLECTION.

    Steps: read the file, pass it through ``remove_ads``, let
    ``identify_host`` / ``insert_marker_before_host`` mark the text, split the
    result on the ``;;;`` marker, and store one document per non-blank chunk.
    Each stored document carries the metadata returned by
    ``re_extract_comp_guest(filename)`` plus ``org_filename``, ``transcript``,
    ``embedding`` and ``trans_chunk_num`` (the chunk's index in the split,
    counting blank chunks, so numbers may have gaps).

    Args:
        path: Directory containing the transcript (with or without a trailing
            path separator).
        filename: Name of the transcript file inside ``path``.

    Returns:
        None. The only effect is the insert into ``MONGODB_COLLECTION``.
    """
    # os.path.join works whether or not `path` ends with a separator; plain
    # string concatenation silently built a wrong path when it did not.
    # The encoding is pinned so decoding does not depend on the platform default.
    with open(os.path.join(path, filename), 'r', encoding='utf-8') as file:
        transcript = file.read()

    transcript = remove_ads(transcript)
    host, _ = identify_host(transcript)  # second element of the pair is not needed here
    transcript = insert_marker_before_host(transcript, host)
    transcript_chunks = transcript.split(';;;')

    # Local client under its own name so it does not shadow the module-level `embeddings`.
    # NOTE(review): chunks are embedded with embed_query, same as the search
    # queries — confirm this is intended rather than embed_documents.
    embedder = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    meta_data = re_extract_comp_guest(filename)
    meta_data['org_filename'] = filename

    documents = []
    for i, chunk in enumerate(transcript_chunks):
        if not chunk.strip():
            continue  # skip blank chunks; their index is still consumed
        document = meta_data.copy()
        document['transcript'] = chunk
        document['embedding'] = embedder.embed_query(chunk)
        document['trans_chunk_num'] = i
        documents.append(document)

    # Insert everything in one call, after all embeddings succeeded, so a
    # failure part-way through a file does not leave a half-ingested transcript.
    # insert_many rejects an empty list, hence the guard.
    if documents:
        MONGODB_COLLECTION.insert_many(documents)

    return None

In [20]:
### The code below loops through the files in hibt_test and inserts them into the MongoDB collection

# for each file in hibt_test apply transcript pipeline to it
# path = 'podscribe_transcription/hibt_test/'
# for file in os.listdir(path):
#     if file.endswith(".txt"):
#         print(path, file)
#         transcript_pipeline(path, file)

# remaining = ["Twitch_ Emmett Shear-transcript.txt",
#              "Robinhood_ Vlad Tenev-transcript.txt"]
# path = "podscribe_transcription/hibt_test/"
# for file in remaining:
#     if file.endswith(".txt"):
#         print(path, file)
#         transcript_pipeline(path, file)

podscribe_transcription/hibt_test/ Orangetheory Fitness_ Ellen Latham-transcript.txt
podscribe_transcription/hibt_test/ Mailchimp_ Ben Chestnut-transcript.txt
podscribe_transcription/hibt_test/ reCAPTCHA and Duolingo_ Luis von Ahn-transcript.txt
podscribe_transcription/hibt_test/ Fitbit_ James Park-transcript.txt
podscribe_transcription/hibt_test/ Ring_ Jamie Siminoff-transcript.txt
podscribe_transcription/hibt_test/ Patagonia_ Yvon Chouinard-transcript.txt
podscribe_transcription/hibt_test/ Coinbase_ Brian Armstrong-transcript.txt
podscribe_transcription/hibt_test/ Etsy_ Rob Kalin-transcript.txt
podscribe_transcription/hibt_test/ Twitch_ Emmett Shear-transcript.txt
podscribe_transcription/hibt_test/ Robinhood_ Vlad Tenev-transcript.txt


In [26]:
# Ad-hoc vector search limited to the Coinbase transcript chunks.
# NOTE(review): the query text below ends mid-sentence ("... hard work and ") — confirm whether that is intended.
query = "how much of your success do you attribute to hard work and "

# Single-stage aggregation pipeline: embed the query and return the 3 best of 50 candidates.
pipeline = [
    {
        "$vectorSearch": {
            "index": "hibt_qa_index",
            "path": "embedding",
            "queryVector": embeddings.embed_query(query),
            "numCandidates": 50,
            "limit": 3,
            "filter": {"company": "Coinbase"},
        }
    }
]
results = MONGODB_COLLECTION.aggregate(pipeline)

In [27]:
# For each hit, use its "trans_chunk_num" to fetch the chunk that follows it
# in the same source file ("org_filename") and print both.
for doc in results:
    chunk_num = doc['trans_chunk_num']
    org_filename = doc['org_filename']
    print(org_filename, chunk_num)
    # get the next chunk
    next_chunk = MONGODB_COLLECTION.find_one({"org_filename": org_filename, "trans_chunk_num": chunk_num + 1})
    print("vector searched document")
    print(doc['transcript'])
    print("next document")
    # find_one returns None when no chunk with that number is stored (the hit
    # was the last chunk of the file, or the following chunk was blank and was
    # skipped at ingestion); indexing None would raise TypeError.
    if next_chunk is not None:
        print(next_chunk['transcript'])
    else:
        print("(no following chunk stored)")

Coinbase_ Brian Armstrong-transcript.txt 76
vector searched document

3 (1h 13m 55s):
All right. So you put this out there and there was a, quite a bit of blowback to say the least certainly in the media and in the social media sphere, first of all, were you expecting that? Yeah.

2 (1h 14m 7s):
Fully expected it. Yeah.


next document

3 (1h 14m 9s):
So tell me, explain this, walk me through it. I mean, many leaders are kind of leaning into trying to, you know, kind of be responsive to employees who, you know, have various, some very legitimate political grievances and, and, but you decided to take an opposite position and to essentially push it away. Can you walk me through your, your thinking and why?

2 (1h 14m 35s):
Yeah, sure. I mean, this was definitely, there was a time period there for quite a while inside Coinbase, where I would say there was this growing tension and unrest, and, but something happened last year, which actually started to take a different tone. And I, I had t

In [28]:
# Find all unique values for the 'company' field
unique_companies = MONGODB_COLLECTION.distinct('company')

query = "how much of your success do you attribute to how hard you worked and your intelligence and how much do you think it had to do with luck?"
# The question is the same for every company, so embed it once instead of
# calling the embedding API on every loop iteration.
query_vector = embeddings.embed_query(query)

# For each company, print its top vector-search hits together with the chunk that follows each hit
for company in unique_companies:
    results = MONGODB_COLLECTION.aggregate([
    {"$vectorSearch": {
        "index": "hibt_qa_index",
        "path": "embedding",
        "queryVector": query_vector,
        "numCandidates": 50,
        "limit": 3,
        "filter": {"company": company}
    }}
    ])
    print(company)
    for doc in results:
        chunk_num = doc['trans_chunk_num']
        org_filename = doc['org_filename']
        print(org_filename, chunk_num)
        # get the next chunk
        next_chunk = MONGODB_COLLECTION.find_one({"org_filename": org_filename, "trans_chunk_num": chunk_num + 1})
        print("vector searched document")
        print(doc['transcript'])
        print("next document")
        # find_one returns None when no following chunk is stored; guard
        # against indexing None.
        if next_chunk is not None:
            print(next_chunk['transcript'])
        else:
            print("(no following chunk stored)")
    print("\n")

Coinbase
Coinbase_ Brian Armstrong-transcript.txt 82
vector searched document

3 (1h 22m 59s):
When you think about the success of what you created, how much do you think it has to do with, with how hard you worked and your intelligence and how much do you think it had to do with luck?

2 (1h 23m 10s):
It's probably 50 50. Look, I, I actually, so I think every CEO who I've met, they have some minimum level of intelligence, right. But they're oftentimes not the smartest person in the room. And I, I would put myself in that category too. Like, I'm, you know, I met some minimum bar, but I'm not the smartest person. What I am is probably quite determined. There is a part of my personality. This might be a little bit like that being on the spectrum thing or whatever, but it's just like, whether it's like a really good day or a really bad day, I'm sort of even keeled. And I'm often just like, okay, well what's the next step? How do we fix it? And let's keep making progress even over like a d

In [68]:
# Prompt template for extract_answer: instructs the model to extract the guest's
# answer to the "hard work and intelligence vs. luck" question, gives two worked
# examples, and ends with a single `{transcript}` placeholder for the text to search.
hard_luck_prompt = """
    You are a transcription assistant. You are given portions of a transcription in order to extract the guests answer.
    We are working with transcriptions from the podcast "How I Built This" where the host Guy Raz interviews entrepreneurs. 
    Typically towards the end of the interview he will ask a question regarding to what extent the entrepreneur attributes their success to hard work and intelligence versus luck.
    Your task is to extract the answer from the transcriptions.

    The answer is typically a few sentences long and can be found in the same paragraph or in the following paragraphs depending on their thoughts.
    If you don't see a relevant question and answer in the transcript, respond with "Question/Answer not found."

    Examples:
    ;;;
    Question: When you think about the success of what you created, how much do you think it has to do with, with how hard you worked and your intelligence and how much do you think it had to do with luck?
    Answer: It's probably 50 50. Look, I, I actually, so I think every CEO who I've met, they have some minimum level of intelligence, right. But they're oftentimes not the smartest person in the room. And I, I would put myself in that category too. Like, I'm, you know, I met some minimum bar, but I'm not the smartest person. What I am is probably quite determined. There is a part of my personality. This might be a little bit like that being on the spectrum thing or whatever, but it's just like, whether it's like a really good day or a really bad day, I'm sort of even keeled. And I'm often just like, okay, well what's the next step? How do we fix it? And let's keep making progress even over like a decade or more. I can be quite resilient in that regard. But I guess the other thing I would say just about luck, absolutely luck played a big role in Coinbase. I mean, there were certain moments early on in our history where if a coin had flipped the other way we might not be here. I think we got very lucky with sort of the timing of when the company decided to launch and when crypto decided to grow, I think that I kind of just kept trying things until eventually one of them hit. Right. Which is, it's a way of unifying those two ideas. It's, it's definitely a lot of luck when it works. But if you just keep trying, you can kind of make your own luck.

    Question: Do you, I mean, obviously worked extremely hard making this happen, but I think you did have some incredibly fortunate breaks that you'd like, you couldn't control like male camp. Do you, are, are you a believer in luck or do you think that what happened really has to do with just how strategic and smart you guys were?
    Answer: I totally believe in luck. I mean, but I think it comes from constantly striving and looking for that silver bullet. And when I look back on my 20 years, it felt like I was always looking for the answer, always looking for what is the playbook that I'm missing? You know, it's constantly searching. And then all along the way, stumbling upon something that actually does better than my wildest dreams. I don't know. It, it, I, I, I always had in my mind that I was just a bumbling idiot from of a Georgia who stumbled into business. And somewhere out there, someone knew the right answer of how to run my business. Right. And these days, these days, you know, MailChimp is much bigger. And I get invited to ritzy events with some really great business leaders. And I remember being in one recently where a head of one of Microsoft's cloud units, he was, he was speaking about his, how he hosts executive leadership meetings. And he says, we're still winging it. We have no idea. You know, he didn't say it that way, but he's like, we're still experimenting. You know? And this is Microsoft. And they've been around a very, very long time and they know what they're doing. So, you know, it was that moment. I realized, oh my God, nobody has the answer. We're all just winging it. Every one of us is just winging it. I don't know if that's hopeful or horrifying, but that's what I've learned.
    ;;;

    Transcript to extract answers from:
    {transcript}
    """

In [69]:
from langchain_openai import ChatOpenAI
# Load variables from the .env file into the process environment
# (presumably including the API keys the model clients need — confirm).
load_dotenv()

True

In [70]:
def extract_answer(transcript: str):
    """Ask the chat model to extract the guest's hard-work-vs-luck answer.

    Args:
        transcript: Transcript text to search. It is substituted for the
            ``{transcript}`` placeholder of ``hard_luck_prompt``.

    Returns:
        The message object returned by ``ChatOpenAI.invoke`` (not a plain
        string); callers in this notebook read the text via ``.content``.
    """
    # hard_luck_prompt contains exactly one placeholder, `{transcript}`, and no
    # other braces, so str.format produces the same prompt text without relying
    # on PromptTemplate, which no visible cell of this notebook imports.
    prompt_text = hard_luck_prompt.format(transcript=transcript)

    # temperature=0: this is a verbatim extraction task, so keep the output as
    # repeatable as possible between runs.
    llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)

    return llm.invoke(prompt_text)

In [71]:
# Find all unique values for the 'company' field
unique_companies = MONGODB_COLLECTION.distinct('company')

query = "how much of your success do you attribute to how hard you worked and your intelligence and how much do you think it had to do with luck?"
# The question is the same for every company, so embed it once instead of
# calling the embedding API on every loop iteration.
query_vector = embeddings.embed_query(query)

# For each company: print the top hits plus their following chunks, then run
# the answer extraction over the combined text.
for company in unique_companies:
    results = MONGODB_COLLECTION.aggregate([
    {"$vectorSearch": {
        "index": "hibt_qa_index",
        "path": "embedding",
        "queryVector": query_vector,
        "numCandidates": 50,
        "limit": 3,
        "filter": {"company": company}
    }}
    ])
    print(company)
    transcript_parts = []
    for doc in results:
        chunk_num = doc['trans_chunk_num']
        org_filename = doc['org_filename']
        print(org_filename, chunk_num)
        # get the next chunk
        next_chunk = MONGODB_COLLECTION.find_one({"org_filename": org_filename, "trans_chunk_num": chunk_num + 1})
        print("vector searched document")
        print(doc['transcript'])
        transcript_parts.append(doc['transcript'])
        print("next document")
        # find_one returns None when no following chunk is stored; guard
        # against indexing None.
        if next_chunk is not None:
            print(next_chunk['transcript'])
            transcript_parts.append(next_chunk['transcript'])
        else:
            print("(no following chunk stored)")
    # Join with newlines so consecutive hits are not glued together.
    transcript = "\n".join(transcript_parts)
    extracted_answer = extract_answer(transcript)
    print("Extracted Answer")
    print(extracted_answer)

    print("\n")

Coinbase
Coinbase_ Brian Armstrong-transcript.txt 82
vector searched document

3 (1h 22m 59s):
When you think about the success of what you created, how much do you think it has to do with, with how hard you worked and your intelligence and how much do you think it had to do with luck?

2 (1h 23m 10s):
It's probably 50 50. Look, I, I actually, so I think every CEO who I've met, they have some minimum level of intelligence, right. But they're oftentimes not the smartest person in the room. And I, I would put myself in that category too. Like, I'm, you know, I met some minimum bar, but I'm not the smartest person. What I am is probably quite determined. There is a part of my personality. This might be a little bit like that being on the spectrum thing or whatever, but it's just like, whether it's like a really good day or a really bad day, I'm sort of even keeled. And I'm often just like, okay, well what's the next step? How do we fix it? And let's keep making progress even over like a d

In [55]:
extracted_answer.content

"Answer: I mean, it's probably 50 50. Look, I, I actually, so I think every CEO who I've met, they have some minimum level of intelligence, right. But they're oftentimes not the smartest person in the room. And I, I would put myself in that category too. Like, I'm, you know, I met some minimum bar, but I'm not the smartest person. What I am is probably quite determined. There is a part of my personality. This might be a little bit like that being on the spectrum thing or whatever, but it's just like, whether it's like a really good day or a really bad day, I'm sort of even keeled. And I'm often just like, okay, well what's the next step? How do we fix it? And let's keep making progress even over like a decade or more. I can be quite resilient in that regard. But I guess the other thing I would say just about luck, absolutely luck played a big role in Coinbase. I mean, there were certain moments early on in our history where if a coin had flipped the other way we might not be here. I th

In [48]:
# Collection for the extracted answers, in the same database as the transcripts.
# It reuses the existing `client` and DB_NAME and gets its own name constant,
# so COLLECTION_NAME keeps pointing at the transcript collection and
# re-running the earlier setup cell cannot retarget MONGODB_COLLECTION.
ANSWER_COLLECTION_NAME = "hibt_test_answer_collection"
MONGODB_ANSWER_COLLECTION = client[DB_NAME][ANSWER_COLLECTION_NAME]


In [61]:
# Find all unique values for the 'company' field
unique_companies = MONGODB_COLLECTION.distinct('company')

query = "how much of your success do you attribute to how hard you worked and your intelligence and how much do you think it had to do with luck?"
# The question is the same for every company, so embed it once instead of
# calling the embedding API on every loop iteration.
query_vector = embeddings.embed_query(query)

# For each company: gather the top hits plus their following chunks, extract
# the answer, and store one answer document per company.
for company in unique_companies:
    results = MONGODB_COLLECTION.aggregate([
    {"$vectorSearch": {
        "index": "hibt_qa_index",
        "path": "embedding",
        "queryVector": query_vector,
        "numCandidates": 50,
        "limit": 3,
        "filter": {"company": company}
    }}
    ])
    print(company)
    # Reset per company so values from the previous company can never leak
    # into this one's document when the search returns no hits.
    guest = None
    org_filename = None
    transcript_parts = []
    for doc in results:
        chunk_num = doc['trans_chunk_num']
        org_filename = doc['org_filename']
        guest = doc['guest']
        next_chunk = MONGODB_COLLECTION.find_one({"org_filename": org_filename, "trans_chunk_num": chunk_num + 1})
        transcript_parts.append(doc['transcript'])
        # find_one returns None when no following chunk is stored.
        if next_chunk is not None:
            transcript_parts.append(next_chunk['transcript'])

    if not transcript_parts:
        # Nothing to extract from; skip rather than store an answer with no source.
        print(company, "- no matching chunks, skipped", "\n")
        continue

    # Join with newlines so consecutive hits are not glued together.
    transcript = "\n".join(transcript_parts)
    extracted_answer = extract_answer(transcript)

    document = {
        "company": company,
        "guest": guest,
        "org_filename": org_filename,
        "answer": extracted_answer.content
    }
    # Upsert keyed on company so re-running this cell replaces the previous
    # answer instead of inserting a duplicate.
    MONGODB_ANSWER_COLLECTION.replace_one({"company": company}, document, upsert=True)
    print(company, guest, "\n", extracted_answer, "\n")

Coinbase
Coinbase Brian Armstrong /n content='Question/Answer not found' /n
Etsy
Etsy Rob Kalin /n content='Question/Answer not found' /n
Fitbit
Fitbit James Park /n content='Question/Answer not found' /n
Mailchimp
Mailchimp Ben Chestnut /n content="Question: Do you, I mean, obviously worked extremely hard making this happen, but I think you did have some incredibly fortunate breaks that you'd like, you couldn't control like male camp. Do you, are, are you a believer in luck or do you think that what happened really has to do with just how strategic and smart you guys were?\nAnswer: I totally believe in luck. I mean, but I think it comes from constantly striving and looking for that silver bullet. And when I look back on my 20 years, it felt like I was always looking for the answer, always looking for what is the playbook that I'm missing? You know, it's constantly searching. And then all along the way, stumbling upon something that actually does better than my wildest dreams. I don't 

KeyboardInterrupt: 

In [4]:
from langchain_google_genai import ChatGoogleGenerativeAI

In [12]:
# Gemini chat client ("gemini-pro"); the API key is read from the GOOGLE_API_KEY environment variable.
llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=os.getenv("GOOGLE_API_KEY"))

In [13]:
# Quick smoke test of the Gemini client.
# NOTE(review): the saved output mentions "models/gemini-ultra", which does not match the
# "gemini-pro" model configured for `llm` — the output looks stale; re-run to refresh.
llm.invoke("What is the meaning of life?")

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised NotFound: 404 models/gemini-ultra is not found for API version v1beta, or is not supported for GenerateContent. Call ListModels to see the list of available models and their supported methods..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 4.0 seconds as it raised NotFound: 404 models/gemini-ultra is not found for API version v1beta, or is not supported for GenerateContent. Call ListModels to see the list of available models and their supported methods..


KeyboardInterrupt: 

In [62]:
# Empty the answer collection (MONGODB_ANSWER_COLLECTION, not the transcript collection MONGODB_COLLECTION),
# presumably so the extraction loop can be re-run from scratch. This deletes every document in it.
MONGODB_ANSWER_COLLECTION.delete_many({})

DeleteResult({'n': 4, 'electionId': ObjectId('7fffffff0000000000000116'), 'opTime': {'ts': Timestamp(1707821978, 80), 't': 278}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1707821978, 80), 'signature': {'hash': b'\x8f\xf9\xdb\xc2\xac\xe0\x83\xb9\x0e8\xb1\xd5\xe3\xbe\x91\x90\xe6Q*G', 'keyId': 7297236172768542722}}, 'operationTime': Timestamp(1707821978, 80)}, acknowledged=True)