In [1]:
import pymongo
from dotenv import load_dotenv
import os

from utils.preprocess_transcription import *
from utils.extract_comp_guest import *

from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings

load_dotenv()

True

In [50]:
client = pymongo.MongoClient(os.getenv('mongodb_uri'))
DB_NAME = "hibt_transcripts"
COLLECTION_NAME = "hibt_collection"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "openai_large_index"
MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]

In [3]:


# embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# embedded_chunk = embeddings.embed_query(transcript_chunks[0])

In [52]:
def transcript_pipeline(path:str, filename: str):
    with open(path + filename, 'r') as file:
        transcript = file.read()

    transcript = remove_ads(transcript)
    (host, match) = identify_host(transcript)
    transcript = insert_marker_before_host(transcript, host)
    transcript_chunks = transcript.split('###')
    # embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    meta_data = re_extract_comp_guest(filename)
    meta_data['org_filename'] = filename
    for i, doc in enumerate(transcript_chunks):
        if doc.strip():
            document = meta_data.copy()
            document['transcript'] = doc
            document['embedding'] = embeddings.embed_query(doc)
            document['trans_chunk_num'] = i
            MONGODB_COLLECTION.insert_one(document)
    
    return None

In [37]:
# I need to empty this mongodb collection MONGODB_COLLECTION
# MONGODB_COLLECTION.delete_many({})

DeleteResult({'n': 972, 'electionId': ObjectId('7fffffff0000000000000116'), 'opTime': {'ts': Timestamp(1707906755, 996), 't': 278}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1707906755, 998), 'signature': {'hash': b'\x8e\x8f7h\xb1\xae\xf0\x84jL\xe3\xc5H\xd4\x02`\xf7fb\xbd', 'keyId': 7297236172768542722}}, 'operationTime': Timestamp(1707906755, 996)}, acknowledged=True)

In [53]:
# ### The code below is to loop through the files in hibt_test and insert them into the mongodb collection

# # for each file in hibt_test apply transcript pipeline to it
# path = 'podscribe_transcription/hibt_test/'
# for file in os.listdir(path):
#     if file.endswith(".txt"):
#         print(path, file)
#         transcript_pipeline(path, file)


In [54]:
from tqdm import tqdm
import concurrent.futures
path = 'podscribe_transcription/hibt_main/'
num_threads = 8  # or 5, depending on your requirement

successful_files = []
unsuccessful_files = []

# Get the total number of files
total_files = sum(1 for file in os.listdir(path) if file.endswith(".txt"))

with tqdm(total=total_files, desc="Processing files") as pbar:
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        for file in os.listdir(path):
            if file.endswith(".txt"):
                print(path, file)
                try:
                    executor.submit(transcript_pipeline, path, file)
                    successful_files.append(file)
                except:
                    unsuccessful_files.append(file)
                pbar.update(1)


Processing files:   0%|          | 0/258 [00:00<?, ?it/s]

podscribe_transcription/hibt_main/ Siete Family Foods_ Miguel and Veronica Garza-transcript.txt
podscribe_transcription/hibt_main/ Literally unearthing a climate solution with Cody Finke of Brimstone-transcript.txt
podscribe_transcription/hibt_main/ Tatcha_ Vicky Tsai-transcript.txt
podscribe_transcription/hibt_main/ Orangetheory Fitness_ Ellen Latham-transcript.txt
podscribe_transcription/hibt_main/ Mailchimp_ Ben Chestnut-transcript.txt
podscribe_transcription/hibt_main/ A biometric smart gun with Kai Kloepfer of Biofire-transcript.txt
podscribe_transcription/hibt_main/ Dude Perfect_ Cory Cotton and Tyler Toney-transcript.txt
podscribe_transcription/hibt_main/ Video Artist_ Casey Neistat-transcript.txt
podscribe_transcription/hibt_main/ Rent The Runway_ Jenn Hyman (2018)-transcript.txt
podscribe_transcription/hibt_main/ reCAPTCHA and Duolingo_ Luis von Ahn-transcript.txt
podscribe_transcription/hibt_main/ Herschel Supply Co._ Jamie and Lyndon Cormack-transcript.txt
podscribe_transcri

Processing files: 100%|██████████| 258/258 [1:00:40<00:00, 14.11s/it]


### Search for the Answers for each company

In [55]:
client = pymongo.MongoClient(os.getenv('mongodb_uri'))
DB_NAME = "hibt_transcripts"
COLLECTION_NAME = "hibt_collection"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "main_hibt_index"
MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]
MONGODB_ANSWER_COLLECTION = client[DB_NAME]["hibt_answer_collection"]

In [73]:
###
hard_luck_prompt = """
    You are a transcription assistant. You are given portions of a transcription in order to extract the guests answer.
    We are working with transcriptions from the podcast "How I Built This" where the host Guy Raz interviews entrepreneurs. 
    Typically towards the end of the interview he will ask a question regarding to what extent the entrepreneur attributes their success to skill, hard work, and/or intelligence versus luck.
    Your task is to extract the answer from the transcriptions. Remember that these are transcriptions so words can be mispelled like "lock" instead of "luck" or "intelligents" instead of "intelligence".

    The answer is typically a few sentences long and can be found in the same paragraph or in the following paragraphs depending on their thoughts.
    If you don't see a relevant question and answer in the transcript, respond with "Question/Answer not found."

    <Examples>
    
    Question: When you think about the success of what you created, how much do you think it has to do with, with how hard you worked and your intelligence and how much do you think it had to do with luck?
    Answer: It's probably 50 50. Look, I, I actually, so I think every CEO who I've met, they have some minimum level of intelligence, right. But they're oftentimes not the smartest person in the room. And I, I would put myself in that category too. Like, I'm, you know, I met some minimum bar, but I'm not the smartest person. What I am is probably quite determined. There is a part of my personality. This might be a little bit like that being on the spectrum thing or whatever, but it's just like, whether it's like a really good day or a really bad day, I'm sort of even keeled. And I'm often just like, okay, well what's the next step? How do we fix it? And let's keep making progress even over like a decade or more. I can be quite resilient in that regard. But I guess the other thing I would say just about luck, absolutely luck played a big role in Coinbase. I mean, there were certain moments early on in our history where if a coin had flipped the other way we might not be here. I think we got very lucky with sort of the timing of when the company decided to launch and when crypto decided to grow, I think that I kind of just kept trying things until eventually one of them hit. Right. Which is, it's a way of unifying those two ideas. It's, it's definitely a lot of luck when it works. But if you just keep trying, you can kind of make your own luck.

    Question: Do you, I mean, obviously worked extremely hard making this happen, but I think you did have some incredibly fortunate breaks that you'd like, you couldn't control like male camp. Do you, are, are you a believer in luck or do you think that what happened really has to do with just how strategic and smart you guys were?
    Answer: I totally believe in luck. I mean, but I think it comes from constantly striving and looking for that silver bullet. And when I look back on my 20 years, it felt like I was always looking for the answer, always looking for what is the playbook that I'm missing? You know, it's constantly searching. And then all along the way, stumbling upon something that actually does better than my wildest dreams. I don't know. It, it, I, I, I always had in my mind that I was just a bumbling idiot from of a Georgia who stumbled into business. And somewhere out there, someone knew the right answer of how to run my business. Right. And these days, these days, you know, MailChimp is much bigger. And I get invited to ritzy events with some really great business leaders. And I remember being in one recently where a head of one of Microsoft's cloud units, he was, he was speaking about his, how he hosts executive leadership meetings. And he says, we're still winging it. We have no idea. You know, he didn't say it that way, but he's like, we're still experimenting. You know? And this is Microsoft. And they've been around a very, very long time and they know what they're doing. So, you know, it was that moment. I realized, oh my God, nobody has the answer. We're all just winging it. Every one of us is just winging it. I don't know if that's hopeful or horrifying, but that's what I've learned.
    
    Question ...do you think it has more to do with just your intelligence and your skill or do you think that luck was a bigger factor in everything that's happened to you
    Answer: so the thing I always say about lock I think it is it's the lottery and so you get sort of lottery tickets for how you know it's like how good your idea is how good your team is and how hard you work and so you take those like factors and I think how hard you work in your team or probably even bigger factors than the the idea but you still need to have something that actually would work and so you take those three things together and you get for each sort of our of production or work you get a lottery ticket and so just like the lottery if you unless you own every ticket you're not guaranteed to win because I've seen people that have are smarter than me better than me better idea than me fail because of things like luck like things not going their way and I've seen again the opposite of that and so we have been fortunate to have been have lots of Lucky breaks in our Direction but we also continue to work to get more tickets so that we could win the lottery
    </Examples>

    Transcript to extract answers from:
    <transcript>
    {transcript}
    </transcript>

    Only respond with the answer to the question.
    """

In [74]:
from langchain_openai import ChatOpenAI
load_dotenv()

True

In [75]:
def extract_answer(transcript: str) -> str:

    prompt_template = PromptTemplate(
            template=hard_luck_prompt,
            input_variables=["transcript"],
        )
    formatted_input = prompt_template.format_prompt(transcript=transcript)

    # llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=os.getenv("GOOGLE_API_KEY"), safety_setting=safe)

    # llm = ChatOpenAI(model="gpt-3.5-turbo-0125")
    llm = ChatOpenAI(model="gpt-4-0125-preview")

    answer = llm.invoke(formatted_input.to_string())
    
    return answer

In [76]:
# # Find all unique values for the 'company' field
# unique_companies = MONGODB_COLLECTION.distinct('company')

# # Print all unique companies
# for company in unique_companies:
#     query = "how much of your success do you attribute to how hard you worked and your intelligence and how much do you think it had to do with luck?"

#     results = MONGODB_COLLECTION.aggregate([
#     {"$vectorSearch": {
#         "index": "hibt_qa_index",
#         "path": "embedding",
#         "queryVector": embeddings.embed_query(query),
#         "numCandidates": 50,
#         "limit": 3,
#         "filter": {"company": company}
#     }}
#     ])
#     print(company)
#     transcript = ""
#     for doc in results:
#         chunk_num = doc['trans_chunk_num']
#         org_filename = doc['org_filename']
#         # get the next chunk
#         next_chunk = MONGODB_COLLECTION.find_one({"org_filename": org_filename, "trans_chunk_num": chunk_num + 1})
#         # add both transcripts to the transcript string
#         transcript += doc['transcript'] + "\n" + next_chunk['transcript']
#     extracted_answer = extract_answer(transcript)
#     print("Extracted Answer")
#     print(extracted_answer)
#     # add the extracted answer and meta data into a document and add it to a new collection
    
#     print("\n")

In [77]:
# client = pymongo.MongoClient(os.getenv('mongodb_uri'))
# DB_NAME = "hibt_transcripts"
# COLLECTION_NAME = "hibt_test_openai_collection"
# ATLAS_VECTOR_SEARCH_INDEX_NAME = "openai_vector_index"
# MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]

# MONGODB_ANSWER_COLLECTION = client[DB_NAME]["hibt_test_openai_answer_collection"]

In [78]:
# Find all unique values for the 'company' field
unique_companies = MONGODB_COLLECTION.distinct('company')

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Print all unique companies
for company in unique_companies:
    query = "how much of your success do you attribute to how hard you worked, skill, and/or your intelligence and how much do you think it had to do with luck?"

    results = MONGODB_COLLECTION.aggregate([
    {"$vectorSearch": {
        "index": ATLAS_VECTOR_SEARCH_INDEX_NAME,
        "path": "embedding",
        "queryVector": embeddings.embed_query(query),
        "numCandidates": 100,
        "limit": 5,
        "filter": {"company": company}
    }}
    ])
    print(company)
    # if results is empty then continue
    if results:
        transcript = ""
        for i, doc in enumerate(results):
            chunk_num = doc['trans_chunk_num']
            org_filename = doc['org_filename']
            guest = doc['guest']
            transcript += doc['transcript']
            # try:
            #     next_chunk = MONGODB_COLLECTION.find_one({"org_filename": org_filename, "trans_chunk_num": chunk_num + 1})
            #     transcript += next_chunk['transcript']
            # except:
            #     continue
        extracted_answer = extract_answer(transcript)

        document = {
            "company": company,
            "guest": guest,
            "org_filename": org_filename,
            "answer": extracted_answer.content
        }
        MONGODB_ANSWER_COLLECTION.insert_one(document)
        print(company, guest,"/n", extracted_answer, "/n")
    else:
        print("No results found for", company)

AMP Robotics
AMP Robotics Matanya Horowitz /n content='Question/Answer not found.' /n
ARRAY
ARRAY Filmmaker Ava DuVernay /n content="I guess I, I attribute it to luck, but I don't call it luck. You know, I think it's, it's my path and it was, these are blessings. These are, this is designed and it's up to me to work, to live up to these moments that are put in front of me, but certainly in the way that you're defining it, I would say luck. Yeah, work really hard, but you could work really hard and never get the breaks. You know, work really hard and never be seen, be heard. You can work really hard and you know, no one reads your scripts on a plane and calls you the next day and says, look, let me be in it. Who ends up being the same guy who gets you Selma, get that gets nominated for the Oscar that does today.\n\nYou could be, you could make 13th. And you know, it sits on the back channels of Netflix and doesn't really get seen or move, but it moved. You can, you can make, when they s

In [70]:
# check how many documents are in MONGODB_ANSWER_COLLECTION
MONGODB_ANSWER_COLLECTION.count_documents({})

236

In [71]:
# check how many documents in MONGODB_ANSWER_COLLECTION has the answer "Question/Answer not found."
MONGODB_ANSWER_COLLECTION.count_documents({"answer": "Question/Answer not found."})

72

In [72]:
# I need to empty this mongodb collection MONGODB_COLLECTION
MONGODB_ANSWER_COLLECTION.delete_many({})

DeleteResult({'n': 236, 'electionId': ObjectId('7fffffff0000000000000117'), 'opTime': {'ts': Timestamp(1708139348, 68), 't': 279}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1708139348, 68), 'signature': {'hash': b')i#}G\x8a\xc9K\xb9\xcc$n2\xc3Z\x9a\xf9\xc3|\xac', 'keyId': 7297236172768542722}}, 'operationTime': Timestamp(1708139348, 68)}, acknowledged=True)

In [8]:
client = pymongo.MongoClient(os.getenv('mongodb_uri'))
DB_NAME = "hibt_transcripts"
COLLECTION_NAME = "hibt_test_collection"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "hibt_test_index"
MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]

In [9]:
# MONGODB_COLLECTION.delete_many({"$expr": {"$ne": [{"$size": "$embedding"}, 768]}})


DeleteResult({'n': 20, 'electionId': ObjectId('7fffffff0000000000000116'), 'opTime': {'ts': Timestamp(1707905272, 26), 't': 278}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1707905272, 26), 'signature': {'hash': b'\x9c\x92cT\x081\xf2\xc03M\xa1\xdc\xe8\xdd\x82\xf6\xa7\x8aX\t', 'keyId': 7297236172768542722}}, 'operationTime': Timestamp(1707905272, 26)}, acknowledged=True)