In [2]:
# Make the display a bit wider
# from IPython.display import display, HTML
# display(HTML("<style>.container { width:90% !important; }</style>"))

# LangChain basics
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import create_extraction_chain

# Vector Store and retrievals
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma, Pinecone
import pinecone

# Chat Prompt templates for dynamic values
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)

# Supporting libraries
import os
from dotenv import load_dotenv

# Pull settings from a local .env file into the environment; presumably this is
# where OPENAI_API_KEY (read later with os.getenv) is kept.
load_dotenv()

  from tqdm.autonotebook import tqdm


True

In [3]:
# Two chat models that differ only in model name, so later cells can switch
# between GPT-3.5 and GPT-4 by picking llm3 or llm4.
shared_llm_settings = {
    "temperature": 0,
    "openai_api_key": os.getenv('OPENAI_API_KEY'),
    "request_timeout": 180,
}

llm3 = ChatOpenAI(model_name="gpt-3.5-turbo-0613", **shared_llm_settings)
llm4 = ChatOpenAI(model_name="gpt-4-0613", **shared_llm_settings)

  warn_deprecated(


In [7]:
# Three prepared transcripts are available for this walkthrough.
transcript_paths = [
    './data/Transcripts/MFMPod/mfm_pod_steph.txt',
    './data/Transcripts/MFMPod/mfm_pod_alex.txt',
    './data/Transcripts/MFMPod/mfm_pod_rob.txt'
]

# Only the first transcript is processed below. Index into the list instead of
# repeating the path literal, so the list stays the single place to change it.
# The encoding is stated explicitly so the read does not depend on the platform
# default (assumes the transcripts are UTF-8 -- confirm if a decode error occurs).
with open(transcript_paths[0], encoding="utf-8") as file:
    transcript = file.read()

In [8]:
# Show the first 280 characters as a quick check that the transcript loaded.
print(transcript[:280])

Shaan Puri (0:00:00-0:00:03): D to C hearing AIDS. I think that's actually going to be a big deal. 

Sam Parr (0:00:03-0:00:05): And they're profitable. 

Shaan Puri (0:00:05-0:00:08): I mean, I'm just turning you on. Yeah, they were. 

Sam Parr (0:00:12-0:00:13): They Mormon. 




In [9]:
# Set up the splitter. Separators are listed from coarsest (blank line) to
# finest (single space).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_size=10000,
    chunk_overlap=2200,
)

# Only the first 23250 characters are used, to save on API costs. Remove the
# slice when doing your own exercise to let the whole transcript through.
transcript_subsection_characters = 23250
transcript_subsection = transcript[:transcript_subsection_characters]
docs = text_splitter.create_documents([transcript_subsection])

first_doc_tokens = llm3.get_num_tokens(docs[0].page_content)
print(f"You have {len(docs)} docs. First doc is {first_doc_tokens} tokens")

You have 3 docs. First doc is 2801 tokens


In [11]:
# System prompt for the per-chunk ("map") pass: pull topic names plus a
# one-sentence description out of a single transcript chunk.
template = """
You are a helpful assistant that helps retrieve topics talked about in a podcast transcript
- Your goal is to extract the topic names and brief 1-sentence description of the topic
- Topics include:
  - Themes
  - Business Ideas
  - Interesting Stories
  - Money making businesses
  - Quick stories about people
  - Mental Frameworks
  - Stories about an industry
  - Analogies mentioned
  - Advice or words of caution
  - Pieces of news or current events
- Provide a brief description of the topics after the topic name. Example: 'Topic: Brief Description'
- Use the same words and terminology that is said in the podcast
- Do not respond with anything outside of the podcast. If you don't see any topics, say, 'No Topics'
- Do not respond with numbers, just bullet points
- Do not include anything about 'Marketing Against the Grain'
- Only pull topics from the transcript. Do not use the examples
- Make your titles descriptive but concise. Example: 'Shaan's Experience at Twitch' should be 'Shaan's Interesting Projects At Twitch'
- A topic should be substantial, more than just a one-off comment

% START OF EXAMPLES
 - Sam’s Elisabeth Murdoch Story: Sam got a call from Elizabeth Murdoch when he had just launched The Hustle. She wanted to generate video content.
 - Shaan’s Rupert Murdoch Story: When Shaan was running Blab he was invited to an event organized by Rupert Murdoch during CES in Las Vegas.
 - Revenge Against The Spam Calls: A couple of businesses focused on protecting consumers: RoboCall, TrueCaller, DoNotPay, FitIt
 - Wildcard CEOs vs. Prudent CEOs: However, Munger likes to surround himself with prudent CEO’s and says he would never hire Musk.
 - Chess Business: Priyav, a college student, expressed his doubts on the MFM Facebook group about his Chess training business, mychesstutor.com, making $12.5K MRR with 90 enrolled.
 - Restaurant Refiller: An MFM Facebook group member commented on how they pay AirMark $1,000/month for toilet paper and toilet cover refills for their restaurant. Shaan sees an opportunity here for anyone wanting to compete against AirMark.
 - Collecting: Shaan shared an idea to build a mobile only marketplace for a collectors’ category; similar to what StockX does for premium sneakers.
% END OF EXAMPLES
"""
system_message_prompt_map = SystemMessagePromptTemplate.from_template(template)

# The human turn carries nothing but the transcript text itself.
human_template = "Transcript: {text}"
human_message_prompt_map = HumanMessagePromptTemplate.from_template(human_template)

# System message first, then the human message, assembled into one chat prompt.
map_messages = [system_message_prompt_map, human_message_prompt_map]
chat_prompt_map = ChatPromptTemplate.from_messages(messages=map_messages)

In [12]:
# System prompt for the combine step: it receives the bullet lists produced for
# each chunk and is asked to merge and deduplicate them.
# Spelling in the prompt is corrected ("found", "extract") so the model reads
# clean instructions.
template="""
You are a helpful assistant that helps retrieve topics talked about in a podcast transcript
- You will be given a series of bullet topics of topics found
- Your goal is to extract the topic names and brief 1-sentence description of the topic
- Deduplicate any bullet points you see
- Only pull topics from the transcript. Do not use the examples

% START OF EXAMPLES
 - Sam’s Elisabeth Murdoch Story: Sam got a call from Elizabeth Murdoch when he had just launched The Hustle. She wanted to generate video content.
 - Shaan’s Rupert Murdoch Story: When Shaan was running Blab he was invited to an event organized by Rupert Murdoch during CES in Las Vegas.
% END OF EXAMPLES
"""
# Use *_combine names here. Reusing the *_map names would silently rebind the
# map-step prompt objects to the combine-step ones.
system_message_prompt_combine = SystemMessagePromptTemplate.from_template(template)

human_template="Transcript: {text}" # Simply just pass the text as a human message
human_message_prompt_combine = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt_combine = ChatPromptTemplate.from_messages(messages=[system_message_prompt_combine, human_message_prompt_combine])

In [13]:
# Map-reduce summarize chain on the GPT-4 model: chat_prompt_map is supplied as
# the map prompt and chat_prompt_combine as the combine prompt.
chain = load_summarize_chain(
    llm=llm4,
    chain_type="map_reduce",
    map_prompt=chat_prompt_map,
    combine_prompt=chat_prompt_combine,
    # verbose=True,
)

In [15]:
print (topics_found)

- D to C Hearing Aids: Shaan Puri discusses the potential of direct-to-consumer hearing aids as a profitable business.
- Children's Play Space Business: Shaan Puri talks about a business idea about a children's play space franchise.
- Steph Smith's Career: Sam Parr shares the story of Steph Smith's journey from working at Trends to her current role at Andreessen Horowitz.
- Working at Andreessen Horowitz: Steph Smith shares her experience working at Andreessen Horowitz, discussing the balance between the day-to-day normalcy and the occasional spectacular moments.
- Importance of Office Presence: Shaan Puri suggests the importance of being physically present in the office to seize opportunities.
- Sam's Master Plan at Facebook: Sam advised his wife Sarah to be proactive and visible during meetings at Facebook.
- Shaan's Strategy at Twitch: Shaan shares his experience at Twitch where he focused on being part of interesting meetings and projects.
- Steph's Perception of Andreessen Horowit

In [14]:
topics_found = chain.run({"input_documents": docs})

  warn_deprecated(


In [16]:
# Schema describing one extracted topic record; it is handed to
# create_extraction_chain in a later cell.
schema = {
    "properties": {
        # The title of the topic
        "topic_name": {
            "type": "string",
            "description" : "The title of the topic listed"
        },
        # The description
        "description": {
            "type": "string",
            "description" : "The description of the topic listed"
        },
        # Category label, restricted to the fixed set of values in "enum"
        "tag": {
            "type": "string",
            "description" : "The type of content being described",
            "enum" : ['Business Models', 'Life Advice', 'Health & Wellness', 'Stories']
        }
    },
    # Each name here must match a key in "properties". The title property is
    # "topic_name"; "topic" is not declared, so requiring it would reference a
    # field that does not exist.
    "required": ["topic_name", "description"],
}

In [17]:
# Build an extraction chain on the GPT-3.5 model, driven by `schema`.
# NOTE: this rebinds `chain`, which until now held the summarize chain.
# Re-running the `topics_found` cell after this one would call this chain instead.
chain = create_extraction_chain(schema=schema, llm=llm3)

  warn_deprecated(


In [18]:
# Feed the free-text bullet list in `topics_found` through the extraction chain
# to get structured records.
topics_structured = chain.run(topics_found)

In [19]:
# Display the extracted records (dicts with topic_name / description / tag keys).
topics_structured

[{'topic_name': 'Direct-to-Consumer Hearing Aids',
  'description': 'Shaan Puri discusses the potential of direct-to-consumer hearing aids as a profitable business.',
  'tag': 'Business Models'},
 {'topic_name': "Children's Play Space Business",
  'description': "Shaan Puri talks about a business idea about a children's play space franchise.",
  'tag': 'Business Models'},
 {'topic_name': "Steph Smith's Career",
  'description': "Sam Parr shares the story of Steph Smith's journey from working at Trends to her current role at Andreessen Horowitz.",
  'tag': 'Stories'},
 {'topic_name': 'Working at Andreessen Horowitz',
  'description': 'Steph Smith shares her experience working at Andreessen Horowitz, discussing the balance between the day-to-day normalcy and the occasional spectacular moments.',
  'tag': 'Stories'},
 {'topic_name': 'Importance of Office Presence',
  'description': 'Shaan Puri suggests the importance of being physically present in the office to seize opportunities.',
  't