<a href="https://colab.research.google.com/github/imusicmash/stanford_llm_python/blob/main/LLM_RAG_books_from_podcast_transcript.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# code to find books mentioned in podcast transcipts, using llama_index and openai
# using concepts we learned in LLM4BIZ Stanford class
# author: Al Nevarez, March 2024

In [1]:
# install openai and llama-index
!pip install openai
!pip install llama-index --upgrade

Collecting openai
  Downloading openai-1.13.3-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/227.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.4/227.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.4-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.8/77.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py

In [2]:
from openai import OpenAI
from google.colab import userdata

open_ai_key = userdata.get('openai')
client = OpenAI(api_key=open_ai_key)

In [3]:
import os
os.environ["OPENAI_API_KEY"] = open_ai_key

In [4]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

In [24]:
# Find books mentioned in a single podcast transcript.. a test to see if we can extract this easily.
# the following line provided in class doesn't exactly work, it seems to read all the files in the directory
# documents = SimpleDirectoryReader("./").load_data("lennypodcast_20240303.txt")

# tried this way instead to read one file and it seems better, works now to find the books in THIS transcript
reader = SimpleDirectoryReader(
    input_files=["lennypodcast_20240303.txt"]
)
documents = reader.load_data()

In [20]:
# commented out for now the alter the chunk size to try to extract books
# service_context = ServiceContext.from_defaults(chunk_size=800)
index = VectorStoreIndex.from_documents(
    documents, #service_context=service_context
)
query_engine = index.as_query_engine()

response = query_engine.query(
    "What are the books and their author mentioned in this podcast?"
)
print(response)

The books mentioned in the podcast are "The Dream Machine" by J.C.R Licklider and "Good Inside" by Dr. Becky.


In [6]:
# now find books from multiple transcript files
# this is setup to pull in the 4 files.
# note you can also use SimpleDirectoryReader and set multiple file paths
# documents = SimpleDirectoryReader(input_dir="./", required_exts=[".txt"])

# List of file paths
file_paths = ["lennypodcast_20240303.txt", "lennypodcast_20240225.txt", "lennypodcast_20240222.txt", "lennypodcast_20240218.txt"]
#file_paths = ["lennypodcast_20240303.txt", "lennypodcast_20240225.txt"]
#file_paths = ["lennypodcast_20240303.txt"]
#file_paths = ["lennypodcast_20240222.txt"]

reader = SimpleDirectoryReader(input_files = file_paths)
#reader = SimpleDirectoryReader(input_files = ["./lennypodcast_20240303.txt", "./lennypodcast_20240225.txt"])

documents = reader.load_data()


In [19]:
# try adding chunking
from llama_index.core import ServiceContext

service_context = ServiceContext.from_defaults(chunk_size=1024)

  service_context = ServiceContext.from_defaults(chunk_size=1024)


In [20]:
# create an index and send a query to find the books
# service_context = ServiceContext.from_defaults(chunk_size=300)

index = VectorStoreIndex.from_documents(
    documents, service_context=service_context
)
query_engine = index.as_query_engine()

response = query_engine.query(
    "What are the books mentioned in these podcasts.  You should find more than 3 books."
)
print(response)

The books mentioned in these podcasts are not explicitly stated.


In [30]:
# This isn't working when i try with multiple files..
# the response is always:
# The books mentioned in these podcasts are not explicitly stated.


In [21]:
# March 5
# asking other questions about the wisdom shared in those 4 podcasts
# ok.. this does a reasonable job, but why only from 1 doc?
response = query_engine.query(
    "What are some product strategies mentioned in the podcasts? Please list them here in a bulleted list. Use all 4 files. Include the strategy and the filename you found that in."
)
print(response)

- Focus on creating products that are well-designed and deliver high quality, like the folding phone mentioned in lennypodcast_20240218.txt
- Utilize AI tools to enhance productivity and content creation, such as the Opus Clip app for making video clips, as discussed in lennypodcast_20240218.txt
- Wait for the right timing to release a product, even if it means delaying its launch, as seen with the movie Maverick mentioned in lennypodcast_20240218.txt
- Incorporate user feedback and ensure products work better than expected to delight users, similar to the software that ties everything together seamlessly, as highlighted in lennypodcast_20240218.txt
