In [1]:
from openai import OpenAI
import json

openai_client = OpenAI()

def llm(user_prompt, instructions=None, model="gpt-4o-mini"):
    """Send one prompt to the model and return the text of its reply.

    Args:
        user_prompt: Content of the single "user" message.
        instructions: Optional content for a leading "system" message. The
            system message is left out entirely when this is falsy
            (None or an empty string).
        model: Model name forwarded to `openai_client.responses.create`.

    Returns:
        The `output_text` attribute of the response object.
    """
    # The system message, when present, must come before the user message.
    system_part = (
        [{"role": "system", "content": instructions}] if instructions else []
    )
    conversation = system_part + [{"role": "user", "content": user_prompt}]

    reply = openai_client.responses.create(model=model, input=conversation)
    return reply.output_text

In [2]:
import requests
import os
import asyncio

async def fetch_podcast_markdown_files(timeout=30):
    """
    Download all .md files from the DataTalksClub Podcast GitHub repo
    (https://github.com/DataTalksClub/datatalksclub.github.io/tree/main/_podcast).

    The directory listing is read from the GitHub contents API and every
    listed file whose name ends with ".md" and does not start with "_" is
    then fetched from its `download_url`, one after another.

    NOTE: the HTTP calls use the blocking `requests` library, so although
    this is a coroutine it does not yield control while downloading.

    Args:
        timeout: Seconds passed as `timeout` to every HTTP request so a
            stalled connection raises instead of hanging indefinitely.

    Returns:
        List[dict]: One dict per markdown file, in listing order. Each dict
        has two entries: the file name mapped to the file's text, and
        'URL' mapped to the URL the text was downloaded from.

    Raises:
        requests.HTTPError: If the listing or any download responds with an
            error status.
        requests.Timeout: If a request exceeds `timeout`.
    """
    api_url = "https://api.github.com/repos/DataTalksClub/datatalksclub.github.io/contents/_podcast"
    contents = []

    # A single session lets the many per-file downloads reuse connections.
    with requests.Session() as session:
        listing = session.get(api_url, timeout=timeout)
        listing.raise_for_status()
        md_files = [
            f for f in listing.json()
            if f['name'].endswith('.md') and not f['name'].startswith('_')
        ]

        for f in md_files:
            download_url = f['download_url']
            resp = session.get(download_url, timeout=timeout)
            resp.raise_for_status()
            # The file name is inserted first on purpose: callers identify
            # the episode by taking the first key of this dict.
            contents.append({f['name']: resp.text, 'URL': download_url})

    return contents

# Example usage: fetch every episode file once and keep the result for the
# cells below.
# NOTE(review): top-level `await` depends on the notebook's async-aware cell
# execution; this line is a syntax error in a plain .py script.
podcast_md_list = await fetch_podcast_markdown_files()


In [3]:
# Download the data (only for podcasts). How many records are there?
# Each list entry corresponds to one downloaded .md episode file, so the
# length is the number of episode files fetched.
len(podcast_md_list)

186

In [4]:
# Peek at one episode: its file name first, then the raw markdown stored
# under that name.
sample_episode = podcast_md_list[80]
sample_name = list(sample_episode.keys())[0]  # first key is the file name
print(sample_name)
print(sample_episode[sample_name])

s09e08-from-open-source-maintainer-to-founder.md
---
episode: 8
guests:
- willmcgugan
ids:
  anchor: From-Open-Source-Maintainer-to-Founder---Will-McGugan-e1kqtu5
  youtube: bwfR9dyxf1M
image: images/podcast/s09e08-from-open-source-maintainer-to-founder.jpg
links:
  anchor: https://anchor.fm/datatalksclub/episodes/From-Open-Source-Maintainer-to-Founder---Will-McGugan-e1kqtu5
  apple: https://podcasts.apple.com/us/podcast/designing-a-data-science-organization-lisa-cohen/id1541710331?i=1000569172916
  spotify: https://open.spotify.com/episode/4JAwU2jQuXu4MoMucsE899?si=6ed45b98dd4a415a
  youtube: https://www.youtube.com/watch?v=bwfR9dyxf1M
season: 9
short: From Open-Source Maintainer to Founder
title: From Open-Source Maintainer to Founder
transcript:
- line: This week, we'll talk about working on open source. We have a special guest
    today, Will. Will is a software engineer and author. He's quite an enthusiastic
    developer of open source software and he is the creator of some very 

## Chunking

In [5]:
def sliding_window(seq, size, step):
    """Split a string into overlapping, whitespace-delimited word windows.

    Args:
        seq: Text to chunk; it is tokenised with `str.split()`.
        size: Maximum number of words per window.
        step: Number of words the window start advances each time.

    Returns:
        A list of strings, each the window's words re-joined with single
        spaces. The final window may hold fewer than `size` words. An input
        with no words yields an empty list.

    Raises:
        ValueError: If `size` or `step` is not positive.
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    tokens = seq.split()
    total = len(tokens)
    windows = []

    for start in range(0, total, step):
        end = start + size
        windows.append(" ".join(tokens[start:end]))
        if end >= total:
            # This window already reaches the last word; any further window
            # would only repeat a tail that is already covered.
            break

    return windows




In [6]:
# Flatten every episode into overlapping word windows collected in one list.
# `i` is a running counter, so each chunk's id is unique across episodes and
# equals the chunk's position in `all_chunks`.
all_chunks = []
i = 0 
for episode in podcast_md_list:
    # The first key of an episode dict is the file name and its value is the
    # file's full text; the 'URL' key holds the download link.
    key = list(episode.keys())[0]
    # Despite the name, this is the whole file text, not yet chunked. The
    # entire file is windowed, so front matter ends up in chunks as well.
    chunks_episode = episode[key]
    # 30-word windows advanced 15 words at a time: neighbouring chunks share
    # half of their words.
    for chunk in sliding_window(chunks_episode, 30, 15):
        all_chunks.append({'id': i, 'episode': key, 'text': chunk, 'URL': episode['URL']})
        i += 1
    
all_chunks[0]

{'id': 0,
 'episode': 's01e01-roles.md',
 'text': '--- title: "Data Team Roles Explained" short: "Roles in a Data Team" guests: [alexeygrigorev] image: images/podcast/s01e01-roles.jpg keywords: "data team roles, data scientist, data engineer, machine learning engineer, data analyst, MLOps',
 'URL': 'https://raw.githubusercontent.com/DataTalksClub/datatalksclub.github.io/main/_podcast/s01e01-roles.md'}

## RAG

In [7]:
from minsearch import Index

# Build the search index over the chunk records. Only the "text" field is
# declared as a text field; id, episode and URL are not listed.
index = Index(text_fields=["text"])
index.fit(all_chunks)

<minsearch.minsearch.Index at 0x7f7395e541a0>

In [8]:
# Quick check of the index: request 5 results for a sample question.
# NOTE(review): `results` is not displayed or used in the cells shown here.
results = index.search("how do I make money with AI?", num_results=5)

In [13]:


def search(query, num_results=15):
    """Search the chunk index for documents relevant to `query`.

    Args:
        query: Free-text search query forwarded to `index.search`.
        num_results: Number of results to request from the index. Defaults
            to 15, the value that used to be hard-coded here.

    Returns:
        Whatever `index.search` returns; `build_prompt` consumes it as an
        iterable of dict-like results.
    """
    return index.search(
        query=query,
        num_results=num_results
    )

# System prompt for the answer step. The result structure described here has
# to match the tags that build_prompt() emits for each search result,
# including the <URL> tag the model is asked to cite.
instructions = """
Answer the QUESTION based on the CONTEXT from podcast transcripts.

The CONTEXT contains multiple results. Each result has:
- <metadata>: information about the source
  - <episode>: the episode filename
  - <chunk_id>: the unique chunk identifier
  - <URL>: the direct link to the episode file
- <text>: the relevant transcript excerpt

Use only the facts from the CONTEXT when answering the QUESTION.

When citing information, reference the chunk_id it came from and mention the episode.
When citing information, also mention the URL of the episode so it can be accessed directly.
If the question is discussed in multiple results, cite all of them.

Don't use markdown or any formatting in the output.
""".strip()

def build_prompt(question, search_results):
    """Render the question and the search results as one tagged prompt.

    Args:
        question: Text placed inside the <QUESTION> section.
        search_results: Iterable of dict-like results. The keys 'episode',
            'id' and 'URL' fall back to 'unknown' when missing, and 'text'
            falls back to an empty string.

    Returns:
        A string with a <QUESTION> section followed by a <CONTEXT> section.
        The context holds one <result> element per search result, numbered
        from 1 and separated by blank lines.
    """

    def render(position, hit):
        # One <result> element; the transcript text sits unindented between
        # its <text> tags.
        lines = [
            f'<result id="{position}">',
            "  <metadata>",
            f"    <episode>{hit.get('episode', 'unknown')}</episode>",
            f"    <chunk_id>{hit.get('id', 'unknown')}</chunk_id>",
            f"    <URL>{hit.get('URL', 'unknown')}</URL>",
            "  </metadata>",
            "  <text>",
            f"{hit.get('text', '')}",
            "  </text>",
            "</result>",
        ]
        return "\n".join(lines)

    context = "\n\n".join(
        render(position, hit)
        for position, hit in enumerate(search_results, 1)
    )

    return f"<QUESTION>\n{question}\n</QUESTION>\n\n<CONTEXT>\n{context}\n</CONTEXT>"

def rag(query):
    """Answer `query` from retrieved transcript chunks.

    Runs `search` for the query, formats the hits with `build_prompt`, and
    sends the prompt to `llm` with the module-level `instructions` as the
    system message.

    Returns:
        The value returned by `llm`.
    """
    return llm(
        build_prompt(query, search(query)),
        instructions=instructions,
    )

In [14]:
# End-to-end run of the pipeline: search, build the prompt, ask the model,
# then print the returned answer text.
answer = rag("how do I make money with AI?")
print(answer)

To make money with AI, consider the following approaches:

1. **Develop and Sell AI-Based Products**: Create applications or tools that leverage AI technologies and sell them to consumers or businesses. This could include AI software solutions, chatbots, or predictive analytics tools (s02e09, chunk_id 5019, available at https://raw.githubusercontent.com/DataTalksClub/datatalksclub.github.io/main/_podcast/s02e09-roles-skills-monetizing-ml.md).

2. **Offer Consulting Services**: If you have expertise in AI, you can provide consulting services to businesses looking to implement AI solutions in their operations (s04e07, chunk_id 19385, available at https://raw.githubusercontent.com/DataTalksClub/datatalksclub.github.io/main/_podcast/s04e07-launching-a-startup.md).

3. **Content Creation**: Share your knowledge of AI through content creation, such as YouTube videos or writing books, which can generate advertising revenue or sales (s06e01, chunk_id 28033, available at https://raw.githubuserc

In [15]:
# Look up the chunk cited as chunk_id 5019 in the answer above to check the
# citation. Indexing by id works because ids equal positions in the list.
all_chunks[5019]

{'id': 5019,
 'episode': 's02e09-roles-skills-monetizing-ml.md',
 'text': "I'm far more valuable. I understand how to apply research, how to take something that's new, build it, and make money with it. who: Vin - line: You also have",
 'URL': 'https://raw.githubusercontent.com/DataTalksClub/datatalksclub.github.io/main/_podcast/s02e09-roles-skills-monetizing-ml.md'}

In [11]:

# Second end-to-end question. The query is spelled correctly so that its
# words can match the vocabulary of the indexed transcript chunks.
answer = rag("What are the latest advancements in LLM?")
print(answer)

The latest advancements in Large Language Models (LLMs) include several notable areas:

1. **Integration with Database Technologies**: LLMs are being used alongside vector databases to enhance data retrieval and processing. For instance, an LLM can take user queries in plain English, translate them into SQL, and interact with structured data efficiently (result 11, episode s19e01). 

2. **Contextual Output Engineering**: There is a focus on influencing LLM outputs by carefully engineering the context in which queries are made. This helps produce more meaningful results, demonstrating advancements in how LLMs can process and understand inputs (result 8, episode s22e01).

3. **Embedding and Feature Extraction**: The use of LLMs for embedding data chunks allows for effective feature extraction, creating a foundation for improved performance in various applications (result 10, episode s18e02).

4. **Personalization and Knowledge Utilization**: LLMs are evolving to utilize personal data (su