In [1]:
import requests
import json
from minsearch import Index

In [2]:
# GitHub repository coordinates and the directory prefix used below to
# select files from the repository tree.
owner, repo, path = "DataTalksClub", "datatalksclub.github.io", "_podcast"

In [3]:
# Ask GitHub for its JSON media type.
headers = {"Accept": "application/vnd.github+json"}
# Git Trees endpoint for the `main` branch; `recursive=1` requests the whole
# tree (all nested paths) in a single response.
url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/main?recursive=1"

In [4]:
# Fetch the repository tree; the timeout keeps the cell from hanging forever
# on a stalled connection.
response = requests.get(
    url,
    headers=headers,
    timeout=30,
)
# Turn 4xx/5xx answers into an exception instead of parsing an error payload.
response.raise_for_status()

In [5]:
# Decode the JSON body of the tree request.
data = response.json()
# Entries of the repository tree; falls back to an empty list when the
# payload has no "tree" key. Later cells read "path" and "type" from each entry.
items = data.get("tree", [])

In [6]:
# Keep only Markdown files that live under `path`, skipping anything whose
# path contains "template.md" (case-insensitive).
podcast_files = [
    blob_path
    # Inner generator: paths of file entries ("blob") only.
    for blob_path in (item["path"] for item in items if item["type"] == "blob")
    if blob_path.startswith(f"{path}/")
    and blob_path.endswith(".md")
    and "template.md" not in blob_path.lower()
]

In [7]:
# Sanity check: how many podcast files matched the filter.
podcast_file_count = len(podcast_files)
print(podcast_file_count)

184


In [8]:
# Print every matched path as a "- <path>" bullet, one per line.
for f in podcast_files:
    print(f"- {f}")

- _podcast/_s12e08.md
- _podcast/s01e01-roles.md
- _podcast/s01e02-processes.md
- _podcast/s01e03-building-ds-team.md
- _podcast/s01e04-standing-out-as-a-data-scientist.md
- _podcast/s01e05-mentoring.md
- _podcast/s02e01-writing.md
- _podcast/s02e02-developer-advocacy.md
- _podcast/s02e03-open-source.md
- _podcast/s02e04-mlops.md
- _podcast/s02e05-feature-stores.md
- _podcast/s02e06-decision-optimization.md
- _podcast/s02e07-abc-data-science.md
- _podcast/s02e08-personal-branding.md
- _podcast/s02e09-roles-skills-monetizing-ml.md
- _podcast/s02e10-public-speaking.md
- _podcast/s02e11-dataops.md
- _podcast/s02e12-communities.md
- _podcast/s03e01-from-pm-to-ds.md
- _podcast/s03e02-from-analytics-to-data-science.md
- _podcast/s03e03-data-observability.md
- _podcast/s03e04-effective-communication-with-business.md
- _podcast/s03e04-interviewing-300-data-scientists.md
- _podcast/s03e06-from-physics-to-machine-learning.md
- _podcast/s03e07-market-yourself.md
- _podcast/s03e08-data-led-profess

In [9]:
def sliding_window(seq, size, step):
    """Split a sequence into windows of up to ``size`` items, ``step`` apart.

    Windows start at offsets 0, step, 2*step, ... and are taken by slicing,
    so each window has the same type as ``seq``. Iteration stops as soon as a
    window reaches the end of the sequence, which means the final window may
    be shorter than ``size``. With ``step < size`` consecutive windows overlap.

    Args:
        seq: A sliceable sequence with a length (list, str, tuple, ...).
        size: Maximum number of items per window; must be positive.
        step: Distance between consecutive window starts; must be positive.

    Returns:
        A list of slices of ``seq``; empty when ``seq`` is empty.

    Raises:
        ValueError: If ``size`` or ``step`` is not positive.
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    length = len(seq)
    windows = []
    for start in range(0, length, step):
        stop = start + size
        windows.append(seq[start:stop])
        # This window already covers the tail; further windows would only
        # repeat a suffix of it.
        if stop >= length:
            break

    return windows

In [10]:
# Repository coordinates and the directory prefix to index.
OWNER = "DataTalksClub"
REPO = "datatalksclub.github.io"
PATH = "_podcast"

# Chunking parameters for sliding_window: window length and how many items
# consecutive windows share. The stride follows from the two.
SIZE = 30
OVERLAP = 15
STEP = SIZE - OVERLAP  # 15

# File the chunk list is written to.
OUTFILE = "podcast_chunks.json"


In [11]:
# Fetch the full recursive git tree of the `main` branch in one API call.
tree_response = requests.get(
    f"https://api.github.com/repos/{OWNER}/{REPO}/git/trees/main?recursive=1",
    headers={"Accept": "application/vnd.github+json"},
    timeout=30,
)
# Fail fast on HTTP errors (rate limit, 404, ...). Without this, an error
# payload has no "tree" key and the following cells would silently run on
# zero files.
tree_response.raise_for_status()
tree = tree_response.json()

# GitHub flags recursive listings that exceeded its size limit as truncated;
# surface that instead of silently working on a partial file list.
if tree.get("truncated"):
    print("WARNING: GitHub returned a truncated tree; some files may be missing.")

In [12]:
# Markdown files under PATH, excluding any path that contains "template.md"
# (case-insensitive).
files = [
    blob_path
    # Inner generator: paths of file entries ("blob") only; an absent "tree"
    # key is treated as an empty listing.
    for blob_path in (
        it["path"] for it in tree.get("tree", []) if it.get("type") == "blob"
    )
    if blob_path.startswith(f"{PATH}/")
    and blob_path.endswith(".md")
    and "template.md" not in blob_path.lower()
]

In [13]:
# Dry run: download every file and count how many chunks the current
# SIZE/OVERLAP settings would produce, without keeping the text.
total_chunks = 0
for f in files:
    raw_response = requests.get(
        f"https://raw.githubusercontent.com/{OWNER}/{REPO}/main/{f}", timeout=30
    )
    # Stop on a failed download; otherwise the HTTP error body would be
    # counted as if it were file content.
    raw_response.raise_for_status()
    raw = raw_response.text
    # Paragraphs are blank-line separated blocks; empty ones are dropped.
    paragraphs = [p.strip() for p in raw.split("\n\n") if p.strip()]
    total_chunks += len(sliding_window(paragraphs, SIZE, STEP))

print(f"Files: {len(files)}  |  Chunk size={SIZE}, overlap={OVERLAP}  |  Total chunks: {total_chunks}")

Files: 184  |  Chunk size=30, overlap=15  |  Total chunks: 207


In [14]:
# Build the chunk records: one dict per sliding window of paragraphs, tagged
# with the file it came from and its position within that file.
all_chunks = []
for f in files:
    raw_response = requests.get(
        f"https://raw.githubusercontent.com/{OWNER}/{REPO}/main/{f}", timeout=30
    )
    # Stop on a failed download; otherwise the HTTP error body would be
    # stored and indexed as if it were file content.
    raw_response.raise_for_status()
    raw = raw_response.text
    # Paragraphs are blank-line separated blocks; empty ones are dropped.
    paragraphs = [p.strip() for p in raw.split("\n\n") if p.strip()]
    for idx, chunk in enumerate(sliding_window(paragraphs, SIZE, STEP)):
        # Flatten the window's paragraphs into a single searchable string.
        text = " ".join(chunk)
        all_chunks.append({
            "source": f,
            "chunk_id": idx,
            "text": text
        })

In [15]:
# Persist the chunk records as pretty-printed UTF-8 JSON; ensure_ascii=False
# keeps non-ASCII characters readable instead of \u-escaping them.
with open(OUTFILE, "w", encoding="utf-8") as f:
    f.write(json.dumps(all_chunks, ensure_ascii=False, indent=2))

print(f"Saved {len(all_chunks)} chunks from {len(files)} files → {OUTFILE}")

Saved 207 chunks from 184 files → podcast_chunks.json


In [16]:
# Reload the chunk records from the file written above. Reading via OUTFILE
# (rather than a repeated literal) keeps save and load pointing at the same
# file if the constant is ever changed.
with open(OUTFILE, "r", encoding="utf-8") as f:
    chunks = json.load(f)

print(f"Loaded {len(chunks)} chunks")

Loaded 207 chunks


In [17]:
# Create a minsearch index that treats the "text" key of each record as its
# text field.
index = Index(text_fields=["text"])
# Feed the loaded chunk records to the index. This is the cell's last
# expression, so its return value (an Index object) is what the cell displays.
index.fit(chunks)

<minsearch.minsearch.Index at 0x76d261993da0>

In [18]:
# Example question to run against the index, and how many hits to keep.
query = "how do I make money with AI?"
num_results = 5
results = index.search(query, num_results=num_results)

In [19]:
# Show the query followed by each hit's source file and the first 180
# characters of its text.
print(f"\n Query: {query}\n")
for r in results:
    source = r["source"]
    print(f"Source: {source}")
    snippet = r["text"][:180]
    print(f"Snippet: {snippet}...\n")


 Query: how do I make money with AI?

Source: _podcast/s20e01-trends-in-ai-infrastructure.md
Snippet: ---
episode: 1
guests:
- andreycheptsov
ids:
  anchor: atalksclub/episodes/Redefining-AI-Infrastructure-Open-Source--Chips--and-the-Future-Beyond-Kubernetes--Andrey-Cheptsov-e2u7lc...

Source: _podcast/s15e02-investing-in-open-source-data-tools.md
Snippet: ---
episode: 2
guests:
- belawiertz
ids:
  anchor: atatalksclub/episodes/Investing-in-Open-Source-Data-Tools---Bela-Wiertz-e274dr8
  youtube: 7Bg1JQLnCao
image: images/podcast/s15e...

Source: _podcast/s18e03-ai-for-ecology-biodiversity-and-conservation.md
Snippet: ---
episode: 3
guests:
- tanyabergerwolf
ids:
  anchor: atatalksclub/episodes/AI-for-Ecology--Biodiversity--and-Conservation---Tanya-Berger-Wolf-e2inadi
  youtube: 30tTrozbAkg
imag...

Source: _podcast/s10e09-responsible-and-explainable-ai.md
Snippet: ---
episode: 9
guests:
- supreetkaur
ids:
  anchor: Responsible-and-Explainable-AI---Supreet-Kaur-e1o6mgj
  youtube: 8Eb5m

In [20]:
# Report the source file of the first hit, if the search returned anything.
if results:
    first = next(iter(results))
    first_source = first["source"]
    print("✅ FIRST EPISODE RESULT:")
    print(first_source)

✅ FIRST EPISODE RESULT:
_podcast/s20e01-trends-in-ai-infrastructure.md
