In [16]:
import os
from urllib.parse import quote
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.proxies import WebshareProxyConfig
import requests
import yaml

In [2]:
PROXY_USER = os.environ["WEBSHARE_PROXY_USER"]
PROXY_PASS = quote(os.environ["WEBSHARE_PROXY_PASS"])  

In [3]:
ytt_api = YouTubeTranscriptApi(
    proxy_config=WebshareProxyConfig(
        proxy_username=PROXY_USER,
        proxy_password=PROXY_PASS,
        filter_ip_locations=["de", "us"],
    )
)

In [4]:
video_id = '4bl2TSHD_Fc'
transcript = ytt_api.fetch(video_id)

In [5]:
transcript[10]

FetchedTranscriptSnippet(text='subscribe to our YouTube channel. Very', start=19.359, duration=3.361)

In [6]:
def format_timestamp(seconds: float) -> str:
    total_seconds = int(seconds)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)

    if hours == 0:
        return f"{minutes}:{secs:02}"
    return f"{hours}:{minutes:02}:{secs:02}"

def make_subtitles(transcript) -> str:
    lines = []

    for entry in transcript:
        ts = format_timestamp(entry.start)
        text = entry.text.replace('\n', ' ')
        lines.append(ts + ' ' + text)

    return '\n'.join(lines)

In [7]:
subtitles = make_subtitles(transcript)

In [8]:
print(subtitles)

0:00 Hi everyone, welcome to our event. This
0:03 event is brought to you by data talks
0:04 club which is a community of people who
0:06 love data. We have weekly events. Today
0:08 is one of such events. If you want to
0:10 find out more about the events we have,
0:11 there is a link in the description. Go
0:13 there, click on that link and you'll see
0:15 you will see all the event we have in
0:17 our pipeline. Then do not forget to
0:19 subscribe to our YouTube channel. Very
0:20 important. This way you'll get notified
0:22 about all the futures we have. And last
0:25 but not least, we have an amazing Slack
0:27 community where you can hang out with
0:28 other data enthusiasts. There is a link
0:30 in the description. Click on that link.
0:32 Uh sign up and then you'll get invite uh
0:35 you'll get invited to our Slack. During
0:38 today's interview, you can ask any
0:39 question you want. There is a pinned
0:41 link in the live chat. Click on that
0:43 link, ask your questions, an

In [9]:
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

In [10]:
#Creating index with custom analyzers

stopwords = [
    "a","about","above","after","again","against","all","am","an","and","any",
    "are","aren","aren't","as","at","be","because","been","before","being",
    "below","between","both","but","by","can","can","can't","cannot","could",
    "couldn't","did","didn't","do","does","doesn't","doing","don't","down",
    "during","each","few","for","from","further","had","hadn't","has","hasn't",
    "have","haven't","having","he","he'd","he'll","he's","her","here","here's",
    "hers","herself","him","himself","his","how","how's","i","i'd","i'll",
    "i'm","i've","if","in","into","is","isn't","it","it's","its","itself",
    "let's","me","more","most","mustn't","my","myself","no","nor","not","of",
    "off","on","once","only","or","other","ought","our","ours","ourselves",
    "out","over","own","same","shan't","she","she'd","she'll","she's","should",
    "shouldn't","so","some","such","than","that","that's","the","their",
    "theirs","them","themselves","then","there","there's","these","they",
    "they'd","they'll","they're","they've","this","those","through","to",
    "too","under","until","up","very","was","wasn't","we","we'd","we'll",
    "we're","we've","were","weren't","what","what's","when","when's","where",
    "where's","which","while","who","who's","whom","why","why's","with",
    "won't","would","wouldn't","you","you'd","you'll","you're","you've",
    "your","yours","yourself","yourselves",
    "get"
]

index_settings = {
    "settings": {
        "analysis": {
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords": stopwords
                },
                "english_stemmer": {
                    "type": "stemmer",
                    "language": "english"
                },
                "english_possessive_stemmer": {
                    "type": "stemmer",
                    "language": "possessive_english"
                }
            },
            "analyzer": {
                "english_with_stop_and_stem": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "english_possessive_stemmer",
                        "english_stop",
                        "english_stemmer"
                    ]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "title": {
                "type": "text",
                "analyzer": "english_with_stop_and_stem",
                "search_analyzer": "english_with_stop_and_stem"
            },
            "subtitles": {
                "type": "text",
                "analyzer": "english_with_stop_and_stem",
                "search_analyzer": "english_with_stop_and_stem"
            }
        }
    }
}

index_name = "podcasts"
    
es.indices.create(index=index_name, body=index_settings)

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [podcasts/BidIlk_3SK-AIiMhXzCo-Q] already exists')

In [11]:
#Document indexing 

doc = {
    "video_id": video_id,
    "title": "Building Pet Health Tech",
    "subtitles": subtitles
}

es.index(index="podcasts", id=video_id, document=doc)

ObjectApiResponse({'_index': 'podcasts', '_id': '4bl2TSHD_Fc', '_version': 3, 'result': 'updated', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 2})

In [12]:
# Query / Search
# Return the top `size` (5) most relevant documents, ranked by Elasticsearch relevance (_score).
#
# How Elasticsearch chooses the top 5:
# - Analyze the query text ("how to understand dogs"):
#     • remove stop words (e.g. "how", "to")
#     • apply stemming (e.g. "understanding" → "understand", "dogs" → "dog")
# - Search across fields:
#     • title (boosted x3, so matches here are more important)
#     • subtitles
# - Compute a relevance score (_score) for each matching document
# - Sort documents by _score (highest first)
# - Return only the top 20 documents

def search_videos(query: str, size: int = 5):
    body = {
        "size": size,
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["title^3", "subtitles"], #if query matches the title it's 3 times more important 
                "type": "best_fields",
                "analyzer": "english_with_stop_and_stem" 
            }
        },
        "highlight": {
            "pre_tags": ["*"],
            "post_tags": ["*"],
            "fields": {
                "title": {
                    "fragment_size": 150,
                    "number_of_fragments": 1
                },
                "subtitles": {
                    "fragment_size": 150,
                    "number_of_fragments": 1 
                }
            }
        } #Extracts and returns small text snippets where the query matched
    }
    
    response = es.search(index="podcasts", body=body)
    hits = response.body['hits']['hits']
    
    results = []
    for hit in hits:
        highlight = hit['highlight']
        highlight['video_id'] = hit['_id']
        results.append(highlight)

    return results

In [13]:
results = search_videos("how to understand dogs")

In [14]:
results

[{'subtitles': ['So you can *understand* if this is\n36:07 uh if the *dog* is sleeping, if the *dog* is\n36:09 moving, if the *dog* is playing other\n36:11 things.'],
  'video_id': '4bl2TSHD_Fc'}]

In [49]:
url = 'https://raw.githubusercontent.com/DataTalksClub/datatalksclub.github.io/refs/heads/main/_data/events.yaml'
#url = 'https://raw.githubusercontent.com/DataTalksClub/datatalksclub.github.io/187b7d056a36d5af6ac33e4c8096c52d13a078a7/_data/events.yaml'

content = requests.get(url).content

In [50]:
events_data = yaml.load(content, yaml.SafeLoader)

In [51]:
podcasts = [d for d in events_data if (d.get('type') == 'podcast') and (d.get('youtube'))]

In [52]:
print(f"Found {len(podcasts)} podcasts")

Found 196 podcasts


In [53]:
videos = []

for podcast in podcasts:
    _, video_id = podcast['youtube'].split('watch?v=')

    # Skip problematic videos
    if video_id in ['FRi0SUtxdMw', 's8kyzy8V5b8']:
        continue

    videos.append({
        'title': podcast['title'],
        'video_id': video_id
    })

print(f"Will process {len(videos)} videos")

Will process 194 videos


In [54]:
!uv add tqdm

[2K[2mResolved [1m106 packages[0m [2min 365ms[0m[0m                                       [0m
[2K[2mPrepared [1m1 package[0m [2min 56ms[0m[0m                                               
         If the cache and target directories are on different filesystems, hardlinking may not be supported.
[2K[2mInstalled [1m1 package[0m [2min 7ms[0m[0m                                  [0m
 [32m+[39m [1mtqdm[0m[2m==4.67.1[0m


In [55]:
from tqdm.auto import tqdm

In [64]:
for video in tqdm(videos):
    video_id = video['video_id']
    video_title = video['title']

    if es.exists(index='podcasts', id=video_id):
        print(f'already processed {video_id}')
        continue

    transcript = ytt_api.fetch(video_id)
    subtitles = make_subtitles(transcript)

    doc = {
        "video_id": video_id,
        "title": video_title,
        "subtitles": subtitles
    }
    
    es.index(index="podcasts", id=video_id, document=doc)

  0%|          | 0/194 [00:00<?, ?it/s]

already processed 4bl2TSHD_Fc
