## Downloading and Transcribing a YouTube Clip

In [24]:
from faster_whisper import WhisperModel
import yt_dlp
import os

In [25]:
def download_audio(youtube_url, out_dir="downloads"):
    """Download a YouTube video's audio track and convert it to MP3.

    The best available audio stream is fetched with yt-dlp and handed to its
    ``FFmpegExtractAudio`` post-processor, which produces ``<video id>.mp3``
    inside ``out_dir``.

    Args:
        youtube_url: URL of the video to download.
        out_dir: Directory that receives the MP3; created if it does not exist.

    Returns:
        A ``(mp3_path, video_id)`` tuple.
    """
    os.makedirs(out_dir, exist_ok=True)
    ydl_opts = {
        'format': 'bestaudio/best',
        # os.path.join (instead of f'{out_dir}/...') avoids a doubled
        # separator when out_dir already ends with one.
        'outtmpl': os.path.join(out_dir, '%(id)s.%(ext)s'),
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
        }]
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(youtube_url, download=True)

    video_id = info['id']
    # The post-processor replaces the downloaded container's extension with
    # .mp3, so the final path is rebuilt here rather than read from `info`.
    return os.path.join(out_dir, f"{video_id}.mp3"), video_id
    
def transcribe(audio_path, model_size="medium", output_dir="transcripts"):
    """Transcribe an audio file with faster-whisper and save the text to disk.

    One line per transcribed segment is written to
    ``<output_dir>/<audio file name without its extension>.txt``.

    Args:
        audio_path: Path of the audio file to transcribe.
        model_size: Model size handed to ``WhisperModel`` (e.g. "small", "medium").
        output_dir: Directory that receives the transcript; created if missing.

    Returns:
        None. The transcript location is printed.
    """
    model = WhisperModel(model_size, compute_type="int8")
    segments, _ = model.transcribe(audio_path)
    os.makedirs(output_dir, exist_ok=True)

    # Collect every segment before the output file is opened, so a failure
    # while iterating does not leave a half-written transcript behind.
    text_chunks = [
        {"start": seg.start, "end": seg.end, "text": seg.text}
        for seg in segments
    ]

    # splitext swaps only the final extension: str.replace(".mp3", ".txt")
    # rewrote every ".mp3" occurrence and left non-MP3 names untouched.
    stem = os.path.splitext(os.path.basename(audio_path))[0]
    output_path = os.path.join(output_dir, f"{stem}.txt")

    # Explicit UTF-8 so non-ASCII transcripts do not depend on the platform's
    # default encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(f"{chunk['text']}\n" for chunk in text_chunks)

    print(f"Transcription saved to: {output_path}")
    return None

In [26]:
# Fetch the clip's audio as MP3; the cell displays the returned (mp3_path, video_id) tuple.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
download_audio(
    'https://www.youtube.com/watch?v=EKkFOMzwMgc',
    out_dir='/Users/jpoberhauser/Desktop/baseballCompanion/data/',
)

[youtube] Extracting URL: https://www.youtube.com/watch?v=EKkFOMzwMgc
[youtube] EKkFOMzwMgc: Downloading webpage
[youtube] EKkFOMzwMgc: Downloading tv client config
[youtube] EKkFOMzwMgc: Downloading tv player API JSON
[youtube] EKkFOMzwMgc: Downloading ios player API JSON
[youtube] EKkFOMzwMgc: Downloading m3u8 information
[info] EKkFOMzwMgc: Downloading 1 format(s): 251
[download] Destination: /Users/jpoberhauser/Desktop/baseballCompanion/data//EKkFOMzwMgc.webm
[download] 100% of   50.77MiB in 00:00:01 at 33.38MiB/s    
[ExtractAudio] Destination: /Users/jpoberhauser/Desktop/baseballCompanion/data//EKkFOMzwMgc.mp3
Deleting original file /Users/jpoberhauser/Desktop/baseballCompanion/data//EKkFOMzwMgc.webm (pass -k to keep)


('/Users/jpoberhauser/Desktop/baseballCompanion/data//EKkFOMzwMgc.mp3',
 'EKkFOMzwMgc')

This example is a 24-minute YouTube clip. It gets completely transcribed in ~13 minutes using the medium model, and in ~5 minutes using the small model.

In [27]:
# Transcribe the downloaded MP3 with the "small" model; the transcript lands in output_dir.
# NOTE(review): absolute, machine-specific paths - consider a configurable data directory.
transcribe(
    '/Users/jpoberhauser/Desktop/baseballCompanion/data/EKkFOMzwMgc.mp3',
    model_size="small",
    output_dir="/Users/jpoberhauser/Desktop/baseballCompanion/data/transcripts/",
)

Transcription saved to: /Users/jpoberhauser/Desktop/baseballCompanion/data/transcripts/EKkFOMzwMgc.txt


### Let's generate and store embeddings 

In [41]:
# !pip install sentence-transformers faiss-cpu
import faiss
import pickle

In [None]:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
# "all-MiniLM-L6-v2" yields 384-dimensional embeddings (see the shape printed
# further down); the checkpoint is downloaded on first use.
model = SentenceTransformer("all-MiniLM-L6-v2")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [32]:
# The sentences to encode
# Two weather-related sentences and one unrelated sentence, so the pairwise
# similarities computed later have an obvious expected ordering.
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

# 2. Calculate embeddings by calling model.encode()
# The result has one row per sentence and one column per embedding dimension.
embeddings = model.encode(sentences)
print(embeddings.shape)

(3, 384)


In [33]:
# Show the raw float32 embedding matrix (NumPy elides the middle columns).
embeddings

array([[ 0.01919569,  0.12008531,  0.15959838, ..., -0.00536285,
        -0.08109499,  0.05021335],
       [-0.01869035,  0.04151865,  0.0743155 , ...,  0.00486595,
        -0.06190439,  0.0318751 ],
       [ 0.13650198,  0.08227322, -0.02526161, ...,  0.08762044,
         0.03045845, -0.01075751]], shape=(3, 384), dtype=float32)

In [34]:
# `embeddings` has shape [3, 384]: 3 sentences x 384 dimensions

# 3. Calculate the embedding similarities
# Comparing the embeddings against themselves gives a symmetric 3x3 matrix of
# pairwise similarities (cosine by default in sentence-transformers), with 1.0
# on the diagonal.
similarities = model.similarity(embeddings, embeddings)
print(similarities)

tensor([[1.0000, 0.6660, 0.1046],
        [0.6660, 1.0000, 0.1411],
        [0.1046, 0.1411, 1.0000]])


## Compare `model.similarity` to FAISS

* Obviously it's overkill to use FAISS for three sentence embeddings, but we will need it for hundreds of thousands of text chunks in a real vector DB.

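* Just to make sure we get consistent results, we run the code below. Indeed, the first sentence's nearest neighbor (after itself) is the second one. Note that `IndexFlatL2` returns *squared Euclidean distances*, not similarities: smaller means closer. For unit-length embeddings the two are related by $d^2 = 2 - 2\cos\theta$, so a cosine similarity of 0.666 corresponds to a squared distance of $2 - 2 \cdot 0.666 \approx 0.668$. Both numbers being _around_ 0.66 is a coincidence of this particular pair (compare the third sentence: similarity 0.105 vs. distance 1.791).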

In [40]:
#### compare to FAISS
# IndexFlatL2 is an exact (brute-force) index that reports *squared* Euclidean
# distances, so smaller values mean "more similar" and 0 is an exact match.
# For unit-length vectors d^2 = 2 - 2*cos, which is how these distances map to
# the cosine similarities from model.similarity. Use IndexFlatIP on normalized
# vectors to get cosine similarity directly.
index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 = Euclidean, or use IndexFlatIP for cosine
index.add(embeddings)

# Query similarity from emb1
# search() expects a 2-D batch of queries, hence the reshape to (1, dim).
# D holds the distances and I the row indices of the k nearest neighbors,
# ordered from closest to farthest.
D, I = index.search(embeddings[0].reshape(1, -1), k=3)

print("Indices of nearest neighbors:", I)
print("Distances:", D)

Indices of nearest neighbors: [[0 1 2]]
Distances: [[0.         0.66808915 1.7908318 ]]


In [None]:
# Notebook-level encoder that build_vector_index reads as a global; same
# checkpoint name as the model loaded earlier in this notebook.
model = SentenceTransformer("all-MiniLM-L6-v2")
def build_vector_index(chunks, video_id, save_dir="faiss_index"):
    """Embed transcript chunks and persist a FAISS index plus its metadata.

    Two files are written into ``save_dir``:

    * ``<video_id>.index`` - a FAISS ``IndexFlatL2`` holding one vector per chunk.
    * ``<video_id>_meta.pkl`` - the pickled ``chunks``; vectors are added in
      chunk order, so row ``i`` of the index corresponds to ``chunks[i]``.

    Args:
        chunks: Sequence of dicts, each with a ``'text'`` entry to embed.
        video_id: Identifier used as the stem of both output file names.
        save_dir: Output directory; created if it does not exist.

    Raises:
        ValueError: If ``chunks`` is empty (there is nothing to embed and no
            embedding width to size the index with).
    """
    texts = [c['text'] for c in chunks]
    if not texts:
        # Fail early with a clear message instead of an opaque error on
        # embeddings.shape[1], and before anything is written to disk.
        raise ValueError("build_vector_index needs at least one chunk to embed")

    # `model` is the notebook-level SentenceTransformer instance.
    embeddings = model.encode(texts, show_progress_bar=True)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    os.makedirs(save_dir, exist_ok=True)
    # os.path.join avoids doubled separators when save_dir ends with one.
    faiss.write_index(index, os.path.join(save_dir, f"{video_id}.index"))
    with open(os.path.join(save_dir, f"{video_id}_meta.pkl"), "wb") as f:
        # NOTE: pickle files must only be loaded back from trusted locations.
        pickle.dump(chunks, f)