## Semantic search

In this notebook I am going to put the pieces I already have together - transcription, alignment and topic segmentation and I will try out semantic search on top of that.

I will use a random aduio file in english.

In [None]:
import whisperx
import time
import librosa
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import spacy

# 1. Audio transcription

audio_file = "En.m4a"
device = "cpu"

model_whisper = whisperx.load_model("small", device=device, compute_type="float32")
start_time = time.time()
result = model_whisper.transcribe(audio_file)
end_time = time.time()

alignment_model, align_metadata = whisperx.load_align_model(
    language_code=result["language"], device=device
)

audio, sr = librosa.load(audio_file, sr=16000, mono=True)
duration_sec = len(audio) / sr
print(f"Loaded audio sr={sr}, duration={duration_sec:.2f}s")

result_aligned = whisperx.align(
    transcript=result["segments"],
    model=alignment_model,
    align_model_metadata=align_metadata,
    audio=audio,
    device=device,
    return_char_alignments=False
)

# 2. Prepare transcript & sentences

word_segments = result_aligned["word_segments"]
for word in word_segments:
    print(f"[{word['start']:.2f} - {word['end']:.2f}] {word['word'].strip()}")

transcript_text = " ".join([w['word'].strip() for w in word_segments])
transcript_text = " ".join(transcript_text.split())
print("\nFull Transcript:\n", transcript_text)

sentences = [s.strip() for s in transcript_text.replace("?", ".").replace("!", ".").split(".") if s.strip()]
print(f"\nNumber of sentences: {len(sentences)}")

# 3. Sentence embeddings

model_sent = SentenceTransformer("all-mpnet-base-v2")
embeddings = model_sent.encode(sentences, batch_size=8, show_progress_bar=True)
print("Embeddings shape:", embeddings.shape)

# 4. Adaptive topic segmentation

def adaptive_threshold_segmentation(embeddings, method="std", min_size=2, std_factor=1.0, percentile=20):
    num_sentences = embeddings.shape[0]
    sims = [cosine_similarity(embeddings[i-1].reshape(1,-1), embeddings[i].reshape(1,-1))[0][0] 
            for i in range(1, num_sentences)]
    sims = np.array(sims)

    if method == "std":
        threshold = sims.mean() - std_factor * sims.std()
    elif method == "percentile":
        threshold = np.percentile(sims, percentile)
    else:
        raise ValueError("method must be 'std' or 'percentile'")

    pred_segments = [0]
    current_segment = 0
    last_boundary = 0

    for i in range(1, num_sentences):
        sim = cosine_similarity(embeddings[i-1].reshape(1,-1), embeddings[i].reshape(1,-1))[0][0]
        if sim < threshold and (i - last_boundary) >= min_size:
            current_segment += 1
            last_boundary = i
        pred_segments.append(current_segment)

    return pred_segments, threshold

pred_segments, used_threshold = adaptive_threshold_segmentation(
    embeddings, method="percentile", percentile=30, min_size=2
)
print("Adaptive threshold used:", used_threshold)
print("Predicted segments:", pred_segments)


# 5. Extract topic phrases

nlp = spacy.load("en_core_web_sm")

def segment_topic_phrase(sentences, segments, sentence_embeddings, top_n=1):
    segment_dict = {}
    unique_segments = sorted(set(segments))

    for seg_id in unique_segments:
        indices = [i for i, seg in enumerate(segments) if seg == seg_id]
        if not indices:
            continue

        seg_emb = sentence_embeddings[indices].mean(axis=0, keepdims=True)
        seg_text = " ".join([sentences[i] for i in indices])
        doc = nlp(seg_text)
        candidates = [chunk.text for chunk in doc.noun_chunks if len(chunk.text.split()) <= 4]

        if not candidates:
            segment_dict[seg_id] = ["[no phrase found]"]
            continue

        candidate_embeddings = model_sent.encode(candidates, convert_to_numpy=True)
        sims = cosine_similarity(seg_emb, candidate_embeddings)[0]
        top_indices = sims.argsort()[::-1][:top_n]
        top_phrases = [candidates[i] for i in top_indices]

        segment_dict[seg_id] = top_phrases

    return segment_dict

segment_phrases = segment_topic_phrase(sentences, pred_segments, embeddings)

# 6. Output results

print("\n--- Segments & Topics ---")
for seg_id, phrases in segment_phrases.items():
    seg_text = " ".join([sentences[i] for i, seg in enumerate(pred_segments) if seg == seg_id])
    print(f"\nSegment {seg_id}:")
    print(f"Text: {seg_text}")
    print(f"Topic phrase: {phrases[0]}")



  torchaudio.list_audio_backends()
  available_backends = torchaudio.list_audio_backends()


No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...


  if ismodule(module) and hasattr(module, '__file__'):
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint c:\Users\Lenovo X1 Carbon\AppData\Local\Programs\Python\Python311\Lib\site-packages\whisperx\assets\pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.4.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cpu. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.96) in first 30s of audio...


  audio, sr = librosa.load(audio_file, sr=16000, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded audio sr=16000, duration=278.01s
[0.03 - 0.25] I
[0.29 - 0.43] want
[0.45 - 0.53] to
[0.59 - 0.83] talk
[0.89 - 1.39] about
[1.43 - 2.23] beating
[2.25 - 2.68] stress
[2.72 - 3.02] today.
[3.06 - 4.74] You
[4.78 - 4.98] know,
[5.60 - 5.84] live
[5.88 - 6.06] here
[6.10 - 6.18] in
[6.24 - 6.42] Hong
[6.44 - 6.72] Kong
[6.74 - 6.78] is
[6.84 - 7.08] very
[7.14 - 7.75] stressful,
[7.85 - 8.09] so
[8.11 - 8.13] I
[8.71 - 8.85] think
[9.01 - 9.37] today's
[9.45 - 9.89] topic
[9.97 - 10.03] is
[10.11 - 10.29] very
[10.41 - 10.75] useful
[10.79 - 10.91] for
[11.03 - 11.35] everyone
[11.41 - 11.73] because
[12.97 - 13.08] we
[13.10 - 13.26] can
[13.34 - 13.48] do
[13.52 - 13.92] something
[14.00 - 14.38] about
[14.42 - 14.86] it
[15.80 - 16.00] just
[16.04 - 16.12] to
[16.18 - 16.36] cope
[16.38 - 16.52] with
[16.56 - 16.66] the
[16.68 - 17.58] stress.
[17.56 - 18.34] So
[18.73 - 18.95] maybe
[19.01 - 19.13] you
[19.19 - 19.37] have
[19.47 - 19.69] some
[19.95 - 20.35] change
[20.55 - 2

Batches: 100%|██████████| 6/6 [00:02<00:00,  2.59it/s]


Embeddings shape: (42, 768)
Adaptive threshold used: 0.17107978
Predicted segments: [0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10]

--- Segments & Topics ---

Segment 0:
Text: I want to talk about beating stress today You know, live here in Hong Kong is very stressful, so I think today's topic is very useful for everyone because we can do something about it just to cope with the stress
Topic phrase: stress

Segment 1:
Text: So maybe you have some change in your life, maybe big change or small change, but instead of being afraid, I think you should have a positive attitude and think of change as a normal part of life And I think maybe here in Hong Kong, the family are crafted into a very small housing space
Topic phrase: change

Segment 2:
Text: So maybe you will sometimes argue with your family and I think try to resolve the disagreement with people is very important because then you can build strong re

This is the output of everything put together so I get a transcription with timestamps, the text is segmented into topics and these topics also have names. In the following chunk I am going to test out how semantic serach works based on transformer embeddings.

In [None]:

import torch

def search_topic(query, sentences, segments, sentence_embeddings, segment_phrases, model):
    """
    Returns the best-matching topic (segment) for a query.
    """
    query_emb = model.encode(query, convert_to_tensor=True)

    unique_segments = sorted(set(segments))
    segment_scores = []
    for seg_id in unique_segments:
        indices = [i for i, seg in enumerate(segments) if seg == seg_id]
        if not indices:
            continue
        seg_emb = sentence_embeddings[indices].mean(axis=0, keepdims=True)
        score = util.cos_sim(query_emb, torch.tensor(seg_emb)).item()
        segment_scores.append((seg_id, score))

    best_seg_id, best_score = max(segment_scores, key=lambda x: x[1])
    best_text = " ".join([sentences[i] for i, seg in enumerate(segments) if seg == best_seg_id])
    best_phrase = segment_phrases.get(best_seg_id, ["[no phrase found]"])[0]

    print(f"\nQuery: {query}")
    print(f"Best matching segment ID: {best_seg_id}")
    print(f"Segment topic phrase: {best_phrase}")
    print(f"Similarity score: {best_score:.4f}")
    print(f"\nSegment text:\n{best_text}")

    return best_seg_id, best_text, best_phrase, best_score




In [8]:
query = "health"
search_topic(query, sentences, pred_segments, embeddings, segment_phrases, model_sent)



Query: health
Best matching segment ID: 3
Segment topic phrase: the T word
Similarity score: 0.4734

Segment text:
And how about the T word The T word is treat your body well Because experts say that exercise can reduce stress And also if you eat healthy food, then your brain and your body get the nourishment they need


(3,
 'And how about the T word The T word is treat your body well Because experts say that exercise can reduce stress And also if you eat healthy food, then your brain and your body get the nourishment they need',
 'the T word',
 0.47340062260627747)

The results shows a decent similarity score and a semantically close part of the text, however, it is a bit long and it may not be the best possible option to present in to the user in the application. I would also like to try out another way for semantic search with a pretrained model and see what the results can tell me. 

In [None]:


import pandas as pd
import json 
import torch
from sentence_transformers import SentenceTransformer, util

df = pd.read_csv("topics.csv")

sentences = []
for s in df["sentences"].dropna():
    sentences.extend(json.loads(s))

model = SentenceTransformer("all-mpnet-base-v2")

sentence_embeddings = model.encode(sentences, convert_to_tensor=True)

query = "work"
query_embedding = model.encode(query, convert_to_tensor=True)

cosine_scores = util.cos_sim(query_embedding, sentence_embeddings)

top_idx = torch.argmax(cosine_scores)
best_sentence = sentences[top_idx]
best_score = cosine_scores[0][top_idx].item()

print(f"\nQuery: {query}")
print(f"\nBest matching sentence:\n{best_sentence}")
print(f"\nSimilarity score: {best_score:.4f}")



Query: work

Best matching sentence:
 I look formard to finish it and see the end product.

Similarity score: 0.3232


Here, we see a lower similarity score, however it would still be the highest one matching the query. What I like in this result is that it is shorter and still matches the idea of the query.

### Semantic search is still due reserach and exploration and it will be done in the following weeks since for now it has low priority.