In [1]:
from pyannote.audio import Audio

1. Find the number of samples of the original audio file when loaded as a numpy array.

In [10]:
import librosa
wav, _ = librosa.load('TEST-1.wav', sr=16_000, mono=True)
wav

array([ 0.        ,  0.        ,  0.        , ..., -0.00079346,
       -0.00064087, -0.00018311], dtype=float32)

In [11]:
len(wav)

1935244

In [32]:
wav, fs = librosa.load('TEST-1.mp3', mono=False)
len(wav)

2

In [33]:
fs

22050

In [34]:
wav.shape

(2, 2667008)

In [24]:
wav.shape

(2, 5334016)

In [35]:
wav, fs = librosa.load('TEST-1.mp3', mono=False, sr=44100)

In [36]:
wav.shape

(2, 5334016)

In [31]:
fs

44100

6. How many segments are returned by the Whisper model for the given audio?

In [37]:
from faster_whisper import WhisperModel

In [39]:
file = "TEST-1.wav"
whisper = WhisperModel('base', device='cuda', compute_type='int8')
segments, info = whisper.transcribe(file, language='en', task='transcribe', beam_size=5, best_of=5)

In [41]:
n_segments = 0
for _ in segments:
    n_segments += 1
n_segments

36

7. Use the speechbrain/spkrec-ecapa-voxceleb model for speaker embedding extraction. What is the dimension of the speaker embeddings?

In [43]:
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
import torch
speaker_embedding = PretrainedSpeakerEmbedding('speechbrain/spkrec-epaca-voxceleb', device=torch.device('cuda'))
speaker_embedding

<pyannote.audio.pipelines.speaker_verification.SpeechBrainPretrainedSpeakerEmbedding at 0x71cab4d92a30>

In [44]:
speaker_embedding.dimension

192

'speechbrain/spkrec-epaca-voxceleb'

8. How many languages are supported by the Whisper model?

In [47]:
len(set(whisper.supported_languages))

100

In [48]:
# Getting the embeddings
import pandas as pd

In [51]:
segments, info = whisper.transcribe(file, language='en', task='transcribe', beam_size=5, best_of=5)

df = pd.DataFrame([
    (seg.start, seg.end, seg.text) for seg in segments
], columns=['start', 'end', 'text'])
df.head()

Unnamed: 0,start,end,text
0,0.0,2.0,Let's talk about music.
1,2.0,4.0,How often do you listen to music?
2,4.0,7.0,I think I listen to music mostly when I'm dri...
3,7.0,10.0,I think it puts me in such a good mood when I...
4,10.0,13.0,out there on a drive and I play my favorite m...


In [53]:
len(df)

36

In [56]:
from pyannote.audio import Audio
from pyannote.core import Segment

crop_segments = [Segment(r.start, min(r.end, 120)) for _, r in df.iterrows()]
audio = Audio()
crops = [audio.crop(file, seg)[0] for seg in crop_segments]

In [57]:
embeddings = [speaker_embedding(x.unsqueeze(0)).squeeze(0) for x in crops]

In [58]:
import numpy as np

In [59]:
X = np.r_[embeddings]

In [60]:
X.shape

(36, 192)

In [61]:
from sklearn.cluster import KMeans

In [62]:
km = KMeans(n_clusters=3, random_state=42)

In [63]:
km.fit(X)

In [65]:
from sklearn.metrics.pairwise import euclidean_distances

In [67]:
pd.DataFrame(euclidean_distances(km.cluster_centers_), index=range(1, 4), columns=range(1, 4))

Unnamed: 0,1,2,3
1,0.0,106.262466,298.611938
2,106.262466,0.0,313.160583
3,298.611938,313.160583,0.0


In [69]:
df['labels'] = km.labels_

In [70]:
df.head()

Unnamed: 0,start,end,text,labels
0,0.0,2.0,Let's talk about music.,2
1,2.0,4.0,How often do you listen to music?,2
2,4.0,7.0,I think I listen to music mostly when I'm dri...,0
3,7.0,10.0,I think it puts me in such a good mood when I...,0
4,10.0,13.0,out there on a drive and I play my favorite m...,0


In [71]:
km = KMeans(n_clusters=2, random_state=42)

In [72]:
km.fit(X)
df['l_clus2'] = km.labels_
df.head()

Unnamed: 0,start,end,text,labels,l_clus2
0,0.0,2.0,Let's talk about music.,2,0
1,2.0,4.0,How often do you listen to music?,2,0
2,4.0,7.0,I think I listen to music mostly when I'm dri...,0,0
3,7.0,10.0,I think it puts me in such a good mood when I...,0,0
4,10.0,13.0,out there on a drive and I play my favorite m...,0,0


In [73]:
import soundfile

In [75]:
wav, fs = soundfile.read('TEST-1.mp3')

In [76]:
wav.shape

(5334016, 2)

In [77]:
fs

44100