In [1]:
from faster_whisper import WhisperModel

In [2]:
# Load the Whisper "base" checkpoint with int8 quantisation.
# device='auto' uses CUDA when a GPU is available and falls back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
whisper = WhisperModel('base', device='auto', compute_type='int8')

In [56]:
file = "TEST-1.wav"
# transcribe() returns a lazy, single-use generator: decoding only happens
# while it is iterated, and a second pass over it yields nothing. Materialise
# it once here so later cells can count and re-read the segments freely.
segments, info = whisper.transcribe(file, language='en', task='transcribe', beam_size=5, best_of=5)
segments = list(segments)

In [54]:
# `segments` may still be the one-shot generator returned by transcribe().
# Counting it through a throwaway comprehension would drain it and leave
# nothing for later cells, so keep the materialised list and count that.
segments = list(segments)
len(segments)

36

In [7]:
import pandas as pd

In [57]:
# One (start, end, text) row per transcribed segment, collected in a single pass.
segment_rows = [(s.start, s.end, s.text) for s in segments]
df = pd.DataFrame(segment_rows, columns=['start', 'end', 'text'])
df.head()

Unnamed: 0,start,end,text
0,0.0,2.0,Let's talk about music.
1,2.0,4.0,How often do you listen to music?
2,4.0,7.0,I think I listen to music mostly when I'm driving.
3,7.0,10.0,I think it puts me in such a good mood when I'm like
4,10.0,13.0,out there on a drive and I play my favorite music.


In [58]:
# Show the last rows of the transcript table.
df.tail()

Unnamed: 0,start,end,text
31,107.0,109.0,"what is this called, what is that called?"
32,109.0,112.0,"Because usually when you see pictures, you don't usually know what it's called."
33,112.0,115.0,"So when you read a magazine or where you get the knowledge,"
34,115.0,118.0,"that's where you get to know, oh, this fabric is called this,"
35,118.0,121.0,or this decoration is called that.


In [9]:
import torch
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding

# SpeechBrain's ECAPA-TDNN speaker-embedding checkpoint. Mind the spelling:
# the hub id is "spkrec-ecapa-voxceleb".
# Run on the GPU when one is present, otherwise fall back to the CPU.
embedding_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
speaker_embedding = PretrainedSpeakerEmbedding('speechbrain/spkrec-ecapa-voxceleb', device=embedding_device)

In [11]:
from pyannote.audio import Audio
from pyannote.core import Segment

In [14]:
# Whisper can report a final segment that ends slightly past the end of the
# recording. Clamp every segment end to the real duration of the file (as
# measured by pyannote) instead of a constant that fits only one recording.
audio_duration = Audio().get_duration(file)
crop_segments = [
    Segment(start, min(end, audio_duration))
    for start, end in zip(df['start'], df['end'])
]
crop_segments[:3]

[<Segment(2, 4)>, <Segment(4, 7)>, <Segment(7, 10)>]

In [15]:
# Audio helper with default settings; used below to cut segments out of the file.
audio = Audio()

In [18]:
# Cut each segment out of the recording; element 0 of crop()'s result is kept.
crops = []
for segment in crop_segments:
    cropped = audio.crop(file, segment)
    crops.append(cropped[0])

In [20]:
# Shape of the first cropped waveform.
crops[0].shape

torch.Size([1, 32000])

In [21]:
# Shape of the second cropped waveform (crops differ in length).
crops[1].shape

torch.Size([1, 48000])

In [25]:
# Bind the first crop to a short name for inspection.
x = crops[0]

In [27]:
# Shape of the sample crop bound to `x`.
x.shape

torch.Size([1, 32000])

In [31]:
# Embed every crop on its own: add a leading batch axis for the model call,
# then drop that axis again from the result.
embeddings = []
for waveform in crops:
    batch_embedding = speaker_embedding(waveform[None])
    embeddings.append(batch_embedding.squeeze(0))

In [32]:
import numpy as np

In [33]:
# Collapse the per-segment embedding vectors into a single 2-D array
# with one row per segment.
embeddings = np.array(embeddings)
embeddings.shape

(35, 192)

In [36]:
from sklearn.cluster import AgglomerativeClustering

In [37]:
# Group the segment embeddings into two clusters (one per expected speaker);
# fit() returns the estimator itself, so construction and fitting are chained.
clus = AgglomerativeClustering(n_clusters=2).fit(embeddings)

clus.labels_

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [38]:
# Turn each numeric cluster id into a readable "Speaker <id>" label.
df['speaker'] = ["Speaker " + str(cluster_id) for cluster_id in clus.labels_]

In [39]:
# Preview the table, which now includes the speaker column.
df.head()

Unnamed: 0,start,end,text,speaker
0,2.0,4.0,How often do you listen to music?,Speaker 1
1,4.0,7.0,I think I listen to music mostly when I'm dri...,Speaker 0
2,7.0,10.0,I think it puts me in such a good mood when I...,Speaker 0
3,10.0,13.0,out there on a drive and I play my favorite m...,Speaker 0
4,13.0,18.0,"I'm usually into Afro music a lot, hip-hop an...",Speaker 0


In [42]:
# Convert the timestamps to whole seconds. Round before casting: a bare
# astype(int) truncates toward zero, which would shift every fractional
# timestamp early by up to one second.
df[['start', 'end']] = df[['start', 'end']].round().astype(int)

In [47]:
def seconds_to_srt_time(seconds):
    """Format an offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Non-negative offset in seconds. Integers and floats
            (including NumPy scalars) are accepted; fractional values are
            rounded to the nearest millisecond.

    Returns:
        The timestamp string, e.g. ``3661`` -> ``"01:01:01,000"`` and
        ``2.5`` -> ``"00:00:02,500"``.

    Raises:
        ValueError: If ``seconds`` is negative, which has no SRT representation.
    """
    # Work in integer milliseconds so rounding carries cleanly into the
    # seconds/minutes/hours fields (59.9996 s becomes 00:01:00,000).
    total_ms = int(round(float(seconds) * 1000))
    if total_ms < 0:
        raise ValueError(f"SRT timestamps cannot be negative: {seconds!r}")

    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

def dataframe_to_srt(df, include_text=False):
    """Render the rows of ``df`` as the contents of an SRT subtitle file.

    Args:
        df: DataFrame with ``start`` and ``end`` columns (offsets in seconds,
            formatted with ``seconds_to_srt_time``) and a ``speaker`` column.
            A ``text`` column is needed only when ``include_text`` is true.
        include_text: When false (the default) each cue shows only the speaker
            label. When true the cue reads ``"<speaker>: <text>"``.

    Returns:
        The SRT document as a single string; an empty string for an empty frame.
    """
    columns = ['start', 'end', 'speaker']
    if include_text:
        columns.append('text')

    cues = []
    # Number cues by position (1, 2, 3, ...) rather than by index label, so a
    # filtered or re-indexed frame still produces sequential SRT counters.
    for number, (start, end, speaker, *extra) in enumerate(
        df[columns].itertuples(index=False, name=None), start=1
    ):
        caption = f"{speaker}: {str(extra[0]).strip()}" if extra else speaker
        cues.append(
            f"{number}\n{seconds_to_srt_time(start)} --> {seconds_to_srt_time(end)}\n{caption}\n\n"
        )
    # Join once at the end instead of growing a string inside the loop.
    return ''.join(cues)

srt = dataframe_to_srt(df)
# Write with an explicit encoding: the platform default is not UTF-8
# everywhere, and the subtitle file should not depend on the local locale.
with open('TEST-1.srt', 'w', encoding='utf-8') as fout:
    fout.write(srt)

In [51]:
# Widen the column display limit so full sentences are visible, then show
# the transcript text next to its assigned speaker.
pd.set_option('display.max_colwidth', 100)
df.loc[:, ['text', 'speaker']]

Unnamed: 0,text,speaker
0,How often do you listen to music?,Speaker 1
1,I think I listen to music mostly when I'm driving.,Speaker 0
2,I think it puts me in such a good mood when I'm like,Speaker 0
3,out there on a drive and I play my favorite music.,Speaker 0
4,"I'm usually into Afro music a lot, hip-hop and Afro and R&B.",Speaker 0
5,So I prefer listening to music when I'm driving,Speaker 0
6,"or sometimes when I'm working out at the gym, something like that.",Speaker 0
7,Is music an important subject in schools in your country?,Speaker 1
8,In schools in my country.,Speaker 0
9,"It is because, you know, I'm from India.",Speaker 0


In [59]:
import faster_whisper

In [60]:
# Installed faster-whisper version, recorded for reproducibility.
faster_whisper.__version__

'0.10.1'

In [63]:
# Number of languages the loaded model reports as supported.
len(whisper.supported_languages)

100

In [64]:
from sklearn.datasets import load_iris

In [68]:
# Load the iris data as pandas objects: X holds the features, y the class targets.
X, y = load_iris(return_X_y=True, as_frame=True)

In [66]:
from sklearn.cluster import KMeans

In [76]:
# Load the dataset bundle again (this time with its metadata) and take the
# array of class names from it.
iris = load_iris()
labels = iris['target_names']

In [93]:
km = KMeans(n_clusters=3, random_state=42)
km.fit(X)

# Pair each sample's species name with its cluster id and list the distinct
# combinations. The per-sample names are looked up from the target vector
# right here, so this cell does not rely on `labels` having already been
# expanded from 3 class names to one entry per sample.
species = iris.target_names[y]
pd.DataFrame(np.c_[species, km.labels_]).drop_duplicates().sort_values(0)

Unnamed: 0,0,1
0,setosa,1
50,versicolor,0
51,versicolor,2
100,virginica,0
101,virginica,2


In [79]:
# Expand the class names to one label per sample. Index the original
# `target_names` array rather than `labels` itself, so running this cell more
# than once cannot silently re-index an already expanded array.
labels = iris.target_names[y]