## Speaker Diarization

**Installed libraries**
- torch
- torchvision
- tensorflow

**Important libraries**
- Pyannote Audio
- Transformers
- Whisper from OpenAI

In [1]:
# Install the packages listed in requirements.txt (quiet mode).
!pip install -r requirements.txt -q

# pyannote-audio and whisper require torch >= 2.0, but the Docker image already
# ships torch, tensorflow, torchaudio and torchvision. They are therefore
# installed with --no-deps so that pip does not try to install their dependencies.
!pip install git+https://github.com/pyannote/pyannote-audio --no-deps -q
!pip install git+https://github.com/openai/whisper.git --no-deps -q 

# NOTE(review): the `pip check` output in this notebook reports that tensorflow 2.8.2
# requires protobuf<3.20, which conflicts with this pin — confirm which one is intended.
!pip install protobuf==3.20.*

Collecting git+https://github.com/huggingface/transformers (from -r requirements.txt (line 1))
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-dlituq3u
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-dlituq3u
  Resolved https://github.com/huggingface/transformers to commit 2d71307dc0ee2849f785568f345837e726209fc6
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting asteroid-filterbanks>=0.4
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl (29 kB)
Collecting einops>=0.6.0
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m728.4 kB/s[0m eta [36m0:00:00[0m:--:--[0m
[?25hCollecting huggingface_hub>=0.13.0
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)


In [2]:
# Report dependency conflicts in the current environment.
# NOTE(review): both commands printed the same full report in the recorded output,
# so the package-name argument does not seem to narrow the check — a single
# `pip check` is probably enough; confirm against the pip documentation.
!pip check torchvision
!pip check torch

openai-whisper 20230918 requires more-itertools, which is not installed.
openai-whisper 20230918 requires tiktoken, which is not installed.
openai-whisper 20230918 requires triton, which is not installed.
tensorflow 2.8.2 has requirement protobuf<3.20,>=3.9.2, but you have protobuf 3.20.3.
pyannote-audio 2.1.1 has requirement torch>=2.0.0, but you have torch 1.13.1+cu116.
pyannote-audio 2.1.1 has requirement torchaudio>=2.0.0, but you have torchaudio 0.13.1+cu116.
openai-whisper 20230918 requires more-itertools, which is not installed.
openai-whisper 20230918 requires tiktoken, which is not installed.
openai-whisper 20230918 requires triton, which is not installed.
tensorflow 2.8.2 has requirement protobuf<3.20,>=3.9.2, but you have protobuf 3.20.3.
pyannote-audio 2.1.1 has requirement torch>=2.0.0, but you have torch 1.13.1+cu116.
pyannote-audio 2.1.1 has requirement torchaudio>=2.0.0, but you have torchaudio 0.13.1+cu116.


## Download models

The Whisper model is automatically downloaded from Hugging Face and saved in the cache directory:

`~/.cache/huggingface/hub/models--guillaumekln--faster-whisper-large-v2/`
There are four important files:
- config.json
- model.bin
- tokenizer.json
- vocabulary.txt

In [None]:
import whisper

whisper.load_model('

In [None]:
# import whisper
from faster_whisper import WhisperModel
import datetime
import subprocess
from pathlib import Path
import pandas as pd
import re
import time
import os 
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import torch
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.audio import Audio
from pyannote.core import Segment
import wave
import contextlib
from transformers import pipeline


# Whisper model size names (reference list; only `whisper_model` below is used in this cell).
whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]

# Language code -> display name for the source languages handled by this notebook.
source_languages = {
    "en": "English",
    "zh": "Chinese",
    "de": "German",
    "ja": "Japanese",
    "vi": "Vietnamese",
}
# Iterating a dict yields its keys in insertion order, so no items()/indexing is needed.
source_language_list = list(source_languages)

print(source_language_list)


# Hugging Face checkpoint and language code used by the transformers pipeline.
MODEL_NAME = "vumichien/whisper-medium-jp"
lang = "ja"

# Use CUDA device index 0 when a GPU is available, otherwise run on the CPU.
device = 0 if torch.cuda.is_available() else "cpu"
# pipeline of transformer
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)


os.makedirs('output', exist_ok=True)
# Set the decoder prompt ids for the chosen language and the "transcribe" task.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")

# WHISPER MODEL
whisper_model = 'large-v2'
model = WhisperModel(whisper_model, compute_type="int8")

# AUDIO FILE
video_file_path = "audio.mp3"


# Derive the .wav file name from the input path.
# Only the name is computed here; no conversion is performed in this cell.
# The path is rebuilt from the splitext stem instead of str.replace(), which
# would also rewrite the extension text if it occurred elsewhere in the path
# and would corrupt the path entirely when the file has no extension.
file_stem, file_ending = os.path.splitext(video_file_path)
print(file_ending)
audio_file = file_stem + ".wav"
print("audio file:", audio_file)

['en', 'zh', 'de', 'ja', 'vi']


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

Convert the mp3 file to a wav file and measure the duration of the audio.
Rate = 16000 Hz, i.e. the sampling frequency / sampling rate (Vietnamese: tần số lấy mẫu, tốc độ lấy mẫu).

In [23]:
# Empty DataFrame with Start/End/Speaker/Text columns; not referenced elsewhere in the visible cells.
df_init = pd.DataFrame(columns=['Start', 'End', 'Speaker', 'Text'])

def convert_time(seconds):
    """Format a duration given in seconds as a zero-padded ``MM:SS`` string.

    Fractions of a second are truncated. Minutes are not wrapped at 60, so
    one hour is rendered as ``60:00``.
    """
    whole_minutes, leftover = divmod(seconds, 60)
    return f"{int(whole_minutes):02d}:{int(leftover):02d}"

def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
    """
    # Transcribe an audio file with a Whisper model
    Runs the model's ``transcribe`` method on the file and converts every
    returned segment into a plain dict with formatted timestamps.

    Only transcription is implemented here. Speaker embedding and clustering
    (e.g. with https://github.com/pyannote/pyannote-audio) are NOT performed,
    so ``num_speakers`` is accepted for interface compatibility but unused.

    Speech Recognition is based on models from OpenAI Whisper https://github.com/openai/whisper

    Args:
        video_file_path: Path of the audio/video file passed to ``transcribe``.
        selected_source_lang: Language code forwarded as ``language`` (e.g. 'ja').
        whisper_model: Object exposing ``transcribe(path, **options)``. If the
            value has no such method (for example a model-size string), the
            notebook-level ``model`` is used instead.
        num_speakers: Currently unused.

    Returns:
        list[dict]: One dict per segment with keys 'start' and 'end'
        (``MM:SS`` strings from ``convert_time``) and 'text'.
    """
    # Use the model the caller passed in rather than silently relying on the
    # global; keep the global as a fallback so older call sites still work.
    asr_model = whisper_model if hasattr(whisper_model, "transcribe") else model

    transcribe_options = dict(
        task="transcribe",
        language=selected_source_lang,
        beam_size=5,
        best_of=5,
    )
    segments_raw, _info = asr_model.transcribe(video_file_path, **transcribe_options)

    # Convert back to original openai format.
    # NOTE(review): faster-whisper presumably yields segments lazily, so the
    # decoding work would happen while this comprehension iterates — confirm.
    segments = [
        {
            "start": convert_time(segment_chunk.start),
            "end": convert_time(segment_chunk.end),
            "text": segment_chunk.text,
        }
        for segment_chunk in segments_raw
    ]
    print("transcribe audio done with fast whisper")
    return segments

## 1. Selected source language = 'ja'

In [33]:
# Transcribe audio with Japanese as the source language.
selected_source_lang = 'ja'
# NOTE(review): this variable is set to 4, but the call below passes the literal 2 — confirm which is intended.
num_speakers = 4
# `model` is the WhisperModel instance created in the setup cell; it is passed as `whisper_model`.
segments = speech_to_text(audio_file, selected_source_lang, whisper_model=model, num_speakers=2)

transcribe audio done with fast whisper


In [34]:
segments

[{'start': '00:00',
  'end': '00:27',
  'text': '今日は日本語で来たのさやかとコラボしました 今日はこの会話がどれぐらい聞き取れるかチャレンジしましょう'},
 {'start': '00:27',
  'end': '00:34',
  'text': 'これからみなさんに見せる会話は短いバージョンなんですけど 本当の会話は30分あります'},
 {'start': '00:34',
  'end': '00:41',
  'text': 'で単語と表現付きの全部の会話動画が見たい人は ぜひサブスクに登録してみてください'},
 {'start': '00:41', 'end': '00:47', 'text': 'サブスクに登録して一緒にリスニング力と語彙力を高めましょう'},
 {'start': '00:47', 'end': '00:49', 'text': 'ok じゃあ早速始めましょう'},
 {'start': '00:49',
  'end': '00:53',
  'text': 'はいみなさんこんにちは welcome back to my channel'},
 {'start': '00:53', 'end': '00:56', 'text': '今日はさやかに来てもらいました'},
 {'start': '00:56', 'end': '00:59', 'text': 'いえーい'},
 {'start': '00:59', 'end': '01:00', 'text': 'ありがとう'},
 {'start': '01:00', 'end': '01:03', 'text': 'ありがとう本当に来てくれてありがとう'},
 {'start': '01:03', 'end': '01:04', 'text': 'こちらこそ'},
 {'start': '01:04', 'end': '01:08', 'text': '前話した時はたぶん1年前ぐらいかな'},
 {'start': '01:08', 'end': '01:14', 'text': '1年前に会話動画を一緒に撮ったのが 初めて話した時だったから'},
 {'start': '01:14', 'end': '01:17', 'te

In [35]:
# Transcribe the same audio again, this time with English as the source language.
selected_source_lang = 'en'
# NOTE(review): this variable is set to 4, but the call below passes the literal 2 — confirm which is intended.
num_speakers = 4
# This reassigns `segments`, replacing the result of the earlier Japanese run.
segments = speech_to_text(audio_file, selected_source_lang, whisper_model=model, num_speakers=2)

transcribe audio done with fast whisper


## Lesson Learned

- Choose Japanese as the source language, because the speakers talk in Japanese most of the time.
- Even though there is an English part in the audio (from about minute 10:20 onwards), the program automatically translates it into Japanese, which works quite well.