# chunking audio files

In [1]:
import os
from pydub import AudioSegment
from random import choices
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps

# Sample rate (Hz) of the waveform handed to Silero VAD. The speech timestamps
# are converted from samples to milliseconds using this same value, so it is
# passed explicitly to every VAD call below instead of relying on defaults.
VAD_SAMPLING_RATE = 16000
# Sample rate (Hz) the exported WAV chunks are resampled to.
CHUNK_SAMPLING_RATE = 44100

model = load_silero_vad()

base_data_dir = '/mnt/e/working/vietnamese_tts/data/original'
base_chunk_dir = '/mnt/e/working/vietnamese_tts/data/chunks'
# os.listdir() returns entries in arbitrary order; sort for reproducible runs.
files = sorted(file for file in os.listdir(base_data_dir) if file.endswith('.mp3'))

# Make sure the output directory exists before the first export.
os.makedirs(base_chunk_dir, exist_ok=True)

for file in files:
    file_path = os.path.join(base_data_dir, file)
    # Strip only the trailing extension; str.replace('.mp3', ...) would also
    # rewrite any '.mp3' occurring in the middle of the file name.
    file_stem = os.path.splitext(file)[0]

    # backend (sox, soundfile, or ffmpeg) required!
    wav = read_audio(file_path, sampling_rate=VAD_SAMPLING_RATE)
    speech_timestamps = get_speech_timestamps(
        wav,
        model,
        sampling_rate=VAD_SAMPLING_RATE,
        min_speech_duration_ms=1000,
        max_speech_duration_s=30,
        min_silence_duration_ms=300,
        speech_pad_ms=100,
    )

    # The original-quality audio is reloaded with pydub and sliced by
    # millisecond position; the VAD waveform is only used to find boundaries.
    audio = AudioSegment.from_file(file_path)
    for segment in speech_timestamps:
        # Convert sample offsets (at VAD_SAMPLING_RATE) to milliseconds.
        start = 1000 * segment['start'] / VAD_SAMPLING_RATE
        end = 1000 * segment['end'] / VAD_SAMPLING_RATE
        chunks_path = os.path.join(base_chunk_dir, f'{file_stem}_{start}_{end}.wav')
        audio[start:end].set_frame_rate(CHUNK_SAMPLING_RATE).export(chunks_path, format='wav')

# transcribe audio chunks

In [1]:
import torch
from transformers import pipeline, WhisperForConditionalGeneration, WhisperTokenizer, WhisperProcessor

# Whisper checkpoint and decoding configuration used for every chunk.
MODEL_ID = "openai/whisper-large-v3"
LANGUAGE = "vi"
TASK = "transcribe"

# Pick device and dtype from the same CUDA check so they cannot disagree:
# a hard-coded "cuda:0" would crash on a CPU-only machine at `.to(device)`
# even though the dtype selection already falls back to float32.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

tokenizer = WhisperTokenizer.from_pretrained(MODEL_ID, language=LANGUAGE, task=TASK)
processor = WhisperProcessor.from_pretrained(MODEL_ID, language=LANGUAGE, task=TASK)
# Use EOS as the padding token and set the language/task prefix tokens on the
# tokenizer that is handed to the pipeline.
processor.tokenizer.pad_token = processor.tokenizer.eos_token
processor.tokenizer.set_prefix_tokens(language=LANGUAGE, task=TASK)

model_oai_ft_v3 = WhisperForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model_oai_ft_v3 = model_oai_ft_v3.to(device)

pipe_oai_ft_v3 = pipeline(
    "automatic-speech-recognition",
    model=model_oai_ft_v3,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=256,
    chunk_length_s=10,
    batch_size=32,
    torch_dtype=torch_dtype,
    device=device,
    generate_kwargs={
        # Reuse the constants above so language/task are defined in one place.
        "task": TASK,
        "language": LANGUAGE,
        "no_repeat_ngram_size": 4,  # Avoid repetition
        "return_timestamps": True,
    },
)

In [3]:
import json
import time
import warnings
import os

base_chunk_dir = '/mnt/e/working/vietnamese_tts/data/chunks'
# Sorted so the processing order (and the progress indices) are reproducible.
chunks = sorted(os.listdir(base_chunk_dir))
chunk_paths = [os.path.join(base_chunk_dir, chunk) for chunk in chunks if chunk.endswith('.wav')]

# Silences all Python warnings for the rest of the kernel session.
warnings.filterwarnings('ignore')

start = time.time()
for idx, chunk_path in enumerate(chunk_paths):
    # Transcript sits next to the chunk: same stem, '.json' extension.
    transcript_path = os.path.splitext(chunk_path)[0] + '.json'
    # Resume support: chunks that already have a transcript are skipped.
    if os.path.exists(transcript_path):
        continue

    transcriptions = pipe_oai_ft_v3(chunk_path)

    # Write to a temporary file and rename it into place, so an interrupted
    # run never leaves a truncated .json that the resume check above would
    # treat as a finished transcript. The `with` block closes the handle.
    tmp_transcript_path = transcript_path + '.tmp'
    with open(tmp_transcript_path, 'w', encoding='utf-8') as f:
        json.dump(transcriptions, f)
    os.replace(tmp_transcript_path, transcript_path)

    # Progress is only reported for chunks transcribed in this run, so the
    # first printed index can be well above zero when resuming.
    if idx % 100 == 0:
        print(idx, time.time() - start)

You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


900 16.016644716262817


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


1000 180.74905467033386
1100 346.78210949897766
1200 498.2749969959259
1300 648.2184693813324
1400 825.8562657833099
1500 997.6656391620636


# demucs, denoise / filter music chunks and build training metadata

In [17]:
from IPython.display import display
from pydub import AudioSegment
import os
import json
import re

# Phrases whose presence disqualifies a transcript from the training metadata.
EXCLUSIVE_TEXTS = [
    "Hãy subscribe cho kênh Ghiền Mì Gõ",
]


def build_exclusion_pattern(texts):
    """Compile a regex matching any of the literal phrases in ``texts``.

    Each phrase is escaped, so regex metacharacters are matched literally.
    Empty phrases are dropped.

    Returns:
        A compiled pattern, or ``None`` when no non-empty phrase is left.
        Without this guard an empty list would compile to the empty pattern,
        which matches every string and would exclude every transcript.
    """
    escaped = [re.escape(text) for text in texts if text]
    if not escaped:
        return None
    return re.compile("|".join(escaped))


pattern = build_exclusion_pattern(EXCLUSIVE_TEXTS)


def contains_exclusive_texts(s):
    """Return True if ``s`` contains any phrase from ``EXCLUSIVE_TEXTS``.

    Matching is a case-sensitive literal substring search. Returns False when
    there are no phrases to exclude.
    """
    return pattern is not None and pattern.search(s) is not None

# NOTE(review): the chunking and transcription cells of this notebook write
# and read '.wav' chunks (with transcripts named after them), while this cell
# selects '.mp3' chunks. Presumably the .mp3 files come from a separate
# denoising step that is not shown here -- confirm; if they do not exist,
# `data` ends up empty.
CHUNK_AUDIO_EXT = '.mp3'

base_chunk_dir = '/mnt/e/working/vietnamese_tts/data/chunks'
chunks_files = os.listdir(base_chunk_dir)
chunk_audio_paths = [
    os.path.join(base_chunk_dir, chunk)
    for chunk in chunks_files
    if chunk.endswith(CHUNK_AUDIO_EXT)
]
transcript_audio_paths = [
    os.path.join(base_chunk_dir, chunk)
    for chunk in chunks_files
    if chunk.endswith('.json')
]
# Set for O(1) membership tests; checking against the list is O(n) per chunk.
transcript_path_set = set(transcript_audio_paths)

# One row per usable chunk: [audio path, speaker, language tag, transcript].
data = []
for audio_file_path in chunk_audio_paths:
    # Transcript shares the chunk's stem; swap only the trailing extension.
    tmp_trans_path = os.path.splitext(audio_file_path)[0] + '.json'
    if tmp_trans_path not in transcript_path_set:
        continue

    # Speaker id is the part of the file name before the first '-'.
    tmp_spk = os.path.basename(audio_file_path).split('-')[0]

    # `with` closes the transcript file instead of leaking the handle.
    with open(tmp_trans_path, 'r', encoding='utf-8') as f:
        text = json.load(f)['text']

    # Drop chunks whose transcript contains an excluded phrase.
    if contains_exclusive_texts(text):
        continue

    data.append(
        [
            audio_file_path,
            tmp_spk,
            "VI-SOUTH",
            text,
        ]
    )
        

In [18]:
import csv

# Write the metadata rows as a '|'-delimited file.
# newline='' is what the csv module requires of files it writes to (otherwise
# line endings can be translated a second time on some platforms), and the
# explicit UTF-8 encoding keeps the Vietnamese text independent of the locale.
with open('/mnt/e/working/vietnamese_tts/data/train_config/metadata.csv', 'w',
          newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter="|")
    # No header row is written; columns are: audio | speaker | language | text.
    writer.writerows(data)