In [None]:
import re
import tqdm

def _format_timestamp(start_time):
    """Render a time given in seconds as a ``[MM:SS.ss]`` tag."""
    # Round to whole centiseconds *before* splitting into minutes and seconds,
    # so a value such as 59.996 carries into the minute ("[01:00.00]") instead
    # of being rendered as the invalid "[00:60.00]".
    minutes, centiseconds = divmod(int(round(start_time * 100)), 6000)
    return f"[{minutes:02}:{centiseconds / 100:05.2f}]"


def split_by_capitalization(transcription):
    """Group timestamped transcription chunks into lines that start at capitals.

    A new line is started whenever a chunk contains an uppercase letter, or
    when no line is in progress yet. Each line is prefixed with the start time
    of the chunk that opened it, formatted as ``[MM:SS.ss]``. Chunks without
    an uppercase letter are appended to the line in progress, separated by a
    single space.

    If the first uppercase letter of a chunk is not its first character, the
    chunk is cut in front of every uppercase letter: all pieces but the last
    are appended to the line in progress and the last piece opens the new line.

    Uppercase detection uses ``str.isupper`` on each character, so every
    Unicode uppercase letter (including all accented Vietnamese capitals)
    counts, not only a hand-maintained list.

    Args:
        transcription: Mapping with a ``'chunks'`` entry: an iterable of
            mappings, each holding ``'text'`` (str) and ``'timestamp'`` (a
            sequence whose first item is the chunk start time in seconds).

    Returns:
        list[str]: The assembled lines, in order. Empty lines are never
        emitted; an input without chunks yields an empty list.
    """
    result = []
    current_line = ""

    for chunk in transcription['chunks']:
        text = chunk['text'].strip()
        if not text:
            # Whitespace-only chunks carry no lyrics; skipping them avoids
            # stray double spaces and timestamp-only lines.
            continue
        start_time = chunk['timestamp'][0]

        capital_positions = [i for i, ch in enumerate(text) if ch.isupper()]

        if not capital_positions and current_line:
            # Plain continuation of the line in progress.
            current_line += ' ' + text
            continue

        # From here on this chunk opens a new line: it either contains a
        # capital, or nothing has been started yet (lowercase first chunk).
        if capital_positions and capital_positions[0] != 0:
            # Capital in the middle of the chunk: cut in front of every
            # capital, keep the leading pieces on the current line and let
            # the final piece start the new one.
            cuts = [0] + capital_positions + [len(text)]
            pieces = [text[a:b] for a, b in zip(cuts, cuts[1:])]
            current_line = current_line + ' ' + ' '.join(pieces[:-1])
            text = pieces[-1]

        finished = current_line.strip()
        if finished:
            # Guard against emitting the empty initial line.
            result.append(finished)

        current_line = f"{_format_timestamp(start_time)} " + text

    # Flush the last line, if any.
    if current_line:
        result.append(current_line.strip())

    return result


In [3]:
from transformers import pipeline
# The same checkpoint supplies both the model weights and the tokenizer.
MODEL_ID = "xyzDivergence/whisper-medium-vietnamese-lyrics-transcription"

# Speech-recognition pipeline: audio is processed in 30-second chunks, in
# float16 on the CUDA device, with timestamps returned alongside the text.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=MODEL_ID,
    tokenizer=MODEL_ID,
    chunk_length_s=30,
    device='cuda',
    torch_dtype='float16',
    return_timestamps=True,
)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Ask for word-level timestamps on this call so that every returned chunk
# carries its own (start, end) pair.
transcription = asr_pipeline(
    "sample/0 Giờ 2 Phút.mp3",
    return_timestamps="word",
)


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


In [5]:
import pprint
# Pretty-print the raw pipeline result to inspect its structure.
print(pprint.pformat(transcription))

{'chunks': [{'text': 'Trời', 'timestamp': (24.12, 24.68)},
            {'text': ' đêm', 'timestamp': (24.68, 25.26)},
            {'text': ' nay', 'timestamp': (25.26, 25.8)},
            {'text': ' không', 'timestamp': (25.8, 26.36)},
            {'text': ' một', 'timestamp': (26.36, 26.86)},
            {'text': ' vì', 'timestamp': (26.86, 27.36)},
            {'text': ' sao', 'timestamp': (27.36, 28.32)},
            {'text': ' Gió', 'timestamp': (28.32, 28.88)},
            {'text': ' vẫn', 'timestamp': (28.88, 29.4)},
            {'text': ' nhẹ', 'timestamp': (29.4, 29.88)},
            {'text': ' nhàng', 'timestamp': (29.88, 31.72)},
            {'text': ' Thôi', 'timestamp': (31.72, 32.3)},
            {'text': ' sao', 'timestamp': (32.3, 32.66)},
            {'text': ' lòng', 'timestamp': (32.66, 32.98)},
            {'text': ' ta', 'timestamp': (32.98, 33.32)},
            {'text': ' luôn', 'timestamp': (33.32, 33.7)},
            {'text': ' nặng', 'timestamp': (33.7, 34.46)},

In [None]:
# Usage: group the transcription chunks into timestamped lines and print the
# resulting list of strings.
formatted_text = split_by_capitalization(transcription)
print(formatted_text)