In [1]:
import os
import json
import pprint
import re

**RENDER**

In [2]:
def convert_timestamp(seconds_start, seconds_end=None):
    """Format a segment's start time as a ``[MM:SS.SS]`` tag.

    Args:
        seconds_start: Start of the segment in seconds (int or float).
        seconds_end: End of the segment. Accepted so existing callers keep
            working, but it is not rendered and may be ``None``.

    Returns:
        The start time as ``"[MM:SS.SS]"``. Minutes are not wrapped into
        hours, so 3600 seconds renders as ``"[60:00.00]"``.
    """
    minutes, remainder = divmod(seconds_start, 60)
    minutes = int(minutes)

    # Round to the displayed precision *before* formatting so that a value
    # such as 59.999 carries into the minutes instead of printing "00:60.00".
    remainder = round(remainder, 2)
    if remainder >= 60:
        minutes += 1
        remainder -= 60

    return f"[{minutes:02}:{remainder:05.2f}]"

def split_combined_word(word):
    """Split a run-together word at each uppercase letter.

    Every uppercase letter starts a new piece and the lowercase letters that
    follow are attached to it; any other character (digits, spaces,
    punctuation) ends the current piece and is dropped. Case is decided with
    ``str.isupper`` / ``str.islower`` rather than the ASCII ranges ``[A-Z]`` /
    ``[a-z]``, so accented letters are kept instead of being discarded.

    Args:
        word: The string to split.

    Returns:
        A list of the pieces, in order. Empty for an empty string.
    """
    import unicodedata

    pieces = []
    current = ''
    for char in word:
        if char.isupper():
            # An uppercase letter always opens a new piece.
            if current:
                pieces.append(current)
            current = char
        elif char.islower() or (current and unicodedata.combining(char)):
            # Combining marks (decomposed accents) stay with their base letter.
            current += char
        else:
            if current:
                pieces.append(current)
            current = ''
    if current:
        pieces.append(current)
    return pieces

def chunk_lyrics(chunks, segment_length=20):
    """Merge consecutive timestamped segments into longer lyric chunks.

    A chunk's window opens at the start time of its first segment. Following
    segments are appended while ``window_start + segment_length`` is at least
    the segment's end time; the first segment that ends later closes the
    chunk and opens a new one. Chunks do not overlap.

    The final segment of the input is always folded into the chunk that is
    open at that point, whatever its timestamps are, and its text is appended
    without a trailing space.

    Args:
        chunks: Sequence of dicts with a ``'text'`` string and a
            ``'timestamp'`` pair ``[start, end]``.
        segment_length: Maximum span, in the same unit as the timestamps,
            between a chunk's start and the end of any non-final segment
            merged into it.

    Returns:
        A list of new dicts ``{'text': str, 'timestamp': [start, end]}``.
        The input dicts are not modified. Empty input yields an empty list.
    """
    output_chunks = []
    if not chunks:
        return output_chunks

    last_index = len(chunks) - 1
    current = None
    window_start = None

    for index, text_dict in enumerate(chunks):
        start_sec = text_dict['timestamp'][0]
        end_sec = text_dict['timestamp'][1]
        text = text_dict['text']

        if index == 0:
            current = {
                # A lone segment is also the final one, so it gets no
                # trailing separator.
                'text': text if last_index == 0 else text + ' ',
                'timestamp': [start_sec, end_sec],
            }
            window_start = start_sec

        elif index == last_index:
            # Checked before the window test so the final segment's end time
            # is never compared against the window.
            current['text'] += text
            current['timestamp'][1] = end_sec

        elif window_start + segment_length >= end_sec:
            current['text'] += text + ' '
            current['timestamp'][1] = end_sec

        else:
            # Segment ends outside the window: close this chunk, open a new one.
            output_chunks.append(current)
            current = {
                'text': text + ' ',
                'timestamp': [start_sec, end_sec],
            }
            window_start = start_sec

    # Flush the chunk still open after the loop. Doing it here (rather than
    # only when a distinct "last" segment is seen) also covers single-segment
    # input, which previously produced no output at all.
    output_chunks.append(current)
    return output_chunks
    
def output_render(
    json_result,
    format: str = 'phowhisper',
    segment_length: int = 30,
    ):
    """Render a transcription as one ``[MM:SS.SS] text`` line per segment.

    Args:
        json_result: For ``format='phowhisper'``, an iterable of dicts with
            ``'text'`` and a ``'timestamp'`` pair. For ``format='whisper'``,
            a dict whose ``'chunks'`` entry is such a list; the chunks are
            first merged with ``chunk_lyrics``.
        format: ``'phowhisper'`` or ``'whisper'``.
        segment_length: Window passed to ``chunk_lyrics`` when
            ``format='whisper'``; ignored otherwise.

    Returns:
        The rendered lines joined with newlines (an empty string when there
        are no segments).

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    if format == 'phowhisper':
        return '\n'.join(
            convert_timestamp(segment['timestamp'][0], segment['timestamp'][1])
            + ' '
            + segment['text']
            for segment in json_result
        )

    if format == 'whisper':
        merged_chunks = chunk_lyrics(
            json_result['chunks'],
            segment_length=segment_length,
        )
        return output_render(merged_chunks, format='phowhisper')

    # Fail with a clear message instead of an UnboundLocalError at `return`.
    raise ValueError(
        f"Unsupported format {format!r}; expected 'phowhisper' or 'whisper'."
    )
    

In [5]:
# phowhisper_output = '/home/anh/Documents/vietnamese-song-scraping/out/PhoWhisper-small/validation-audio-100-pp_nospeech-remove/Ai Về Quảng Ngãi.mp3.json'

# Render one Whisper transcription as a quick visual check of output_render.
output = '/home/anh/Documents/vietnamese-song-scraping/out/whisper-large-v2-100-output-test-utf8/Ai Về Quảng Ngãi.json'
# The lyrics are Vietnamese: decode as UTF-8 explicitly rather than relying
# on the platform's default encoding.
with open(output, 'r', encoding='utf-8') as f:
    data = json.load(f)

rendered_output = output_render(data, format='whisper')
print(rendered_output)

[00:29.98] Bài  Hát  Cuối  Ca  Sĩ  &  Đàn  Ông  HồngNgười  ra  điEm  theo  anh  về  Quảng  Ngãi  một  lần  được  không 
[00:58.78]  Qua  cầu  Bổ  Ly  Trưa  nắng  hạ  là  nước  long  lanh  Ngồi  kề  bên  anh  Nôn  nao  ngắm  ngó  Thương  sao  đất  lành  Đi  qua  nghĩa  hạnh  Qua  sơn  tình  tư  nghĩa  mộng  mơ  Chiều  bến  Tam  Thương  Thương  sông 
[01:28.78]  Trà  hàng  ghế  che  bờ  Đẹp  tình  ba  tơ  Thương  sông  núi  Minh  Long  Sơn  Hà  Em  theo  anh  qua  những  con  đường  Mang  bao  di  tích  đời  đời  Quê  tôi  mồ  đực  đây  rồi  Chiều  quê 
[01:58.08]  chiều  quê  Thương  ai  ai  cấy  mà  non  Thương  anh  Thương  anh  Quảng  Ngãi  em  về  Sơn  tay  giữ  mãi  câu  thề  Trà  Bồng  dịu  dàng  nắng  trong  Bình  Sơn  Yêu  sao  tiếng  hát  nhớ  nhau  Thương  anh  Em  về  cùng 
[02:27.68]  anh  Qua  bao  năm  xuôi  ngược  Nay  trở  về  Quảng  Ngãi  quê  tôi  Qua  Lý  Sơn  xưa  Nay  thuyền  bè  sum  đúc  đôi  bờ  Đức  Phố  mẹ  tôi  Còn  đứng  đó  trong  đợi  con  về  Quảng  Ngãi  

**CORRECTION**

In [None]:
from utils.llm_corrector import load_llm, generate_corrections
from glob import glob
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Identifier handed to the project's load_llm helper.
LLM_MODEL_NAME = 'gemini-1.0-pro-latest'
llm_model = load_llm(LLM_MODEL_NAME)


In [7]:
# Source transcriptions (Whisper JSON) and destination for the rendered lyrics.
source_dir = '/home/anh/Documents/vietnamese-song-scraping/out/whisper-large-v2-100-output-test'
dest_dir = '/home/anh/Documents/vietnamese-song-scraping/out/whisper-large-v2-100-output-test-txt'

# Hand-picked subset to render. To process every transcription instead, use:
#     original_lyrics = glob(os.path.join(source_dir, '*.json'))
lyric_file_names = [
    'Tình Xa Ngàn Khơi.json',
    'Em Đã Thương Người Ta Hơn Anh.json',
    'Giấc Mơ Nồng.json',
    'Đồ Ngốc Anh Yêu Em.json',
    'Hen Gặp Lại Trong Chiêm Bao.json',
    'Sợ Bắt Đầu.json',
    'Giấc Mơ Mình Em.json',
    'Có Những Yêu Thương Nào.json',
    'Ánh Trăng Bên Thềm.json',
    'Chút Nắng Cuối Đông.json',
    'Vì Có Khi Đôi Tay.json',
    'Anh Nhớ Em Nhiều Lắm.json',
    'Ngoảnh Mặt.json',
    'Dâng Mẹ Quan Thế Âm.json',
    'Little Sài Gòn.json',
    'Giữa Hai Vì Sao.json',
    'Đạo Hiếu Vu Lan.json',
    'Yêu Mãi.json',
]
original_lyrics = [os.path.join(source_dir, name) for name in lyric_file_names]

os.makedirs(dest_dir, exist_ok=True)

# Inputs whose LLM correction failed; only filled by the disabled step below.
sexy_files = []

for original_lyric in tqdm.tqdm(original_lyrics):
    base_name = os.path.splitext(os.path.basename(original_lyric))[0]
    dest_file = os.path.join(dest_dir, base_name + '.txt')
    # Check before reading the input so already-rendered files cost nothing.
    if os.path.exists(dest_file):
        continue

    # Vietnamese text: be explicit about UTF-8 on both read and write.
    with open(original_lyric, 'r', encoding='utf-8') as f:
        transcription_json = json.load(f)

    transcription_result = output_render(transcription_json, format='whisper')

    # LLM correction is currently disabled. To re-enable:
    # try:
    #     transcription_result = generate_corrections(llm_model, transcription_result)
    # except Exception:
    #     print(f'failed to correct {original_lyric}')
    #     sexy_files.append(original_lyric)
    #     continue

    with open(dest_file, 'w', encoding='utf-8') as f:
        f.write(transcription_result)


100%|██████████| 18/18 [00:00<00:00, 581.16it/s]
