## Transcription API

In [72]:
from time import time
  
def timer(func):
    """Decorator that reports how long each call to ``func`` takes.

    After every successful call the wrapper prints one line of the form
    ``Function '<name>' executed in <seconds>s`` (seconds with four decimals)
    and then returns whatever ``func`` returned. If ``func`` raises, the
    exception propagates unchanged and nothing is printed.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that accepts the same positional and keyword arguments as
        ``func`` and keeps its ``__name__``, ``__doc__`` and other metadata.
    """
    # Imported locally so the decorator does not rely on names pulled in by
    # other notebook cells.
    from functools import wraps
    from time import perf_counter

    @wraps(func)  # keep the wrapped function's name/docstring for introspection
    def wrap_func(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval cannot be skewed
        # by system clock adjustments made while func is running.
        started = perf_counter()
        result = func(*args, **kwargs)
        elapsed = perf_counter() - started
        print(f'Function {func.__name__!r} executed in {elapsed:.4f}s')
        return result
    return wrap_func

In [73]:
# Request payload for transcribe(): the YouTube URL is read interactively
# from standard input when this cell runs.
_input = {}
_input['yt_link'] = input()

@timer
def transcribe(_input):
    """Download a YouTube video's audio, isolate the vocals and transcribe them.

    Steps, in order:
      1. Download the highest-bitrate audio-only MP4 stream to
         ``<cwd>/mp4/<video_id>/<video_id>.mp4``.
      2. Convert it to WAV with the ``audioconvert`` command-line tool.
      3. Split vocals from accompaniment with ``demucs --two-stems=vocals``.
      4. Transcribe the isolated vocal stem with the ``medium`` model loaded
         through ``stable_whisper.load_model``.

    All relative paths are resolved against the current working directory.

    Args:
        _input: Mapping with a ``'yt_link'`` entry holding the video URL.

    Returns:
        dict with three string values:
          * ``'lyric_lines'`` - JSON array of ``{"Line", "Text"}`` records,
            one per transcription segment.
          * ``'word_timestamps'`` - JSON array built from every segment's
            ``'word_timestamps'`` list, with the ``'token'`` column removed.
          * ``'instrumental_filepath'`` - path where the ``no_vocals.wav``
            stem is expected.

    Raises:
        ValueError: If the video exposes no audio-only MP4 stream.
        subprocess.CalledProcessError: If ``audioconvert`` or ``demucs`` exits
            with a non-zero status.
    """
    import os
    import subprocess
    from itertools import chain

    import pandas as pd
    from pytube import YouTube
    from stable_whisper import load_model

    # Download audio stream of YouTube video
    video = YouTube(_input['yt_link'])
    video_id = video.video_id

    audio_stream = (
        video.streams
        .filter(only_audio=True)
        .filter(file_extension='mp4')
        .order_by('abr')
        .last()
    )
    if audio_stream is None:
        # .last() yields None on an empty query; fail with a clear message
        # instead of an AttributeError on the .download() call.
        raise ValueError(f"No audio-only MP4 stream available for video {video_id!r}")
    audio_stream.download(
        output_path=f'{os.getcwd()}/mp4/{video_id}/',
        filename=f'{video_id}.mp4',
    )

    # Convert MP4 to WAV.
    # Argument lists (no shell) keep the video id from being interpreted by a
    # shell, and check=True stops the pipeline at the step that actually failed.
    subprocess.run(
        ["audioconvert", "convert", f"mp4/{video_id}/", "wav/", "--output-format", ".wav"],
        check=True,
    )

    # Split Vocals / Instrumentals
    subprocess.run(
        ["demucs", "--two-stems=vocals", f"wav/{video_id}.wav"],
        check=True,
    )

    # Directory the stems are read from; assumes demucs' default `htdemucs`
    # model and output layout - TODO confirm if the demucs version changes.
    stems_dir = f"{os.getcwd()}/separated/htdemucs/{video_id}"

    # Transcribe Lyrics
    whisper_model = load_model('medium')
    segments = whisper_model.transcribe(f"{stems_dir}/vocals.wav")['segments']

    lyrics = pd.DataFrame(
        [{'Line': segment['id'], 'Text': segment['text']} for segment in segments]
    ).to_json(orient='records')

    # Flatten the per-segment word lists in a single linear pass; this also
    # copes with an empty transcription (no segments at all).
    words = list(chain.from_iterable(segment['word_timestamps'] for segment in segments))
    word_ts = (
        pd.DataFrame(words)
        # errors='ignore' keeps an empty frame (no 'token' column) from raising.
        .drop(columns='token', errors='ignore')
        .to_json(orient='records')
    )

    instrumental_filepath = f"{stems_dir}/no_vocals.wav"

    return {
        'lyric_lines': lyrics,
        'word_timestamps': word_ts,
        'instrumental_filepath': instrumental_filepath
    }
    
# Run the pipeline on the link read into `_input`; as the cell's last
# expression, the returned dict is shown as the cell output.
transcribe(_input)

 https://www.youtube.com/watch?v=olXmoHwblhU


[ INFO    ] Starting conversion of mp4/olXmoHwblhU/.
[ SUCCESS ] See wav for converted audio.
[1mImportant: the default model was recently changed to `htdemucs`[0m the latest Hybrid Transformer Demucs model. In some cases, this model can actually perform worse than previous models. To get back the old default model use `-n mdx_extra_q`.
Selected model is a bag of 1 models. You will see that many progress bars per track.
Separated tracks will be stored in /Users/ptan/Desktop/Harmonai/separated/htdemucs
Separating track wav/olXmoHwblhU.wav


100%|████████████████████████████████████████████████████████████████████████| 175.5/175.5 [01:56<00:00,  1.51seconds/s]


Detected language: english
Function 'transcribe' executed in 230.7385s


{'lyric_lines': '[{"Line":0,"Text":" Hmm"},{"Line":1,"Text":" I"},{"Line":2,"Text":" Waited till I saw the sun"},{"Line":3,"Text":" I don\'t know why I didn\'t come"},{"Line":4,"Text":" Left you by the house of fun. Oh"},{"Line":5,"Text":" Don\'t know why I didn\'t come"},{"Line":6,"Text":" I"},{"Line":7,"Text":" Don\'t know why I didn\'t come"},{"Line":8,"Text":" When I saw the break of day"},{"Line":9,"Text":" I wish that I could fly away"},{"Line":10,"Text":" I"},{"Line":11,"Text":" Instead of kneeling in the sand"},{"Line":12,"Text":" Catching teardrops in my hand my heart is"},{"Line":13,"Text":" Drenched in wine"},{"Line":14,"Text":" You behind my mind forever"},{"Line":15,"Text":" Forever"},{"Line":16,"Text":" ever"},{"Line":17,"Text":" Out across the endless sea"},{"Line":18,"Text":" I would die in ecstasy"},{"Line":19,"Text":" But I\'ll be a beggar bones"},{"Line":20,"Text":" Dropping down this road alone"},{"Line":21,"Text":" My heart is"},{"Line":22,"Text":" Drenched in wine