In [None]:
# Environment setup: install the tools the later cells shell out to or import.
# yt-dlp fetches the audio track; demucs splits it into vocals / no_vocals stems.
!pip install yt-dlp demucs
# Remove the torch companion packages before pinning torch below
# (presumably to avoid version conflicts with preinstalled builds — TODO confirm).
!pip uninstall torchtext torchvision torchdata -y
# Pin a matching torch / torchvision / torchaudio trio.
!pip install torch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1  
# whisperx is installed from the Git default branch (unpinned).
# NOTE(review): `whisper` is imported later but never installed explicitly here —
# presumably it arrives as a whisperx dependency; verify.
!pip install git+https://github.com/m-bain/whisperx.git

In [None]:
import yt_dlp
import whisperx
import whisper
import torch
import wave
import locale
from yt_karaoke.ass_parser import write_ass_file
# Workaround for https://github.com/googlecolab/colabtools/issues/3409:
# force the preferred encoding to UTF-8. The replacement accepts (and ignores)
# any arguments so that callers using the standard signature, e.g.
# locale.getpreferredencoding(False), keep working instead of raising TypeError.
locale.getpreferredencoding = lambda *args, **kwargs: "UTF-8"

# --- User-tunable settings -------------------------------------------------
# Change youtube here
YT_ID = 'SWFA0d2vIUk'
# Model name handed to whisper.load_model() in the transcription cell.
whisper_model = "large"
# Model name handed to whisperx.load_align_model() in the transcription cell.
alignment_model = 'WAV2VEC2_ASR_LARGE_LV60K_960H'

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"using device={device}")

In [None]:
# Fetch the audio track for YT_ID from YouTube and store it as data/<id>.mp3.
video_url = f"https://www.youtube.com/watch?v={YT_ID}"

# Post-processing step: have FFmpeg extract the audio as mp3.
audio_extractor = {'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3'}

ydl_opts = dict(
    format='mp3/bestaudio/best',
    outtmpl='data/%(id)s.%(ext)s',  # file name is derived from the video id
    postprocessors=[audio_extractor],
)

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    output = ydl.download([video_url])

In [None]:
# Separate vocals from track
cmd = f"demucs --device={device} {'--shifts=10' if device == 'cuda' else ''} --two-stems=vocals --out=data/separated/{YT_ID} data/{YT_ID}.mp3"
!{cmd}

In [None]:
# Transcribe with Whisper
# The demucs cell writes to the *relative* directory data/separated/{YT_ID}, so
# the stems are located relative to the working directory as well. (The previous
# hard-coded "/content/" prefix only resolved when the notebook ran from /content.)
# "htdemucs" is the model-name subfolder the path expects — assumes demucs ran
# with that model; verify if the demucs version or model changes.
separated_path = f"data/separated/{YT_ID}/htdemucs/{YT_ID}"
audio_file = f"{separated_path}/vocals.wav"

# Run speech recognition on the isolated vocal stem only.
model = whisper.load_model(whisper_model, device)
result = model.transcribe(audio_file)

# Realign with Whisperx
# The alignment model is loaded for the language Whisper reported, then the
# Whisper segments are re-aligned against the same vocal stem.
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device, model_name=alignment_model)
transcript = whisperx.align(result["segments"], model_a, metadata, audio_file, device)

In [None]:
# Hand the aligned transcript to write_ass_file, targeting "<YT_ID>.ass" in the
# working directory (presumably it serialises the transcript as ASS subtitles
# at that path — confirm against yt_karaoke.ass_parser).
subtitles_path = YT_ID + ".ass"
write_ass_file(transcript, subtitles_path)

In [None]:
# Encode karaoke video
instrumental_path = f"{separated_path}/no_vocals.wav"
with wave.open(instrumental_path, 'r') as f:
    frames = f.getnframes()
    rate = f.getframerate()
    duration = round(frames / float(rate), 2)

cmd = f"ffmpeg -y -f lavfi -i color=size=1280x720:duration={duration}:rate=24:color=black -i {instrumental_path} -vf \"ass={subtitles_path}\" -shortest -c:v libx264 -c:a aac -b:a 192k karaoke.mp4"
!{cmd}

print("karaoke.mp4 ready!")