In [None]:
#@title 0: Check GPU

# Print the NVIDIA driver/GPU status of this runtime. If the command is missing
# or lists no device, the runtime presumably has no GPU attached.
!nvidia-smi

In [None]:
#@title 1: Download training model

!wget -N "https://huggingface.co/uki-kun/jokowi-so-vits-svc-model/resolve/main/G_1528.pth" -P model/jokowi/
!wget -N "https://huggingface.co/uki-kun/jokowi-so-vits-svc-model/resolve/main/config.json" -P model/jokowi/

In [None]:
#@title 2: Install library for "downloading YouTube video and converting it to .wav"

!pip install yt_dlp
!pip install ffmpeg
!mkdir yt_audio

In [None]:
#@title 3: Install Demucs for separating vocal

!python3 -m pip install -U demucs

In [None]:
#@title 4: Install dependencies for AI (may take a long time)

!python -m pip install -U pip wheel
%pip install -U ipython
%pip install -U so-vits-svc-fork

In [None]:
#@title 5: Download YouTube video and convert it to .wav

from __future__ import unicode_literals
import yt_dlp
import ffmpeg
import sys

# Default yt-dlp options: take the best audio stream and let the
# FFmpegExtractAudio post-processor convert it to WAV.
ydl_opts = {
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
    }],
    # No extension on purpose: step 6 expects the converted file at
    # yt_audio/audio.wav.
    "outtmpl": 'yt_audio/audio',
}


def download_from_url(url, options=None):
    """Download the audio track of `url` and convert it to WAV.

    The function owns its YoutubeDL instance, so it can be called from any
    cell without relying on a downloader object created elsewhere.

    Args:
        url: Address of the video to download.
        options: Optional yt-dlp option dict; defaults to the module-level
            `ydl_opts`.

    Raises:
        ValueError: If `url` is empty or only whitespace.
    """
    if not url or not url.strip():
        raise ValueError("A non-empty video URL is required")
    chosen_options = ydl_opts if options is None else options
    with yt_dlp.YoutubeDL(chosen_options) as downloader:
        downloader.download([url.strip()])


url = "https://www.youtube.com/watch?v=01_RTOD9hYc" #@param {type:"string"}
download_from_url(url)


In [None]:
#@title 6: Separate vocal from instrument

import subprocess
AUDIO_INPUT = "/content/yt_audio/audio.wav" #@param {type:"string"}

# Build the argument vector directly instead of splitting a formatted string:
# splitting on whitespace would break any input path that contains a space.
command = ["demucs", "--two-stems=vocals", AUDIO_INPUT]
result = subprocess.run(
    command,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)
print(result.stdout)

# Fail loudly here; otherwise the problem only surfaces in step 7 as a
# missing vocals.wav with no hint about the cause.
if result.returncode != 0:
    raise RuntimeError(
        f"demucs exited with status {result.returncode}:\n{result.stderr}"
    )


In [None]:
#@title 7: Inference

from IPython.display import Audio

AUDIO = "/content/separated/htdemucs/audio/vocals.wav" #@param {type:"string"}
MODEL = "/content/model/jokowi/G_1528.pth" #@param {type:"string"}
CONFIG = "/content/model/jokowi/config.json" #@param {type:"string"}
#@markdown Change according to the voice tone. 12 = 1 Octave
PITCH_MOD = 0 #@param {type:"integer"}

!svc infer {AUDIO} -c {CONFIG} -m {MODEL} -na -t {PITCH_MOD}
# Try comment this line below if you got Runtime Error
try:
  display(Audio(f"{AUDIO}.out.wav", autoplay=True))
except Exception as e:  print("Error:", str(e))

In [None]:
#@title 8: Combine Vocal and Instrument (Song Cover)
!pip install pydub
from pydub import AudioSegment

FILENAME = "final_cover.mp3" #@param {type: "string"}
VOCAL = "/content/separated/htdemucs/audio/vocals.out.wav" #@param {type:"string"}
INSTRUMENT = "/content/separated/htdemucs/audio/no_vocals.wav" #@param {type:"string"}

sound1 = AudioSegment.from_file(VOCAL)
sound2 = AudioSegment.from_file(INSTRUMENT)

combined = sound1.overlay(sound2)
if not FILENAME.endswith(".mp3"): FILENAME = FILENAME + '.mp3'

combined.export(f"/content/{FILENAME}.mp3", format='mp3')
try:
  display(Audio(f"/content/{FILENAME}.mp3", autoplay=True))
except Exception as e:  print("Error:", str(e))

In [None]:
#@title Additional: Install module for audio recording

!pip install ffmpeg-python

In [None]:
#@title Record audio

"""
To write this piece of code I took inspiration/code from a lot of places.
It was late night, so I'm not sure how much I created or just copied o.O
Here are some of the possible references:
https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/
https://stackoverflow.com/a/18650249
https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/
https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/
https://stackoverflow.com/a/49019356
"""
from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
from scipy.io.wavfile import read as wav_read
import io
import ffmpeg

# Browser-side recorder widget. It exposes a global promise named `data` that
# settles with the recording as a base64 data URL once the FileReader has
# finished (instead of after a fixed delay), or rejects when the microphone
# cannot be opened or the recording cannot be read.
AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Waiting for microphone permission...");

my_btn.appendChild(t);
my_btn.disabled = true;
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var recorder, gumStream;
var recordButton = my_btn;

var data = new Promise((resolve, reject) => {

  var handleSuccess = function(stream) {
    gumStream = stream;
    recorder = new MediaRecorder(stream);
    recorder.ondataavailable = function(e) {
      var url = URL.createObjectURL(e.data);
      var preview = document.createElement('audio');
      preview.controls = true;
      preview.src = url;
      document.body.appendChild(preview);

      var reader = new FileReader();
      reader.onloadend = function() {
        // Hand the result to Python only once it is completely read.
        recordButton.innerText = "Recording saved";
        resolve(reader.result.toString());
      };
      reader.onerror = function() {
        reject(reader.error);
      };
      reader.readAsDataURL(e.data);
    };
    recorder.start();
    // Only claim to be recording once the recorder has really started.
    recordButton.disabled = false;
    recordButton.innerText = "Recording... press to stop";
  };

  navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess).catch(reject);

  recordButton.onclick = function() {
    if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.disabled = true;
      recordButton.innerText = "Saving the recording... pls wait!";
    }
  };
});

</script>
"""

def get_audio():
  """Record audio in the browser and return it as samples plus sample rate.

  Shows the recorder widget (AUDIO_HTML), waits for the browser-side `data`
  promise, decodes the base64 payload, converts it to WAV through ffmpeg
  pipes and parses the result with scipy.

  Returns:
    A tuple `(audio, sr)`: the sample array and the sample rate, as produced
    by `scipy.io.wavfile.read`.

  Raises:
    RuntimeError: If the browser returned no usable recording or ffmpeg could
      not convert it.
  """
  from IPython.display import display

  display(HTML(AUDIO_HTML))
  data = eval_js("data")

  # A usable result is a data URL of the form "<header>,<base64 payload>".
  if not isinstance(data, str) or ',' not in data:
    raise RuntimeError("No audio was captured; run the cell again and allow microphone access.")
  binary = b64decode(data.split(',', 1)[1])

  process = (ffmpeg
    .input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
  )
  output, err = process.communicate(input=binary)

  # Without this check a failed conversion only shows up later as an obscure
  # WAV parsing error.
  if process.returncode != 0 or len(output) < 8:
    details = (err or b"").decode(errors="replace")
    raise RuntimeError("ffmpeg could not convert the recording to WAV:\n" + details)

  # Replace bytes 4:8 of the WAV header with the actual size of the RIFF chunk
  # (little-endian, 4 bytes). Presumably ffmpeg cannot fill this field in
  # itself when writing to a pipe -- TODO confirm.
  riff_chunk_size = (len(output) - 8) & 0xFFFFFFFF
  riff = output[:4] + riff_chunk_size.to_bytes(4, 'little') + output[8:]

  sr, audio = wav_read(io.BytesIO(riff))

  return audio, sr

In [None]:
# Show the recorder widget and wait for a recording: `audio` receives the
# samples and `sr` the sample rate returned by get_audio().
audio, sr = get_audio()

In [None]:
from scipy.io import wavfile
FILENAME = "recording.wav" #@param {type:"string"}
# Store the recorded samples (`audio`) at their sample rate (`sr`) as a WAV
# file; both values come from the get_audio() call in the previous cell.
wavfile.write(filename=FILENAME, rate=sr, data=audio)