<a href="https://colab.research.google.com/github/iamsusiep/slp2019/blob/master/remove_music.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook uses the Spleeter library to remove the background music from trailer audio clips, keeping only the vocals track. The installation steps are a bit unusual in order to ensure that TensorFlow uses the GPU on Google Colab.

In [0]:
# installing this way is done to ensure the GPU is used, rather than the CPU
# this drastically speeds up splitting!
!pip install tensorflow-gpu==1.14
!pip install pandas==0.25.1 requests museval==0.3.0 musdb==0.3.1 norbert==0.2.1 ffmpeg-python importlib-resources
!pip install spleeter --no-deps

Collecting tensorflow-gpu==1.14
[?25l  Downloading https://files.pythonhosted.org/packages/76/04/43153bfdfcf6c9a4c38ecdb971ca9a75b9a791bb69a764d652c359aca504/tensorflow_gpu-1.14.0-cp36-cp36m-manylinux1_x86_64.whl (377.0MB)
[K     |████████████████████████████████| 377.0MB 37kB/s 
Collecting tensorboard<1.15.0,>=1.14.0
[?25l  Downloading https://files.pythonhosted.org/packages/91/2d/2ed263449a078cd9c8a9ba50ebd50123adf1f8cfbea1492f9084169b89d9/tensorboard-1.14.0-py3-none-any.whl (3.1MB)
[K     |████████████████████████████████| 3.2MB 29.7MB/s 
Collecting tensorflow-estimator<1.15.0rc0,>=1.14.0rc0
[?25l  Downloading https://files.pythonhosted.org/packages/3c/d5/21860a5b11caf0678fbc8319341b0ae21a07156911132e0e71bffed0510d/tensorflow_estimator-1.14.0-py2.py3-none-any.whl (488kB)
[K     |████████████████████████████████| 491kB 72.6MB/s 
[31mERROR: tensorflow 1.15.0 has requirement tensorboard<1.16.0,>=1.15.0, but you'll have tensorboard 1.14.0 which is incompatible.[0m
[31mERROR: te

Collecting spleeter
  Downloading https://files.pythonhosted.org/packages/d0/8d/3951d180a89ea4d860d3a36bb505b22a2737a3bd28400ab51c901b070d99/spleeter-1.4.3.tar.gz
Building wheels for collected packages: spleeter
  Building wheel for spleeter (setup.py) ... [?25l[?25hdone
  Created wheel for spleeter: filename=spleeter-1.4.3-cp36-none-any.whl size=44129 sha256=b6847abf028a4ff20c948cdf75ed55e7358d37c42efee431207646612345ecb3
  Stored in directory: /root/.cache/pip/wheels/1f/d2/db/24b01f77333fa5446dc73d98b93f1163de91eb2d0d6a63395b
Successfully built spleeter
Installing collected packages: spleeter
Successfully installed spleeter-1.4.3


In [0]:
import numpy as np
import pandas as pd
import glob
from os.path import basename

from google.colab import drive
drive.mount('/content/gdrive')

! mkdir '/content/gdrive/My Drive/no_music'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
mkdir: cannot create directory ‘/content/gdrive/My Drive/no_music’: File exists


In [0]:
# Load the four source tables from Google Drive:
#   1. youtubeIDs   - YouTube trailer ids per MovieLens movie
#   2. links        - IMDb ids for those movies
#   3. titleRatings - IMDb ratings file (only tconst and numVotes are read)
#   4. titleBase    - IMDb basic title metadata (only tconst and runtimeMinutes are read)
youtubeIDs = pd.read_csv('/content/gdrive/My Drive/ml-youtube.csv') # https://grouplens.org/datasets/movielens/20m-youtube/
links = pd.read_csv('/content/gdrive/My Drive/links.csv') # http://files.grouplens.org/datasets/movielens/ml-20m-README.html
titleRatings = pd.read_csv('/content/gdrive/My Drive/title.ratings.tsv', sep='\t', usecols = ['tconst', 'numVotes']) # https://datasets.imdbws.com/
# Drop the first two characters of tconst and cast the rest to int so it can be
# joined against links' imdbId column below.
titleRatings['tconst'] = titleRatings['tconst'].str[2:].astype(int)
titleBase = pd.read_csv('/content/gdrive/My Drive/title.basics.tsv', sep='\t', usecols = ['tconst', 'runtimeMinutes']) # https://datasets.imdbws.com/
titleBase['tconst'] = titleBase['tconst'].str[2:].astype(int)

# Merge them together (inner joins: only movies present in every table survive).
yt_links = pd.merge(youtubeIDs, links, on = 'movieId')
yt_links_ratings = pd.merge(yt_links, titleRatings, left_on='imdbId', right_on = 'tconst')
final = pd.merge(yt_links_ratings, titleBase, on = 'tconst')

# Keep titles with more than 5000 votes and a runtime above 40 minutes.
# runtimeMinutes holds the placeholder "\N" for missing values; to_numeric with
# errors='coerce' turns that (and any other non-numeric value) into NaN, which
# fails the "> 40" test, so such rows are dropped. Unlike a string replace
# followed by astype(int), this also works if the column has mixed dtypes.
final = final[
  (final['numVotes'] > 5000)
  & (pd.to_numeric(final['runtimeMinutes'], errors='coerce') > 40)
]

# Free up some memory: only `final` is needed from here on.
del youtubeIDs, links, titleRatings, titleBase, yt_links_ratings, yt_links

In [0]:
# Get the raw WAV audio files that belong to one of the selected trailers.
yt_links = final['youtubeId'].values
# A file is kept when any selected YouTube id occurs in its path. Using any()
# lists each file once, even if the same id appears several times in `final`
# (a nested comprehension would repeat the path once per matching id and make
# the separation loop redo the same clip).
samples = [
  f for f in glob.glob("/content/gdrive/My Drive/audio/*.wav")
  if any(yt_link in f for yt_link in yt_links)
]

In [0]:
# Build an index of the clips that already have separated output on Drive.
from os.path import basename

existing_files = glob.glob("/content/gdrive/My Drive/no_music/*.wav")

# pulledAudio is a dict used as a set: the keys are clip names, every value is None.
pulledAudio = {}
for wav_path in existing_files:
  # entry name with everything from the first '.wav' onwards removed
  stem = basename(wav_path).split('.wav')[0]
  # entries carrying the 'no_music_' marker are keyed by what follows the
  # marker; all other entries are keyed by the stem itself
  key = stem.split('no_music_')[1] if 'no_music_' in wav_path else stem
  pulledAudio[key] = None

In [0]:
# get_default_audio_adapter is not used by the loop below; the import is kept
# so the name stays available in the notebook namespace.
from spleeter.utils.audio.adapter import get_default_audio_adapter
from spleeter.separator import Separator

# Using embedded configuration.
separator = Separator('spleeter:2stems')
# Not passed to separate_to_file below; kept for any cell that reads it.
sample_rate = 44100

# Only run splitting for WAV files that have no separated output yet.
for sample in samples:
  # First selected YouTube id that occurs in this file's path, if any.
  yt_link = next((yt for yt in yt_links if yt in sample), None)
  if yt_link is None:
    continue
  # Skip clips whose id already shows up among the separated outputs.
  if any(yt_link in x for x in pulledAudio):
    continue

  # Despite the '.wav' suffix, the run log shows Spleeter writing
  # <output_file>/vocals.wav and <output_file>/accompaniment.wav, i.e. this
  # path ends up being a folder; a later cell moves the vocals out of it.
  output_file = '/content/gdrive/My Drive/no_music/' + 'no_music_' + yt_link + '.wav'
  try:
    # duration=180 limits how much of each clip is processed (presumably seconds).
    separator.separate_to_file(sample, output_file, duration = 180)
  except Exception as err:
    # Best effort: report the failure and carry on with the next clip.
    # Catching Exception (not a bare except) lets KeyboardInterrupt through,
    # so the loop can still be stopped by hand.
    print('Skipping ' + sample + ': ' + repr(err))
    continue
  # Remember the clip so another file matching the same id is not separated again.
  pulledAudio[yt_link] = None

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


INFO:spleeter:Downloading model archive https://github.com/deezer/spleeter/releases/download/v1.4.0/2stems.tar.gz
INFO:spleeter:Extracting downloaded 2stems archive
INFO:spleeter:2stems model file(s) extracted


INFO:tensorflow:Using config: {'_model_dir': 'pretrained_models/2stems', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': gpu_options {
  per_process_gpu_memory_fraction: 0.7
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f72ca593240>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.

INFO:tensorflow:Apply unet for vocals_spectrogram
INFO:tensorflow:Apply unet for accompaniment_spectrogram
INFO:tensorflow:Done calling mode

INFO:spleeter:File /content/gdrive/My Drive/no_music/no_music_N2dtpnUk4ls.wav/accompaniment.wav written
INFO:spleeter:File /content/gdrive/My Drive/no_music/no_music_N2dtpnUk4ls.wav/vocals.wav written
INFO:spleeter:File /content/gdrive/My Drive/no_music/no_music_l15ydlXoHt8.wav/accompaniment.wav written
INFO:spleeter:File /content/gdrive/My Drive/no_music/no_music_l15ydlXoHt8.wav/vocals.wav written
INFO:spleeter:File /content/gdrive/My Drive/no_music/no_music_m3q2YzKp138.wav/accompaniment.wav written
INFO:spleeter:File /content/gdrive/My Drive/no_music/no_music_m3q2YzKp138.wav/vocals.wav written
INFO:spleeter:File /content/gdrive/My Drive/no_music/no_music_mXn8CsQU1-U.wav/accompaniment.wav written
INFO:spleeter:File /content/gdrive/My Drive/no_music/no_music_mXn8CsQU1-U.wav/vocals.wav written
INFO:spleeter:File /content/gdrive/My Drive/no_music/no_music_pKEP4yQxvOA.wav/accompaniment.wav written
INFO:spleeter:File /content/gdrive/My Drive/no_music/no_music_pKEP4yQxvOA.wav/vocals.wav wri

In [0]:
# Replace the folders that have been created and just keep the vocals:
# <no_music>/no_music_<name>/vocals.wav is moved to <no_music>/<name>.
import shutil
import os
subfolders = [f.path for f in os.scandir('/content/gdrive/My Drive/no_music/') if f.is_dir()]
for subfolder in subfolders:
  folder_name = os.path.basename(subfolder)
  # Only folders carrying the 'no_music_' prefix are handled; any other
  # directory is skipped instead of breaking the loop.
  if not folder_name.startswith('no_music_'):
    continue
  # Same parent directory, folder name without the prefix.
  target = os.path.join(os.path.dirname(subfolder), folder_name[len('no_music_'):])
  try:
    shutil.move(os.path.join(subfolder, 'vocals.wav'), target)
  except OSError as err:
    # Best effort (e.g. the folder has no vocals.wav): report and continue.
    # shutil.Error is a subclass of OSError, so it is covered as well.
    print('Could not move vocals from ' + subfolder + ': ' + repr(err))
  else:
    print(target)

/content/gdrive/My Drive/no_music/N2dtpnUk4ls.wav
/content/gdrive/My Drive/no_music/l15ydlXoHt8.wav
/content/gdrive/My Drive/no_music/m3q2YzKp138.wav
/content/gdrive/My Drive/no_music/mXn8CsQU1-U.wav
/content/gdrive/My Drive/no_music/pKEP4yQxvOA.wav
/content/gdrive/My Drive/no_music/q6yObNphIPE.wav
/content/gdrive/My Drive/no_music/q-jlmaYX2n0.wav
/content/gdrive/My Drive/no_music/r0MtQ1s_A_4.wav
/content/gdrive/My Drive/no_music/rEbM3tjV-3Y.wav
/content/gdrive/My Drive/no_music/ssQu16utnR8.wav
/content/gdrive/My Drive/no_music/t861z2ww0-Q.wav
/content/gdrive/My Drive/no_music/tGEveEqpjYk.wav
/content/gdrive/My Drive/no_music/tzwQWUYtiUE.wav
/content/gdrive/My Drive/no_music/v8y0YIQrhNw.wav
/content/gdrive/My Drive/no_music/Py8oGU5CLn0.wav
/content/gdrive/My Drive/no_music/vb398BOlv0Q.wav
/content/gdrive/My Drive/no_music/xqwlIaOyBSA.wav
/content/gdrive/My Drive/no_music/yyhRIQdX7kI.wav
/content/gdrive/My Drive/no_music/z4zNSx42aEo.wav
/content/gdrive/My Drive/no_music/RJBfsiqc98o.wav


In [0]:
# Delete every remaining sub-folder of the output directory, together with
# whatever it still contains.
# shutil.rmtree replaces the shell `rm -rf`: no hand-rolled escaping of the
# path is needed, and ignore_errors=True keeps the "never fail" behaviour of -f.
import os
import shutil

subfolders = [f.path for f in os.scandir('/content/gdrive/My Drive/no_music/') if f.is_dir()]
for subfolder in subfolders:
  shutil.rmtree(subfolder, ignore_errors=True)