<a href="https://colab.research.google.com/github/korneelvdbroek/mp3net/blob/main/colab/dataprep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Description
This script takes a set of audio files and prepares them to be used as training data for MP3net. 

### Usage
The notebook below assumes that the program code is stored on Google Drive and the audio data in a Google Cloud Storage bucket (`gs://`). To use this notebook, look through the cells below for capitalized placeholder tags (e.g. `YOUR_PROJECT_ID`) and replace them with your own values.


In [None]:
# check location of backend
#   (the country reported here is used further down to select a bucket close to the backend)
import subprocess
import json

# Fixed command -> pass an argument list instead of going through a shell.
#   -sS         no progress meter, but still report errors
#   -f          turn HTTP errors into a non-zero exit code
#   --max-time  do not let the cell hang forever when the service is unreachable
# check=True raises CalledProcessError when curl fails, instead of letting the failure
# surface as an obscure JSON decode error on empty output.
proc = subprocess.run(['curl', '-sS', '-f', '--max-time', '30', 'ipinfo.io'],
                      stdout=subprocess.PIPE, check=True)
ip_data = json.loads(proc.stdout)
server_country = ip_data['country']
print(f"Server location:   {ip_data['city']} ({ip_data['region']}), {server_country}\n")

In [None]:
project_id = 'YOUR_PROJECT_ID'
!gcloud config set project {project_id}

# connect to gs://
from google.colab import auth
auth.authenticate_user()

# Connect to Google Drive 
# The program code is assumed to be on Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

# Set environment variable so service accounts gets access to bucket (needed for gspath)
# (for more info see: https://cloud.google.com/docs/authentication/getting-started)
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/content/gdrive/JSON_WITH_SERVICE_ACCOUNT_PRIVATE_KEYS"

In [None]:
### ======================== RUN PARAMETERS ======================= ###
###                                                                 ###
# Bucket -> list of backend countries for which that bucket should be used
BUCKETS = {
    'gs://YOUR_BUCKET_NAME/': ['US'],
}

# Source files on gs://... : location (don't preface with gs://YOUR_BUCKET_NAME) and file type
REMOTE_INPUT_FILEPATH = 'FILEPATH_TO_INPUT_FILES'
INPUT_FILE_EXTENSION = 'mp4'
# Number of input files that go into one .tfrecord file (target 400MiB .tfrecord file)
INPUT_BATCH_SIZE = 42

# Destination on gs://... for the .tfrecord files (don't preface with gs://YOUR_BUCKET_NAME)
DATA_DIR = 'FILEPATH_OF_TFRECORD_FILES'

# Local directory on backend (probably needs a High-RAM runtime type)
LOCAL_INPUT_FILES = 'local/'
###                                                                 ###
### =============================================================== ###

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
print(f"TensorFlow v{tf.__version__}")

In [None]:
import re

# Select the target bucket based on the country of the backend (avoid egress!!!):
# the first bucket whose country list contains the backend's country wins.
target_bucket = next(
    (bucket for bucket, country_lst in BUCKETS.items() if server_country in country_lst),
    None)
if target_bucket is None:
  raise ValueError(f'No target-bucket found for {server_country}')
print(f"Target-bucket:     {target_bucket}")

# Add the target bucket to the directories.
# Guarded so that re-running this cell does not prepend the bucket a second time
# (which would yield paths like gs://bucket/gs://bucket/...).
if not DATA_DIR.startswith(target_bucket):
  DATA_DIR = target_bucket + DATA_DIR
if not REMOTE_INPUT_FILEPATH.startswith(target_bucket):
  REMOTE_INPUT_FILEPATH = target_bucket + REMOTE_INPUT_FILEPATH

In [None]:
# install modules used by the code
!pip install tensorboardx
!pip install soundfile
!pip install tensorflow_addons
!pip install pytube

In [None]:
# Make sure python finds the imports
import sys
sys.path.append('/content/gdrive/PATH_TO/audiocodec')
sys.path.append('/content/gdrive/PATH_TO/mp4net')
sys.path.append('/content/gdrive/PATH_TO/preprocessing')

# local install of audiocodec (only needs to be executed once)
!pip install -e /content/gdrive/PATH_TO/audiocodec

In [None]:
# Copy input data -> local server
#   (only do this when data is not already on local server)
!mkdir ./{LOCAL_INPUT_FILES}
!gsutil -m cp {REMOTE_INPUT_FILEPATH}/* ./{LOCAL_INPUT_FILES}


In [None]:
# ######### #
# DATA PREP #
# ######### #
#
import datetime 
from utils import gspath
from utils import audio_utils
from model import mp4net
import dataprep  

in_filepath = LOCAL_INPUT_FILES
input_file_extension = INPUT_FILE_EXTENSION
out_filepath = DATA_DIR

model = mp4net.MP4netFactory()

temp_filepath = 'local_process/'
!mkdir {temp_filepath}
!rm {temp_filepath}*.*

# group input files in batches
file_pattern = gspath.join(in_filepath, f"*.{input_file_extension}")
audio_file_paths = gspath.findall(file_pattern)
audio_file_paths.sort()

input_batch_size = INPUT_BATCH_SIZE
input_files_batched = [audio_file_paths[i:i + input_batch_size] 
                       for i in range(0, len(audio_file_paths), input_batch_size)]

# loop over batches
for batch_no, batch in enumerate(input_files_batched):
  print()
  print(f'batch {batch_no}')
  tf_output_filename = gspath.join(out_filepath, f'yt-{batch_no:04d}' + f'_sr{model.sample_rate}_Nx{model.freq_n}x{model.channels_n}.tfrecord')

  if gspath.findall(tf_output_filename):
    # skip if output file already exists (maybe from earlier run that crashed)
    print(f'  Output file {tf_output_filename} already exists...')
  else:
    # loop over all songs in batch
    temp_wavs = []
    for song_no, song_filename in enumerate(batch):
      # convert and resample to WAV
      temp_wavfile = temp_filepath + f'yt-{batch_no:04d}-{song_no:02d}.wav'   
      temp_wavs.append(temp_wavfile)
      print(f'  resampling to {model.sample_rate}Hz: {song_filename} -> {temp_wavfile}')

      !ffmpeg -loglevel quiet -i {song_filename} -ar {model.sample_rate} {temp_wavfile} 

    # loop over all songs in batch
    print(f"  {datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')}: {tf_output_filename} <-- {temp_wavs}")

    # convert to tf-record
    dataprep.audio2tfrecord(temp_wavs, tf_output_filename, model)

  !rm {temp_filepath}*.*