In [None]:
# PREP STEP 0
# to begin with, we have 10 total speakers, for each we have:
# - 1 full clip of their real voice
# - 19 sentence clips of their cloned voice reading the rainbow passage
# - 12 sentence clips of their cloned voice reading two other passages
#   TODO: what are we using for testing vs training? I think there is a 
#         misunderstanding with the data... I think intuitively, the
#         "novel" data to use for testing is different voices. So, in this
#         case, we either need more voices, or we can put aside 2 speakers
#         for testing and 1 speaker for validating.

# data directory structure
# |-- raw_data
# |   |-- Speaker 1 Real
# |   |-- Speaker 1 Rainbow Passage
# |   |-- Speaker 1 Test/Validation
# |   |-- ...
# |   |-- Speaker 10 Real
# |   |-- Speaker 10 Rainbow Passage
# |   |-- Speaker 10 Test/Validation



# TODO: Zack's responsibility
# PREP STEP 1
# pick 2 speakers for test data
# pick 1 speaker for validation data
# take all data and use the forced aligner charsiu to extract all fricatives
# store all clips of fricatives as wav files

# data directory structure
# |-- raw_data
# |-- wav_data
# |   |-- test
# |   |   |-- clone
# |   |   |   |-- s9_f1.wav 
# |   |   |   |-- s9_f2.wav 
# |   |   |   |-- ... 
# |   |   |   |-- s10_f1.wav 
# |   |   |   |-- s10_f2.wav 
# |   |   |   |-- ... 
# |   |   |-- natur
# |   |   |   |-- s9_f1.wav 
# |   |   |   |-- s9_f2.wav 
# |   |   |   |-- ... 
# |   |   |   |-- s10_f1.wav 
# |   |   |   |-- s10_f2.wav 
# |   |   |   |-- ... 
# |   |-- train
# |   |   |-- clone (clips for s1-s7)
# |   |   |-- natur (clips for s1-s7)
# |   |-- val
# |   |   |-- clone (clips for s8)
# |   |   |-- natur (clips for s8)



# PREP STEP 2
# take all wav files and convert to spectrograms, keep exact structure
# use librosa to convert wav files to melspectrograms
# manually normalize and convert to image, save

# data directory structure
# |-- raw_data
# |-- wav_data
# |-- spg_data
# |   |-- test
# |   |   |-- clone
# |   |   |   |-- s9_f1.png
# |   |   |   |-- s9_f2.png 
# |   |   |   |-- ... 
# |   |   |   |-- s10_f1.png 
# |   |   |   |-- s10_f2.png 
# |   |   |   |-- ... 
# |   |   |-- natur
# |   |   |   |-- s9_f1.png
# |   |   |   |-- s9_f2.png 
# |   |   |   |-- ... 
# |   |   |   |-- s10_f1.png 
# |   |   |   |-- s10_f2.png 
# |   |   |   |-- ... 
# |   |-- train
# |   |   |-- clone (clips for s1-s7)
# |   |   |-- natur (clips for s1-s7)
# |   |-- val
# |   |   |-- clone (clips for s8)
# |   |   |-- natur (clips for s8)

In [13]:
# imports
import librosa
from PIL import Image
import numpy as np
import os

import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

In [8]:
# organize directories ----------------------------------------------------
# every path below is anchored at the current working directory
cwd = os.getcwd()

# fricative waveform files for all sentences and speakers, one dir per split
wav_train_dir = f'{cwd}/wav_data/train/'
wav_val_dir = f'{cwd}/wav_data/val/'
wav_test_dir = f'{cwd}/wav_data/test/'

# fricative spectrogram images for all sentences and speakers, one dir per
# split, mirroring the wav_data layout
spg_train_dir = f'{cwd}/spg_data/train/'
spg_val_dir = f'{cwd}/spg_data/val/'
spg_test_dir = f'{cwd}/spg_data/test/'

In [None]:
# helper functions --------------------------------------------------------
def normalize_to_img(spect):
  '''
  this function min-max normalizes all values in spectrogram to 0-255
    INPUTS
      spect   : spectrogram (non-empty numpy array)
    OUTPUTS
      normalized spect as a float array of the same shape, where the
      smallest input value maps to 0 and the largest to 255; a constant
      input (max == min) maps to all zeros
  '''
  lo = spect.min()
  span = spect.max() - lo

  # a flat spectrogram (e.g. digital silence) has zero range; dividing by
  # it would give NaN and garbage pixels after a later uint8 cast
  if span == 0:
    return np.zeros_like(spect, dtype=float)

  return (spect - lo) / span * 255


# TODO: check if this spectrogram output is as expected
# TODO: learn about how to choose spectrogram parameters
# TODO: tangential to parameters, make sure that short waveforms will not
#       affect spectrogram quality, or at least how do we choose params
#       to properly scale the resolution of the spectrogram
# TODO: check what librosa melspectrogram output is
# TODO: look into if it is possible to save librosa.display output directly
#       as image, in my initial search, this was not obvious
# TODO: look into torchvision.transforms.spectrograms, might be better to
#       just use this because I think it saves as tensor (is a tensor 
#       what we want essentially?)
def get_spectrograms(wav_dir, spg_dir):
  '''
  this function creates a mel spectrogram in db for every wav file in a
  folder and saves each one as a png image
    INPUTS
      wav_dir   : directory name of waveforms
      spg_dir   : directory name of where to store spectrograms
                  (created if it does not exist yet)
    OUTPUTS
      none (writes <name>.png into spg_dir for each <name>.wav in wav_dir)
  '''
  os.makedirs(spg_dir, exist_ok=True)

  # sorted for a reproducible processing order; os.listdir order is arbitrary
  for file in sorted(os.listdir(wav_dir)):
    stem, ext = os.path.splitext(file)
    # skip anything that is not a wav clip (.DS_Store, checkpoints, ...)
    if ext.lower() != '.wav':
      continue

    # load wav file, resampled to 22050 Hz
    y, sr = librosa.load(os.path.join(wav_dir, file), sr=22050)

    # convert to melspg in db
    # melspectrogram returns a power spectrogram by default (power=2.0), so
    # the matching conversion is power_to_db, not amplitude_to_db
    spect = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=512)
    spect = librosa.power_to_db(spect, ref=np.max)

    # normalize to pixel val 0-255 and typecast to uint8
    spect_img = normalize_to_img(spect).astype(np.uint8)

    # flip only the frequency axis (rows) so low frequencies end up at the
    # bottom of the image; flipping every axis would also reverse time
    spect_img = np.flip(spect_img, axis=0)

    # invert pixels s.t. more energy is represented by darker pixels
    spect_img = 255 - spect_img

    # create image and save under the same base filename
    # splitext keeps dots inside the name, e.g. 's1.f2.wav' -> 's1.f2.png'
    im = Image.fromarray(spect_img)
    im.save(os.path.join(spg_dir, stem + '.png'))

In [11]:
# get spg for all wav -----------------------------------------------------
# walk the splits in the order test -> train -> val, and within each split
# convert the cloned-voice clips before the natural-voice clips
for wav_split_dir, spg_split_dir in (
    (wav_test_dir, spg_test_dir),
    (wav_train_dir, spg_train_dir),
    (wav_val_dir, spg_val_dir),
):
  for label_dir in ('clone/', 'natur/'):
    get_spectrograms(wav_split_dir + label_dir, spg_split_dir + label_dir)

In [14]:
# create datasets and dataloaders -----------------------------------------
# ImageFolder labels every image by the class subfolder it sits in
# (clone/ and natur/ under each split directory)
train_ds = datasets.ImageFolder(root=spg_train_dir,
                                transform=transforms.ToTensor())
# shuffle every epoch: ImageFolder lists samples class by class, so without
# shuffling almost every training batch would contain a single class
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)

val_ds = datasets.ImageFolder(root=spg_val_dir,
                              transform=transforms.ToTensor())
# validation is left unshuffled so its iteration order stays deterministic
val_loader = DataLoader(val_ds, batch_size=64)

# NOTE(review): clips of different lengths presumably produce images of
#               different widths, which the default batch collation cannot
#               stack - confirm whether a resize/pad transform is needed