In [51]:
import os
import pandas as pd
import tempfile
import numpy as np
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import librosa 
from tqdm import tqdm
import multiprocessing

In [8]:

# Dataset locations: a root directory, its "clips" sub-directory (joined with each
# clip's file name when audio is loaded further down), and the TSV metadata file.
# NOTE(review): temp_dir is not referenced by any other visible cell.
temp_dir = tempfile.gettempdir()
# common_voice_path=os.path.abspath("../transaccent/data/en")
common_voice_path=os.path.abspath("/data")
audio_files_path = os.path.join(common_voice_path, "clips")
data_file_filename = "validated.tsv"
data_file_path = os.path.join(common_voice_path, data_file_filename)

# Retrieve records (the metadata file is tab-separated)
df = pd.read_csv(data_file_path, sep='\t', low_memory=False)
df.head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent
0,0013037a1d45cc33460806cc3f8ecee9d536c45639ba4c...,common_voice_en_699711.mp3,She'll be all right.,2,1,,,
1,001509f4624a7dee75247f6a8b642c4a0d09f8be3eeea6...,common_voice_en_18132047.mp3,All's well that ends well.,2,0,,,
2,003fb666a99eb3aa3ba05d9c8641c18e55cf7d34d1b981...,common_voice_en_17263741.mp3,Do you mean it?,2,0,,,
3,004017ba82a23768d58dff3b91da8e8f951ea5fb6d3cd9...,common_voice_en_17893917.mp3,The new patch is less invasive than the old on...,2,1,,,
4,0047f1aea3f39c4c6a9298d84f046c1f84f439f594d840...,common_voice_en_17561821.mp3,How is Mozilla going to handle ambiguities lik...,2,0,,,


In [9]:
# Keep only rows with fewer than one down-vote, more than one up-vote and gender 'male'.
# NOTE: this rebinds `df`, so the unfiltered frame is no longer reachable under that name.
df = df[(df['down_votes'] < 1) & (df['gender'] == 'male') & (df['up_votes'] > 1)]
df.head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent
9,00c3f0e7c691ef30257d1bfa9adc410535b7ba3f48e344...,common_voice_en_18295850.mp3,The long-lived bridge still stands today.,2,0,twenties,male,
70,05ba9bb1a4ac391849fa4461547967768f4d7df8ee52d7...,common_voice_en_19967535.mp3,The cemetery is now managed by three trusts.,2,0,fifties,male,african
81,06546553aed17027b4e638d4afb56f39b216026088cf40...,common_voice_en_17147389.mp3,Women form less than half of the group.,2,0,twenties,male,us
100,0838a82655be5a61349c2d2d86b60c22b5b84fea9826cb...,common_voice_en_18127728.mp3,Sunburn can be avoided by applying sunscreen o...,2,0,twenties,male,
104,0899979e8d43a9faf448ddb5f4fc9a38a0fb4c120eaf34...,common_voice_en_17850951.mp3,Still waters run deep.,2,0,twenties,male,other


In [12]:
from data_util import build_speaker_dataset

In [13]:
# Build the Indian-accent subset with the project helper from data_util.
# NOTE(review): the meaning of the second argument (1) is defined in data_util and is
# not visible here - presumably a number of speakers; confirm.
indian_df = build_speaker_dataset(df[(df['accent'] == 'indian') ], 1)
indian_df.head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent
720300,d0b62ac1e57e267f16088ee614cdaf09a9826c3b704ce2...,common_voice_en_17246192.mp3,How do you know that?,2,0,twenties,male,indian
720301,d0b62ac1e57e267f16088ee614cdaf09a9826c3b704ce2...,common_voice_en_17246193.mp3,Where did it come from then?,3,0,twenties,male,indian
720302,d0b62ac1e57e267f16088ee614cdaf09a9826c3b704ce2...,common_voice_en_17246200.mp3,"Look, the seam is now broken, it couldn't stan...",2,0,twenties,male,indian
720303,d0b62ac1e57e267f16088ee614cdaf09a9826c3b704ce2...,common_voice_en_17246201.mp3,What did he do?,2,0,twenties,male,indian
720304,d0b62ac1e57e267f16088ee614cdaf09a9826c3b704ce2...,common_voice_en_17246202.mp3,"According to the dictionary, the word ""gizmo"" ...",2,0,twenties,male,indian


In [14]:
# Build the US-accent subset with the project helper from data_util.
# NOTE(review): the meaning of the second argument (1) is defined in data_util and is
# not visible here - presumably a number of speakers; confirm.
us_df = build_speaker_dataset(df[df['accent'] == 'us'], 1)
us_df.head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent
837785,b419faab633f2099c6405ff157b4d9fb5675219570f268...,common_voice_en_84125.mp3,I said I was trying to catch one.,2,0,thirties,male,us
837786,b419faab633f2099c6405ff157b4d9fb5675219570f268...,common_voice_en_84126.mp3,Judge may not think so.,2,0,thirties,male,us
837787,b419faab633f2099c6405ff157b4d9fb5675219570f268...,common_voice_en_84127.mp3,"Don‘t fight fire with fire, it would not help.",3,0,thirties,male,us
837788,b419faab633f2099c6405ff157b4d9fb5675219570f268...,common_voice_en_84128.mp3,But he'd have to.,3,0,thirties,male,us
837789,b419faab633f2099c6405ff157b4d9fb5675219570f268...,common_voice_en_84129.mp3,Whose turn is it?,2,0,thirties,male,us


In [15]:
# Report how many rows each accent subset contains.
print(f"Retrieved {len(indian_df)} records with indian accents and {len(us_df)} records with us accents")

Retrieved 576 records with indian accents and 7827 records with us accents


In [41]:
# NOTE(review): the helper functions defined in the later cells read their settings from
# the `hparams` object, not from these module-level names - confirm these are still needed.
hop=192               # hop size (window size = 6*hop)
sr=16000              # sampling rate
min_level_db=-100     # reference values to normalize data
ref_level_db=20

shape=24              # length of time axis of split spectrograms to feed to generator
vec_len=128           # length of vector generated by siamese vector
# bs = 16               #batch size
bs = 32               # batch size
delta = 2.            # constant for siamese loss

In [42]:
class HParams:
    """Attribute-style container: each keyword argument becomes an instance attribute."""

    def __init__(self, **entries):
        # Copy the keyword arguments straight into the instance namespace.
        vars(self).update(entries)

In [43]:
from yaml import load, dump
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper

# Load the hyper-parameters from hparams.yaml and expose them as attributes.
# NOTE(review): Loader/CLoader can construct arbitrary Python objects; that is acceptable
# for a trusted local file, but switch to SafeLoader if this file could ever come from
# an untrusted source.
with open("hparams.yaml", "r") as fp:
    hparams_yaml = load(fp, Loader=Loader)

hparams = HParams(**hparams_yaml)
# Derived settings: FFT size and window length are both six hops.
hparams.n_fft = 6 * hparams.hop
hparams.win_length = 6 * hparams.hop
hparams.max_spec_length = 10 * hparams.shape 
# Length in samples that shorter clips are zero-padded up to.
# NOTE(review): 256 * 94 is a magic number whose derivation is not shown here.
hparams.max_audio_samples = 256 * 94

# Echo the settings used by the processing functions as a sanity check.
print(hparams.n_fft)
print(hparams.trim_top_db)
print(hparams.win_length)
print(hparams.max_spec_length)
print(hparams.max_audio_samples)

1152
40
1152
240
24064


In [44]:
def pad_audio_with_silence(wav, pad_value, hparams):
    """Right-pad ``wav`` with ``pad_value`` up to ``hparams.max_audio_samples`` samples.

    Input that is already at least that long is returned unchanged (it is never
    truncated).
    """
    shortfall = hparams.max_audio_samples - len(wav)
    if shortfall <= 0:
        # Nothing to pad: hand back the very same object.
        return wav

    return np.pad(wav, (0, shortfall), mode='constant', constant_values=pad_value)

def trim_silence(wav, hparams):
    """Run ``librosa.effects.trim`` on ``wav`` and return only the trimmed signal.

    The threshold, frame length and hop length come from ``hparams.trim_top_db``,
    ``hparams.trim_fft_size`` and ``hparams.trim_hop_size``.
    """
    trimmed, _interval = librosa.effects.trim(
        wav,
        top_db=hparams.trim_top_db,
        frame_length=hparams.trim_fft_size,
        hop_length=hparams.trim_hop_size,
    )
    # The second element (the kept sample interval) is not needed by callers.
    return trimmed

def load_audio(audio_file_name, hparams):
    """Load an audio file with librosa at ``hparams.sample_rate`` and return the waveform.

    Args:
        audio_file_name: path of the audio file to read.
        hparams: settings object; only ``sample_rate`` is read here.

    Returns:
        The waveform returned by ``librosa.load`` (the sample rate it also returns
        is discarded).
    """
    import warnings

    # Loading mp3 files throws a warning for every file; given the volume of files we
    # ignore it. The suppression is scoped to this call so that unrelated warnings
    # raised later in the same process are still shown.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        y, _ = librosa.load(audio_file_name, sr=hparams.sample_rate)
    return y

def convert_mp3_to_signal(audio_file_name, hparams):
    """Load one audio file and return its waveform, optionally trimmed, then zero-padded.

    Trimming only happens when ``hparams.trim_silence`` is truthy. Padding is an
    attempt to bring all clips to the same length.
    """
    signal = load_audio(audio_file_name, hparams)
    signal = trim_silence(signal, hparams) if hparams.trim_silence else signal
    return pad_audio_with_silence(signal, pad_value=0., hparams=hparams)

def process_audio(audio_path, hparams):
    """Convert the clip ``audio_path`` (relative to ``audio_files_path``) to a numpy array."""
    full_path = os.path.join(audio_files_path, audio_path)
    signal = convert_mp3_to_signal(full_path, hparams)
    return np.array(signal)

def audio_array_for_accent(df, accent, hparams, limit_size=-1, tqdm=lambda x: x):
    """Load and pre-process, in parallel, the audio of every row of ``df`` with ``accent``.

    Args:
        df: frame with at least the columns ``accent`` and ``path``.
        accent: value of the ``accent`` column to select.
        hparams: settings object forwarded to ``process_audio``.
        limit_size: when > 0, only the first ``limit_size`` selected files are used.
        tqdm: optional progress wrapper applied to the list of pending results
            (identity by default).

    Returns:
        ``np.array`` built from the per-file results, skipping any ``None`` result.
        NOTE(review): padding only lengthens short clips, so clips longer than
        ``hparams.max_audio_samples`` keep their own length and the results may not
        all share one shape - confirm downstream code copes with that.
    """
    input_mp3_files = df[df['accent'] == accent]['path'].to_numpy()

    if limit_size > 0:
        print(f"Retrieved {len(input_mp3_files)} will only be using {limit_size}")
        input_mp3_files = input_mp3_files[:limit_size]

    # The context manager shuts the worker processes down once all results are in
    # (or if a task raises), so repeated calls do not leak processes.
    with ProcessPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
        futures = [
            executor.submit(process_audio, filepath, hparams)
            for filepath in input_mp3_files
        ]
        # Collect inside the block so the progress wrapper tracks real completion.
        results = [future.result() for future in tqdm(futures)]

    return np.array([wav for wav in results if wav is not None])



In [45]:
def melspecfunc(waveform, hparams):
    """Compute a mel spectrogram of ``waveform`` with librosa, using settings from ``hparams``."""
    stft_settings = dict(
        sr=hparams.sample_rate,
        n_fft=hparams.n_fft,
        win_length=hparams.win_length,
        hop_length=hparams.hop,
    )
    return librosa.feature.melspectrogram(y=waveform, **stft_settings)

In [46]:
def normalize(S, hparams):
    """Map ``S`` linearly so ``hparams.min_level_db`` -> -1 and 0 -> 1, clipped to [-1, 1]."""
    floor_db = hparams.min_level_db
    unit_scaled = (S - floor_db) / -floor_db
    return np.clip(unit_scaled * 2. - 1., -1, 1)

def prep(wv, hparams):
    """Turn a waveform into a normalized dB-scaled mel spectrogram.

    Steps: mel spectrogram -> ``librosa.power_to_db`` -> subtract
    ``hparams.ref_level_db`` -> ``normalize``.
    """
    mel_power = np.array(melspecfunc(wv, hparams))
    mel_db = librosa.power_to_db(mel_power) - hparams.ref_level_db
    return normalize(mel_db, hparams)

In [52]:
def process_wav(wav, hparams):
    """Return the float32 spectrogram of ``wav`` with an extra trailing axis of size 1."""
    spec = np.array(prep(wav, hparams), dtype=np.float32)
    return spec[..., np.newaxis]

def tospec(wvs, hparams):
    """Convert every waveform in ``wvs`` to a spectrogram using a pool of worker processes.

    Args:
        wvs: iterable of waveforms, each passed to ``process_wav``.
        hparams: settings object forwarded to ``process_wav``.

    Returns:
        A list with one ``process_wav`` result per waveform, in input order.
    """
    # pool.map blocks until every result is available; leaving the with-block then
    # terminates the workers so repeated calls do not leak processes.
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        return pool.map(partial(process_wav, hparams=hparams), wvs)

In [56]:
def split_spec(spec, chunk_size):
    """Cut ``spec`` along its first axis into consecutive pieces of ``chunk_size`` items.

    The last piece is shorter when ``len(spec)`` is not a multiple of ``chunk_size``.
    """
    n_chunks = int(np.ceil(len(spec) / chunk_size))
    chunks = []
    for k in range(n_chunks):
        start = k * chunk_size
        chunks.append(spec[start:start + chunk_size])
    return chunks

def splitcut(data, hparams):
    """Split every spectrogram in ``data`` into chunks, in parallel, and stack the result.

    Args:
        data: iterable of spectrograms, each passed to ``split_spec``.
        hparams: settings object; ``max_spec_length`` is used as the chunk size.

    Returns:
        ``np.array`` built from the per-spectrogram chunk lists.

    NOTE(review): ``split_spec`` slices along the first axis of each spectrogram, and
    ``np.array`` needs every chunk to have the same shape. The recorded run of this
    notebook ends in "ValueError: could not broadcast input array from shape
    (128,126,1) into shape (128)", which looks like chunks of differing shapes -
    confirm which axis should be split and how ragged input should be handled.
    """
    chunk_size = hparams.max_spec_length  # max spectrogram length

    # pool.map blocks until all chunks are computed; leaving the with-block then
    # terminates the workers so repeated calls do not leak processes.
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        splits = pool.map(partial(split_spec, chunk_size=chunk_size), data)

    return np.array(splits)

In [57]:
# Values of the `accent` column used to select the source and target recordings.
input_accent = 'indian'
target_accent = 'us'

In [58]:
# Indian-accent pipeline: waveforms -> spectrograms -> chunked spectrograms.
# NOTE: the recorded output of this cell ends in a ValueError ("could not broadcast
# input array from shape (128,126,1) into shape (128)"), so it did not complete.
awv = audio_array_for_accent(indian_df, input_accent, hparams, limit_size=len(indian_df))
aspec = tospec(awv, hparams)
adata = splitcut(aspec, hparams)

Retrieved 576 will only be using 576


ValueError: could not broadcast input array from shape (128,126,1) into shape (128)

In [None]:
# US-accent pipeline: waveforms -> spectrograms -> chunked spectrograms
# (same steps as for the Indian-accent data).
bwv = audio_array_for_accent(us_df, target_accent, hparams, limit_size=len(us_df))
bspec = tospec(bwv, hparams)
bdata = splitcut(bspec, hparams)

In [None]:
# Output directories for the processed data, one per accent.
# NOTE(review): these are hard-coded absolute paths; nothing visible here creates them.
indian_dir = '/data/wav/v2/indian'
us_dir = '/data/wav/v2/us'

In [None]:
# Save the Indian-accent spectrograms (`spec`) and their chunked form (`data`) in one .npz archive.
np.savez(f'{indian_dir}/data.npz', spec=aspec, data=adata)

In [None]:
# Save the US-accent spectrograms (`spec`) and their chunked form (`data`) under us_dir.
# This used to write to indian_dir, which overwrote the Indian-accent archive with US data.
np.savez(f'{us_dir}/data.npz', spec=bspec, data=bdata)