In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import numpy as np
import pandas as pd

import librosa
import IPython.display as ipd

from afritts_text_normalizer import tts_cleaner

In [3]:
dir_path = os.getcwd()

In [4]:
train = pd.read_csv(os.path.join(dir_path, "..", "data/intron-tts-train-public-28565.csv"))
dev = pd.read_csv(os.path.join(dir_path, "..", "data/intron-tts-dev-public-3330.csv"))
test = pd.read_csv(os.path.join(dir_path, "..", "data/intron-tts-test-public-4161.csv"))

data = pd.concat([train, dev, test])


In [17]:
not_exist = []
for i, item in data.iterrows():
    src = os.path.join(dir_path, "..", "afrispeech_16k_trimmed",
                       item.audio_paths[1:])
    if not os.path.exists(src):
        not_exist.append(item.audio_paths)

In [18]:
data = data[~data.audio_paths.isin(not_exist)]

In [19]:
len(not_exist)

14

In [20]:
# remove 3 missing files
data = data[data.audio_ids != "24bfceabfa102cc1c1926d2049f56bbf"]
data = data[data.audio_ids != "790cdbb7907d0112bc737f0b06282dde"]
data = data[data.audio_ids != "4bffcdb03a445616d70eb1a859ac52a5"]

# Remove utterances with long duration, mostly noisy
data = data[data.duration <= 50.0].copy()

In [21]:
train.user_ids.nunique(), dev.user_ids.nunique(), test.user_ids.nunique()

(600, 76, 75)

In [22]:
data.user_ids.nunique(), data.duration.sum()/3600  # 751 speaker, 135 hours

(751, 133.57556944444445)

In [23]:
# data[data.duration > 50.0].duration.sum() / 3600

In [24]:
len(data), data.audio_ids.nunique()

(35992, 35992)

In [12]:
# percentage of utterances with numbers

import re
def has_numbers(inputString):
    return bool(re.search(r'\d', inputString))

data["is_alnum"] = data.transcript.apply(lambda x: has_numbers(x))

In [13]:
data[data.is_alnum == True].duration.sum() / 3600 # 42 hours contain numbers, ~ 40%

41.340895

In [25]:
split = data # train, dev, test
speakers = pd.pivot_table(data, values="duration",
              columns=["user_ids"], aggfunc="sum").T.reset_index()

speakers["duration"] = speakers["duration"]# /3600

speakers = speakers.sort_values("duration", ascending=False).reset_index(drop=True)
speakers.head(20)

Unnamed: 0,user_ids,duration
0,2db35217e91d8889c6f6f4bdee66a52f,5002.34
1,73f971bbef38880a86ac97680ab5a7f8,4929.179
2,6b5cd67595796bf5489893b69755f43f,4834.86
3,1269ac57cb62784a2abeec0b904ed615,4796.768
4,b883f5d99b7dfc05fc7bbeca0f926e9f,4675.044
5,f5a8238265acd295b474e3b67cde1dd5,4665.078
6,96ca0f339408b47bbed4b4ed6fdc4460,4570.077
7,a35a75f5541bce9a15063660e1028215,4506.899
8,2a477d9f7d285736be740fe77180b9c0,4197.769
9,bef1fe43fb65dfc614c001baeef2a381,4123.003


In [26]:
speakers.tail(10)

Unnamed: 0,user_ids,duration
741,b545a4ca235a7b72688a1c0b3eb6bde6,8.181
742,2493cba45cdf00e461b67e78d518a910,7.706
743,633ea93e9a3eb22f92e6c522c11f1f52,7.559
744,df4a2d9487db6e98bf0d8825220f9bf4,7.391
745,ce339a60ecffc5a72db4bd2489d206cc,7.379
746,1a1b8d8aace7c96a29d6b32d74ed90fd,7.311
747,2bb382f192f6a57300bdb0ef6fbd8ad3,7.077
748,eb5df6c57ad707440696d8478e655d78,6.603
749,c73de2fe9d445a8d192c5aac49a0e0bb,5.773
750,6b76a3d99dd1fe093782ef197cb92816,5.035


In [27]:
speakers_dict = {}
for i, item in speakers.iterrows():
    speakers_dict[item.user_ids] = item.duration

In [28]:
accents = pd.pivot_table(data, values="duration",
              columns=["accent"], aggfunc="sum").T.reset_index()
accents["duration"] = accents["duration"]
accents = accents.sort_values("duration", ascending=False).reset_index(drop=True)

In [29]:
accents.head(10) # don't forget to drop unknown accent

Unnamed: 0,accent,duration
0,Yoruba,113126.218
1,Hausa,110536.472
2,Unknown,36896.805
3,Swahili,36781.011
4,Igbo,28389.805
5,Zulu,23152.075
6,Ijaw,17126.945
7,Idoma,10673.228
8,Igala,9229.985
9,Afrikaans,7482.533


In [30]:
accents.tail(20)

Unnamed: 0,accent,duration
60,Bajju,182.389
61,Tula,167.667
62,Dera,166.638
63,Chichewa,149.858
64,Tsonga,124.148
65,Ogbia,78.476
66,Swati,77.642
67,Otjiherero,61.7
68,Ikulu,54.912
69,Lunguda,42.362


In [31]:
split = data # train, dev, test
speakers2 = pd.pivot_table(data, values="duration",
              columns=["country", "accent", "gender"], aggfunc="sum").T.reset_index()

speakers2["duration"] = speakers2["duration"]# /3600

speakers2 = speakers2.sort_values("duration", ascending=False).reset_index(drop=True)
speakers2.head(20)

Unnamed: 0,country,accent,gender,duration
0,NG,Yoruba,Male,63328.08
1,NG,Hausa,Male,57076.143
2,NG,Hausa,Female,53003.214
3,NG,Yoruba,Female,49789.957
4,KE,Swahili,Female,23695.523
5,ZA,Zulu,Female,22761.593
6,NG,Igbo,Female,17824.062
7,NG,Ijaw,Male,11829.03
8,KE,Unknown,Female,10855.926
9,NG,Igbo,Male,10565.743


In [32]:
speakers2.tail(20)

Unnamed: 0,country,accent,gender,duration
101,NG,Ikulu,Female,54.912
102,NG,Pidgin,Female,53.226
103,KE,Kikuyu,Female,51.627
104,UG,Luganda,Female,42.84
105,NG,Lunguda,Male,42.362
106,NG,Etsako,Female,34.521
107,NG,Ogba,Female,34.218
108,NG,"Hausa, Fulani",Male,33.247
109,NG,Anaang,Male,31.14
110,ZA,Portuguese,Male,30.236


In [33]:
# Test set: 4 speakers with 4 different accents
test_data_unseen = data[data.accent.isin(["Swati", "Portuguese", "Ogbia", "Lunguda"])]

In [34]:
len(test_data_unseen)

16

In [35]:
data = data[~data.accent.isin(["Swati", "Portuguese", "Ogbia", "Lunguda"])]

data["user_ids_num"] = pd.factorize(data["user_ids"])[0]

    A value is trying to be set on a copy of a slice from a DataFrame.
    Try using .loc[row_indexer,col_indexer] = value instead
    
    See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
      data["user_ids_num"] = pd.factorize(data["user_ids"])[0]
    


In [36]:
# select 2 utterances per user_id as test set
from collections import defaultdict

# shuffle
data_sh = data.sample(frac=1, random_state=0).reset_index(drop=True)

user_id_map = defaultdict(int)
test_data_seen = []
train_val_data_seen = []

for i, item in data_sh.iterrows():
    
    # Only test on speaker with at least 5 mins data
    if speakers_dict[item.user_ids] <= 300 or user_id_map[item.user_ids] == 2:
        train_val_data_seen.append(item)
        continue
    
    # if item.duration >= 1.0: # speaker embedding need 1 sec for extraction.
    test_data_seen.append(item)
    user_id_map[item.user_ids]+=1
    
test_data_seen = pd.DataFrame(test_data_seen)
train_val_data_seen = pd.DataFrame(train_val_data_seen)

In [37]:
len(test_data_seen), len(train_val_data_seen), len(test_data_seen)+len(train_val_data_seen), len(data_sh)


(646, 35330, 35976, 35976)

In [38]:
train_val_data_seen = train_val_data_seen.sample(frac=1, random_state=0).reset_index(drop=True)
val = train_val_data_seen[0: 200]
train = train_val_data_seen[200: ]

In [39]:
# test_data_unseen.to_csv("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/data/afritts-test-unseen.csv", index=False)

# train.to_csv("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/data/afritts-train.csv", index=False)
# val.to_csv("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/data/afritts-dev.csv", index=False)

# test_data_seen.to_csv("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/data/afritts-test-seen.csv", index=False)


Normalize sentences


In [40]:
test_data_unseen["transcript"] = test_data_unseen["transcript"].apply(tts_cleaner)

train["transcript"] = train["transcript"].apply(tts_cleaner)
val["transcript"] = val["transcript"].apply(tts_cleaner)
test_data_seen["transcript"] = test_data_seen["transcript"].apply(tts_cleaner)

    A value is trying to be set on a copy of a slice from a DataFrame.
    Try using .loc[row_indexer,col_indexer] = value instead
    
    See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
      test_data_unseen["transcript"] = test_data_unseen["transcript"].apply(tts_cleaner)
    
    A value is trying to be set on a copy of a slice from a DataFrame.
    Try using .loc[row_indexer,col_indexer] = value instead
    
    See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
      train["transcript"] = train["transcript"].apply(tts_cleaner)
    
    A value is trying to be set on a copy of a slice from a DataFrame.
    Try using .loc[row_indexer,col_indexer] = value instead
    
    See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-co

In [41]:
test_data_unseen.to_csv("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/data/afritts-test-unseen-clean.csv", index=False)

train.to_csv("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/data/afritts-train-clean.csv", index=False)
val.to_csv("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/data/afritts-dev-clean.csv", index=False)

test_data_seen.to_csv("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/data/afritts-test-seen-clean.csv", index=False)

In [26]:
import pandas as pd

In [39]:
test_data_unseen = pd.read_csv("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/data/afritts-test-unseen-clean.csv",)

train = pd.read_csv("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/data/afritts-train-clean.csv",)
val = pd.read_csv("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/data/afritts-dev-clean.csv",)

test_data_seen = pd.read_csv("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/data/afritts-test-seen-clean.csv",)


In [40]:
data = data[["audio_ids", "transcript"]].copy()

In [41]:
test_data_unseen = pd.merge(test_data_unseen, data, on="audio_ids", how="left")
train = pd.merge(train, data, on="audio_ids", how="left")
val = pd.merge(val, data, on="audio_ids", how="left")
test_data_seen = pd.merge(test_data_seen, data, on="audio_ids", how="left")

In [42]:
test_data_seen

Unnamed: 0,idx,user_ids,accent,age_group,gender,country,transcript_x,nchars,audio_ids,audio_paths,duration,neg_percent,origin,domain,split,expand_puncts,user_ids_num,transcript_y
0,31519,78687bf17a592e56cf155c492ffa0603,Yoruba,19-25,Female,NG,"ijeoma mister onyechere, bassir close bracket ...",92,2e96d70e5b89f5d5d02a1d051e570b43,/AfriSpeech-TTS-D/train/1d98be50-152f-42d5-940...,13.035,1.00,nigerian,general,train,True,438,"Ijeoma Mr Onyechere, Bassir (creator of the Si..."
1,29584,f511698f6e5c249e043cba47cfbf9836,Yoruba,26-40,Female,NG,uids are often referred to by the placeholder ...,102,9afae62b4cb4aa1eb2dde535121f21cc,/AfriSpeech-TTS-D/train/9ab16298-9988-4f34-9f3...,11.357,0.25,nigerian,general,train,False,428,UIDs are often referred to by the placeholder ...
2,11000,78c0395b699c88308f6ccb421e14dd11,Hausa,26-40,Female,NG,mister oumar strongly supported sreejith's sug...,120,71896626cc2767f91501879acc38ddd3,/AfriSpeech-TTS-D/train/dbb2d034-3ebf-431a-992...,9.800,0.18,nigerian,general,train,False,196,Mr. Oumar strongly supported Sreejith's sugges...
3,6814,8834b9c6201b1afedbac3d297b91d211,Zulu,41-55,Female,ZA,onalenna was beheaded both kinteh and nur.,42,ce72a67d7c0457624607f1ad5af12682,/AfriSpeech-TTS-D/train/91ccb275-4994-4a03-970...,4.683,0.07,african,general,train,False,75,Onalenna was beheaded both Kinteh and Nur.
4,11444,95ed6e83a9e3c244b34e825c5ffe5864,Hausa,19-25,Male,NG,"omidina general oladeji, seun phirman, phil pl...",62,5ad0ab4cf145c1e9da214062307e3dde,/AfriSpeech-TTS-D/train/e2c8229f-3579-40c7-9ed...,7.918,0.11,nigerian,general,train,False,170,"Omidina Gen Oladeji, Seun Phirman, Phil Plait,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,34224,163047be637e8eb0071ac3d759671479,Nupe,19-25,Female,NG,"it features music by lenrie, lyrics by iregbu,...",70,4a734fd04104fd1afa4d9a0fd245b458,/AfriSpeech-TTS-D/train/adee5a25-05ec-489a-bd7...,9.116,0.00,nigerian,general,train,False,549,"It features music by Lenrie, lyrics by Iregbu,..."
642,16351,55da4a6bab34307f12a5222568871693,Hausa,19-25,Male,NG,yi participated in general onyechere cho's cou...,163,84e708ec752aaa366c91100f6ce870f1,/AfriSpeech-TTS-D/train/493df7cc-0c3c-4481-b7b...,18.464,0.33,nigerian,general,train,False,238,Yi participated in Gen Onyechere Cho's coup to...
643,28068,04d62d0186b620b6d2461e8587d2b1b2,Yoruba,19-25,Male,NG,"hill honorable olopade, engineer danchimah, mi...",53,16bfb61d5e77f404e6034861999e30e3,/AfriSpeech-TTS-D/train/3904d863-1150-4ee4-90f...,4.389,1.00,nigerian,general,train,False,473,"Hill Hon Olopade, Engr Danchimah, Mikel Tony, ..."
644,14097,55da4a6bab34307f12a5222568871693,Hausa,19-25,Male,NG,engineer mostafa hnu has ten avatars colon al ...,154,c80d704e2b21307005cfc1c8e254d64a,/AfriSpeech-TTS-D/train/498d539e-cca1-4c0e-b05...,21.726,0.33,nigerian,general,train,True,238,"Engr Mostafa hnu has 10 avatars: Al Asi fish),..."


In [43]:
del test_data_unseen["transcript_x"]
del train["transcript_x"]
del val["transcript_x"]
del test_data_seen["transcript_x"]

In [44]:
test_data_unseen.rename(columns={'transcript_y': 'transcript'}, inplace=True)

train.rename(columns={'transcript_y': 'transcript'}, inplace=True)
val.rename(columns={'transcript_y': 'transcript'}, inplace=True)
test_data_seen.rename(columns={'transcript_y': 'transcript'}, inplace=True)

In [45]:
test_data_unseen.to_csv("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/data/afritts-test-unseen.csv", index=False)

train.to_csv("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/data/afritts-train.csv", index=False)
val.to_csv("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/data/afritts-dev.csv", index=False)

test_data_seen.to_csv("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/data/afritts-test-seen.csv", index=False)

In [None]:
# data.to_csv("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/data/temp/intron-tts-all-public-0000", index=False)


In [44]:
with open("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/src/vits/filelists/afritts_train.txt", "w+") as f4:
    for i, item in train.iterrows():
        print(f"DUMMY4/{item.audio_paths[1:]}|{item.user_ids_num}|{item.transcript}", file=f4)
        

In [45]:
with open("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/src/vits/filelists/afritts_dev.txt", "w+") as f4:
    for i, item in val.iterrows():
        print(f"DUMMY4/{item.audio_paths[1:]}|{item.user_ids_num}|{item.transcript}", file=f4)

In [46]:
with open("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/src/vits/filelists/afritts_test_seen.txt", "w+") as f4:
    for i, item in test_data_seen.iterrows():
        print(f"DUMMY4/{item.audio_paths[1:]}|{item.user_ids_num}|{item.transcript}", file=f4)

In [47]:
train.user_ids_num.values.max()

746

In [48]:
train["user_ids_num"] = train["user_ids_num"].apply(lambda x: x+109)

with open("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/src/vits/filelists/afritts_vctk_train.txt", "w+") as f4:
    for i, item in train.iterrows():
        print(f"DUMMY4/{item.audio_paths[1:]}|{item.user_ids_num}|{item.transcript}", file=f4)

    A value is trying to be set on a copy of a slice from a DataFrame.
    Try using .loc[row_indexer,col_indexer] = value instead
    
    See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
      train["user_ids_num"] = train["user_ids_num"].apply(lambda x: x+109)
    


In [49]:
val["user_ids_num"] = val["user_ids_num"].apply(lambda x: x+109)

with open("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/src/vits/filelists/afritts_vctk_dev.txt", "w+") as f4:
    for i, item in val.iterrows():
        print(f"DUMMY4/{item.audio_paths[1:]}|{item.user_ids_num}|{item.transcript}", file=f4)

    A value is trying to be set on a copy of a slice from a DataFrame.
    Try using .loc[row_indexer,col_indexer] = value instead
    
    See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
      val["user_ids_num"] = val["user_ids_num"].apply(lambda x: x+109)
    


In [50]:
test_data_seen["user_ids_num"] = test_data_seen["user_ids_num"].apply(lambda x: x+109)

with open("/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/src/vits/filelists/afritts_vctk_test_seen.txt", "w+") as f4:
    for i, item in test_data_seen.iterrows():
        print(f"DUMMY4/{item.audio_paths[1:]}|{item.user_ids_num}|{item.transcript}", file=f4)
        

In [51]:
train.user_ids_num.values.max()

855

In [None]:
# select 1 sentence per accent per region for objective evaluation



In [None]:
# Test set: 10 speakers seen in training

In [None]:
speakers.tail(20)

In [None]:
import librosa
import IPython.display as ipd

In [None]:
data[0:1].audio_paths.values.item()

In [None]:
data[data.audio_paths == '/AfriSpeech-TTS/train/9b1a7865-4b74-486e-8223-9117e2ea592a/9b1f7d24a96824967ed3a1ae5d3c44cd_OWVB6wln.wav']

In [None]:
'/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/../AfriSpeech-TTS/train/9b1a7865-4b74-486e-8223-9117e2ea592a/9b1f7d24a96824967ed3a1ae5d3c44cd_OWVB6wln.wav'


In [None]:
filepath = '/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/afrispeech_16k/1e759a52b9a7588771a8d281dd480fce.wav'

audio, sr = librosa.load(filepath, sr=16000)

ipd.Audio(audio, rate=sr)

In [None]:
filepath = '/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/train/9b1a7865-4b74-486e-8223-9117e2ea592a/9b1f7d24a96824967ed3a1ae5d3c44cd_OWVB6wln.wav'

audio, sr = librosa.load(filepath, sr=16000)

ipd.Audio(audio, rate=sr)

In [None]:
audio.shape

In [None]:
sr

In [None]:
from matplotlib import pyplot as plt
from matplotlib.pyplot import imshow
import librosa.display

# parameters
n_fft=1024
hop_length=256
window="hann"
# sr=16000
n_mels=80
fmin=0
fmax=8000
win_length=n_fft
power=1.0
n_iter=64

def plot_spec(spec, aspect=2):
    if spec is not None:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        im = plt.imshow(spec, origin="lower",)
        cbar = plt.colorbar(im, ax=ax)
        cbar.set_label("Colorbar")
        ax.set_aspect(aspect)
        plt.show()

In [None]:
audio.shape

In [None]:
filepath = '/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/b53792d173f1ac743819e81f240a3a07_ooKIycx6.wav'

audio, sr = librosa.load(filepath, sr=16000)

ref_mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, S=None, n_fft=n_fft, 
                                          hop_length=hop_length, 
                                          win_length=win_length, 
                                          window=window, power=power,
                                          n_mels=n_mels, fmin=fmin, fmax=fmax,)
# vocoder expects a log mel-spectrogram
eps = 1e-9
ref_log_mel = np.log(ref_mel_spec + eps)

In [None]:
ipd.Audio(audio, rate=sr)

In [None]:
plot_spec(ref_log_mel,)

In [None]:
# denoised
filepath = '/srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/b53792d173f1ac743819e81f240a3a07_ooKIycx6_dn.wav'

audio_dn, sr = librosa.load(filepath)

dn_mel_spec = librosa.feature.melspectrogram(y=audio_dn, sr=sr, S=None, n_fft=n_fft, 
                                          hop_length=hop_length, 
                                          win_length=win_length, 
                                          window=window, power=power,
                                          n_mels=n_mels, fmin=fmin, fmax=fmax,)
# vocoder expects a log mel-spectrogram
eps = 1e-9
dn_log_mel = np.log(dn_mel_spec + eps)

In [None]:
ipd.Audio(audio, rate=sr)

In [None]:
plot_spec(dn_log_mel,2.5)