# Testing bias

In [None]:
import pandas as pd
import librosa
import IPython.display as ipd

df = pd.read_csv('mp_styles_train.csv', sep=';')

In [None]:
df.head(3)

In [None]:
print(df['phonetic_transcription'].values[0])
ipd.Audio(df['wav_path'].values[0])

# Testing pitch change bias

In [None]:
n = -20
y, sr = librosa.load(df['wav_path'].values[n])
ipd.Audio(y, rate=sr)

In [None]:
ipd.Audio(y*10000.1, rate=sr)

In [None]:
# !pip install praat-parselmouth

In [None]:
import parselmouth
import numpy as np

snd = parselmouth.Sound(df['wav_path'].values[n])

In [None]:
# Pitch-shift experiment: estimate the pitch track of `snd` with Praat and
# resynthesise it through Praat's 'Change gender' command.

# Arguments passed to Praat's 'To Pitch' command below.
pitch_steps: float = 0.01
pitch_floor: float = 75
pitch_ceil: float = 600

## Customize
# Arguments passed to Praat's 'Change gender' command below.
formant_shift= 1.0
pitch_shift = 0.5
pitch_range = 1.
duration_factor = 1.

pitch = parselmouth.praat.call(
    snd, 'To Pitch', pitch_steps, pitch_floor, pitch_ceil)
ndpit = pitch.selected_array['frequency']
# Mask of frames with a non-zero frequency (treated as voiced here).
nonzero = ndpit > 1e-5
# NOTE(review): the all-unvoiced guard below is disabled because `return` is
# not valid at cell level; with no voiced frame, `.min()` on the empty
# selection raises ValueError.
# if nonzero.sum() == 0:
#     return snd.values[0]
# Median and minimum frequency over the voiced frames only.
median, minp = np.median(ndpit[nonzero]).item(), ndpit[nonzero].min().item()
# New median after the shift, and the value the current minimum maps to once
# the range factor is applied around that new median.
updated = median * pitch_shift
scaled = updated + (minp * pitch_shift - updated) * pitch_range
# for preventing infinite loop of `Change gender`
# ref:https://github.com/praat/praat/issues/1926
if scaled < 0.:
    pitch_range = 1.
# `out, =` unpacks exactly one row from `.values`
# (presumably a single-channel sound -- TODO confirm).
out, = parselmouth.praat.call(
    (snd, pitch), 'Change gender',
    formant_shift,
    median * pitch_shift,
    pitch_range,
    duration_factor).values

In [None]:
from IPython.display import Audio
Audio(out, rate = sr)

In [None]:
def compute_f0(wav_numpy, p_len=None, sampling_rate=44100,
    hop_length=512, voice_thresh = 0.3):
    """Estimate a frame-level F0 contour with Praat (cross-correlation).

    Args:
        wav_numpy: 1-D array of audio samples.
        p_len: Expected number of frames. Defaults to
            ``len(wav_numpy) // hop_length``; when given it must be within
            4 frames of that value.
        sampling_rate: Sampling rate of ``wav_numpy`` in Hz. It must match
            the rate the audio was actually loaded at, otherwise the
            returned frequencies are mis-scaled.
        hop_length: Hop between analysis frames, in samples.
        voice_thresh: Voicing threshold forwarded to Praat.

    Returns:
        Array of F0 values searched between 75 Hz and 1100 Hz. If Praat
        yields fewer than ``p_len`` frames the contour is zero-padded on
        both sides up to ``p_len``; otherwise it is returned as produced.

    Raises:
        AssertionError: If ``p_len`` is inconsistent with the audio length.
    """
    import parselmouth
    x = wav_numpy
    if p_len is None:
        p_len = x.shape[0] // hop_length
    else:
        assert abs(p_len - x.shape[0] // hop_length) < 4, "pad length error"

    # Praat expects the time step in seconds.
    time_step = hop_length / sampling_rate
    f0 = parselmouth.Sound(x, sampling_rate).to_pitch_cc(
        time_step=time_step, voicing_threshold=voice_thresh,
        pitch_floor=75, pitch_ceiling=1100).selected_array['frequency']

    # Split the missing frames between the left and right edge.
    pad_size = (p_len - len(f0) + 1) // 2
    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')
    return f0

## Let's compute the F0 range for Rosana and for Adriana (CAN x CPQD)

In [None]:
# Per-utterance F0 statistics (mean, standard deviation, median), stored as
# new columns on a copy of `df`.
f0_stats = df.copy()

means = []
stds = []
medians = []

for wav_path in df['wav_path'].values:
    y, sr = librosa.load(wav_path)
    # Extract the contour once per file and tell Praat the rate the audio was
    # actually loaded at; the compute_f0 default (44100 Hz) would otherwise
    # mis-scale every frequency.
    f0 = compute_f0(y, sampling_rate=sr)
    # NOTE(review): the statistics are taken over every frame of the contour,
    # including frames compute_f0 zero-pads -- confirm that zero-valued
    # frames should count towards mean/std/median.
    means.append(np.mean(f0))
    stds.append(np.std(f0))
    medians.append(np.median(f0))

f0_stats['f0_mean'] = means
f0_stats['f0_std'] = stds
f0_stats['f0_median'] = medians

In [None]:
# Flag, row by row, whether the wav path contains "eps_" (the Canada
# recordings; every other row is treated as CPQD).
# `"eps_" in series` would test the index labels, not the values, and assign
# the same scalar to every row.
f0_stats['is_canada'] = f0_stats['wav_path'].str.contains("eps_", regex=False)

In [None]:
import os
import pandas as pd

In [None]:
# Collect the style folders of the speaker directory: names containing one of
# the four style tags, excluding those that start with "sint".
style_tags = ("eps_neutro", "eps_animado", "eps_rispido", "eps_acolhedor")

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the dataframe built from these folders (and therefore the seeded sampling and
# train/val split done later) reproducible across runs and machines.
eps_paths = [
    f
    for f in sorted(os.listdir('/l/disk1/awstebas/data/TTS/speaker-adriana/'))
    if any(tag in f for tag in style_tags) and not f.startswith("sint")
]

In [None]:
# The code above should result in this cell output. Maybe you should change the above code
eps_paths

In [None]:
data_p = '/l/disk1/awstebas/data/TTS/speaker-adriana/'

In [None]:
## Check that every style folder contains 'wav22' and 'transcricao' entries
c, t = True, True
for e in eps_paths:
    entries = os.listdir(data_p + e)

    if 'wav22' not in entries:
        print("There is no 'wav22' folder in ", e)
        c = False

    if 'transcricao' not in entries:
        print("There is no 'transcricao' folder in ", e)
        t = False

# Report success only when no folder was flagged above.
if c:
    print("All folders have wav22 file")

if t:
    print("All folders have transcricao file")

In [None]:
## CHECKING TEXT STRUCTURE

ex = '/l/disk1/awstebas/data/TTS/speaker-adriana/eps_rispido_aco/transcricao/eps_rispido_aco.txt'

with open(ex, 'r') as f:
    for line in f.readlines():
        print(line.split(':'))
        break

In [None]:
## Build parallel lists of wav paths, texts and styles from the transcriptions

styles = []
wavs = []
texts = []
for e in eps_paths:
    local_dir = data_p + e
    # The style is the second '_'-separated token of the folder name and is
    # the same for every line of the folder.
    style = e.split('_')[1]

    transc_path = local_dir + '/transcricao/' + e + '.txt'
    with open(transc_path, 'r') as f:
        for line in f:
            # Drop only the line terminator, so a final line without a
            # trailing newline keeps its last character.
            line = line.rstrip('\n')
            if not line.strip():
                continue

            # Split on the FIRST colon only: the text itself may contain ':'.
            utt_id, sep, text = line.partition(':')
            if not sep:
                print(f'{transc_path}: skipping line without ":" separator: {line!r}')
                continue

            wpath = local_dir + '/wav22/' + utt_id + '.wav'

            if os.path.isfile(wpath):
                wavs.append(wpath)
                # Skip the single character that follows the colon.
                texts.append(text[1:])
                styles.append(style)
            else:
                print(f'{wpath} is not a file')

In [None]:
len(texts)

In [None]:
def get_phones(txt):
    """Return the phonetic transcription of ``txt`` using the external tool.

    The text is written to ``tmp.txt``, ``tool_language`` is run with
    ``--phonemes`` and the first line of ``tmp_pnh.txt`` is returned without
    its line terminator.

    Args:
        txt: Text to transcribe.

    Returns:
        The first line written by the tool, as a string.

    Raises:
        subprocess.CalledProcessError: If the tool exits with a non-zero
            status (previously ignored, which could silently return the
            output of an earlier call).
        IndexError: If the tool produced an empty output file.
    """
    import subprocess

    with open('tmp.txt', "w", encoding="utf-8") as f:
        # writelines accepts a plain string as well as an iterable of strings.
        f.writelines(txt)

    # Argument list instead of a shell string: no shell parsing involved.
    cmd = [
        '/workspace/tool_language/tool_language',
        '-l', '/workspace/tool_language/libptbr.so.4.6.0',
        '-p', '/workspace/tool_language/',
        '-i', 'tmp.txt',
        '-o', 'tmp_pnh.txt',
        '--phonemes',
        '-s',
    ]
    subprocess.run(cmd, check=True)

    # Read with the same encoding used to write the input file.
    with open('tmp_pnh.txt', "r", encoding="utf-8") as f:
        lines = f.readlines()

    # Strip only the newline; a line without one keeps its last character.
    return lines[0].rstrip('\n')

In [None]:
get_phones(texts[0])

In [None]:
# Phonetic transcription of every text, in the same order as `texts`.
phones = [get_phones(t) for t in texts]

In [None]:
speakers = ['adriana'] * len(phones)
df = pd.DataFrame({'norm_text': texts,
                   'phonetic_transcription': phones,
                   'wav_path': wavs,
                   'style': styles,
                   'speaker': speakers})

df.head(), df.shape

In [None]:
df['phonetic_transcription'].values[:3]

# Now let's partition the data into train, val and test...

The test set will consist of 50% samples paired across all styles and 50% non-paired samples.

In [None]:
df['style'].value_counts()

In [None]:
# Lets get a flag in which we have same phonemes

gp = df.groupby('phonetic_transcription').count().reset_index().sort_values(by='speaker', ascending= False)
gp['len'] = gp.phonetic_transcription.str.len()
gp.head(30)

In [None]:
df[df['phonetic_transcription'] == gp.phonetic_transcription.values[2]]

In [None]:
import IPython
f =df[df['phonetic_transcription'] == gp.phonetic_transcription.values[2]]['wav_path'].values[-1]
IPython.display.Audio(f)

In [None]:
import IPython
f =df[df['phonetic_transcription'] == gp.phonetic_transcription.values[0]]['wav_path'].values[4]
IPython.display.Audio(f)

In [None]:
50/5180

# Let's get 20 paired and 30 unpaired samples

This is because 20 paired samples yield at least 80 audio samples (one for each of the 4 styles), while 30 unpaired samples yield only 30 audio samples.

In [None]:
# Count how many rows share each phonetic transcription and keep the 20 most
# repeated transcriptions as the "paired" test sentences.
# After count(), the 'speaker' column holds the number of rows (with a
# non-null speaker) in each transcription group.
# NOTE(review): a high count does not by itself guarantee one recording per
# style -- confirm with value_counts('style') on the selected rows.

gp = df.groupby('phonetic_transcription').count().reset_index().sort_values(by='speaker', ascending= False)
gp['len'] = gp.phonetic_transcription.str.len()

paired = gp.head(20).phonetic_transcription.values
paired

In [None]:
# Collect the transcriptions that occur exactly once in the dataset
# (group count == 1): these are the "non-paired" candidates.
# The ascending sort only affects the order of `gp`, not which rows match.

gp = df.groupby('phonetic_transcription').count().reset_index().sort_values(by='speaker', ascending= True)
gp['len'] = gp.phonetic_transcription.str.len()

nonpaired = gp[gp['speaker'] == 1].phonetic_transcription.values
len(nonpaired)

In [None]:
npfilt = df[df.phonetic_transcription.isin(nonpaired)]

In [None]:
nonpaired_neutral = npfilt[npfilt['style'] == 'neutro'].sample(5, random_state = 42).phonetic_transcription.values
nonpaired_neutral

In [None]:
nonpaired_animado = npfilt[npfilt['style'] == 'animado'].sample(5, random_state = 42).phonetic_transcription.values
nonpaired_animado

In [None]:
nonpaired_acolhedor = npfilt[npfilt['style'] == 'acolhedor'].sample(5, random_state = 42).phonetic_transcription.values
nonpaired_acolhedor

In [None]:
nonpaired_rispido = npfilt[npfilt['style'] == 'rispido'].sample(5, random_state = 42).phonetic_transcription.values
nonpaired_rispido

In [None]:
# Turn the per-style samples into plain lists.
nonpaired_neutral, nonpaired_animado, nonpaired_acolhedor, nonpaired_rispido = (
    list(group)
    for group in (
        nonpaired_neutral,
        nonpaired_animado,
        nonpaired_acolhedor,
        nonpaired_rispido,
    )
)

# All non-paired test transcriptions: neutral, animado, rispido, acolhedor.
nonpaired = (
    nonpaired_neutral
    + nonpaired_animado
    + nonpaired_rispido
    + nonpaired_acolhedor
)

paired = list(paired)
len(nonpaired), len(paired)

In [None]:
# Lets just hear some examples if its all right
import IPython
f =df[df['phonetic_transcription'] == nonpaired_neutral[4]]['wav_path'].values[0]
IPython.display.Audio(f)

In [None]:
# Lets just hear some examples if its all right
import IPython
f =df[df['phonetic_transcription'] == nonpaired_animado[0]]['wav_path'].values[0]
IPython.display.Audio(f)

In [None]:
# Lets just hear some examples if its all right
import IPython
f =df[df['phonetic_transcription'] == nonpaired_rispido[0]]['wav_path'].values[0]
IPython.display.Audio(f)

In [None]:
# Lets just hear some examples if its all right
import IPython
f =df[df['phonetic_transcription'] == nonpaired_acolhedor[0]]['wav_path'].values[0]
IPython.display.Audio(f)

Everything looks fine, so let's now split off the entire test set.


In [None]:
df_test_paired = df[df.phonetic_transcription.isin(paired)]
df_test_paired.shape

In [None]:
df_test_paired.value_counts('style')

In [None]:
df_test_unpaired = df[df.phonetic_transcription.isin(nonpaired)]
df_test_unpaired.shape

In [None]:
# Finally, lets save files as meta_data format for cpqd_read, which use ';' as separator

cols = ['phonetic_transcription', 'wav_path', 'speaker','style']

df_test_paired[cols].to_csv('styles_paired_test.csv', index = False, sep=';', encoding = 'utf-8')
df_test_unpaired[cols].to_csv('styles_unpaired_test.csv', index = False, sep=';', encoding = 'utf-8')

# Now let's define our "rest" dataset and split it randomly into train and val

In [None]:
# Every transcription already assigned to a test set (non-paired, then paired).
total_to_exclude = [*nonpaired, *paired]

# Keep only the rows whose transcription is not part of any test set.
in_test = df.phonetic_transcription.isin(total_to_exclude)
df_rest = df[~in_test]
df_rest.shape, df.shape

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
df_train, df_val = train_test_split(df_rest, test_size = 0.03, stratify=df_rest['style'], random_state = 42)

In [None]:
import librosa

def get_durations(wpath):
    """Return the duration in seconds of the audio file at ``wpath``.

    The file is loaded at its native sampling rate (``sr=None``) and the
    duration is the number of samples divided by that rate.
    """
    samples, rate = librosa.load(wpath, sr=None)
    n_samples = len(samples)
    return n_samples / rate

In [None]:
# Duration in seconds of every training utterance.
durs = [get_durations(w) for w in df_train['wav_path']]
df_train['durs'] = durs

# Same for the validation utterances.
durs = [get_durations(w) for w in df_val['wav_path']]
df_val['durs'] = durs

In [None]:
df_train.durs.sum()/3600, df_val.durs.sum()/3600

In [None]:
# Total hours of audio in df_train per style (durs is in seconds)
df_train.groupby('style').agg({'durs': 'sum'})/3600

In [None]:
# Total hours of audio in df_val per style (durs is in seconds)
df_val.groupby('style').agg({'durs': 'sum'})/3600

# Now, load the train and val datasets from the universal list and keep only the data of 3 speakers

In [None]:
u_train = pd.read_csv("universal_cpqd_train.csv", encoding= 'utf-8', sep=';')
u_val = pd.read_csv("universal_cpqd_val.csv", encoding= 'utf-8', sep=';')

In [None]:
u_train['style'] = 'neutro'
u_val['style'] = 'neutro'

speakers_in = ['adriana','chiquinho','rosana']

u_train = u_train[u_train['speaker'].isin(speakers_in)]
u_val = u_val[u_val['speaker'].isin(speakers_in)]

u_train.speaker.unique(), u_val.speaker.unique()

In [None]:
def get_rec(wpath):
    """Return the '/'-separated component at index 7 of ``wpath``.

    Raises IndexError when the path has fewer than eight components.
    """
    components = wpath.split('/')
    return components[7]

# Recording-set name (taken from the wav path) for every training row.
recs = [get_rec(w) for w in u_train.wav_path]
u_train['rec'] = recs

# Same for the validation rows.
recs = [get_rec(w) for w in u_val.wav_path]
u_val['rec'] = recs

In [None]:
u_train.rec.unique(), u_val.rec.unique()

In [None]:
u_train.shape , u_train[u_train.rec.isin(['rf_selecionadas'])].shape

In [None]:
u_train[(u_train.rec.isin(['riqueza_fonetica']))].speaker.value_counts()

In [None]:
u_train[(u_train['speaker'].isin(['rosana','chiquinho'])) | ((u_train['speaker'] == 'adriana') & (u_train['rec'] == 'riqueza_fonetica'))].speaker.value_counts()

In [None]:
u_train = u_train[(u_train['speaker'].isin(['rosana','chiquinho'])) | ((u_train['speaker'] == 'adriana') & (u_train['rec'] == 'riqueza_fonetica'))]

In [None]:
u_val.speaker.value_counts()

In [None]:
## Getting only the necessary cols
cols = ['phonetic_transcription', 'wav_path', 'speaker','style']

u_train = u_train[cols]
u_val = u_val[cols]

df_train = df_train[cols]
df_val = df_val[cols]

df_train_tot = pd.concat([df_train, u_train]).reset_index(drop = True)
df_val_tot = pd.concat([df_val, u_val]).reset_index(drop = True)

df_train_tot.shape, df_val_tot.shape

In [None]:
df_train_tot.head()

In [None]:
df_train_tot.speaker.value_counts()

In [None]:
df_val_tot.speaker.value_counts()

In [None]:
# Duration in seconds of every utterance of the combined training set.
durs = [get_durations(w) for w in df_train_tot['wav_path']]
df_train_tot['durs'] = durs

# Same for the combined validation set.
durs = [get_durations(w) for w in df_val_tot['wav_path']]
df_val_tot['durs'] = durs

In [None]:
df_train_tot.durs.sum()/3600, df_val_tot.durs.sum()/3600

In [None]:
# Total hours of audio in df_train_tot per style (durs is in seconds)
df_train_tot.groupby('style').agg({'durs': 'sum'})/3600

In [None]:
# df_train per speaker
df_train_tot.groupby('speaker').agg({'durs': 'sum'})/3600

In [None]:
cols = ['phonetic_transcription', 'wav_path', 'speaker','style']

df_train_tot[cols].to_csv('mp_styles_train.csv', index = False, sep=';', encoding = 'utf-8')
df_val_tot[cols].to_csv('mp_styles_val.csv', index = False, sep=';', encoding = 'utf-8')