In [8]:
import os
import textgrid
from pathlib import Path
from collections import Counter
import numpy as np
from pymcd.mcd import Calculate_MCD
import librosa
import pysptk
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
import math
import pandas as pd

In [9]:
def get_word_alignments(
    textgrid_path,
    utt_dur_from_last_word=False,
    ignore_list=('<unk>',),
):
    """
    extract word alignments from textgrid file corresponding to one utterance

    utt_dur_from_last_word: whether to set utt_dur to end timestamp of  last real wordtype, or from
    the very last alignment in the utterance (likely corresponding to silence)
    """
    tg = textgrid.TextGrid.fromFile(textgrid_path)
    words_intervaltier, _phones_intervaltier = tg
    words = []
    counter = Counter()

    for word in words_intervaltier:
        if word.mark and word.mark not in ignore_list:  # if word.mark is False then it is SILENCE
            counter[word.mark] += 1
            words.append({
                "wordtype": word.mark,
                "utt_id": textgrid_path.split('/')[-1].split('.')[0],
                "example_no": counter[word.mark],  # the number of times we have seen this word in this utterance
                "start": word.minTime,
                "end": word.maxTime,
            })

    if utt_dur_from_last_word:
        # use last real word end time as the utt_dur
        utt_dur = words[-1]['end']
    else:
        # at this point word is the last item in words_intervaltier (most likely sil / None)
        utt_dur = word.maxTime

    # add utt_dur info to all words
    for w in words:
        w["utt_dur"] = utt_dur

    return words

def get_wordlevel_reprs(speechreps, word_align):
    """
    extract subsequence of 'repr' that corresponds to a particular word
    function expects input to be of dimension 2: (timesteps, hidden_size)
    """
    start_fraction = word_align['start'] / word_align['utt_dur']
    end_fraction = word_align['end'] / word_align['utt_dur']
    timesteps = len(speechreps)
    start_idx = round(start_fraction * timesteps)
    end_idx = round(end_fraction * timesteps)
    return speechreps[start_idx:end_idx]

def average_mcd(ref_mcep_vec, syn_mcep_vec, MCD_mode):
    """
    Calculate the average MCD.
    :param ref_mcep_files: list of strings, paths to MCEP target reference files
    :param synth_mcep_files: list of strings, paths to MCEP converted synthesised files
    :param cost_function: distance metric used
    :param plain: if plain=True, use Dynamic Time Warping (dtw)
    :returns: average MCD, total frames processed
    """
    log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0) # 6.141851463713754
    
    def log_spec_dB_dist(x, y):
        # log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)
        diff = x - y
        return self.log_spec_dB_const * math.sqrt(np.inner(diff, diff))
    
    def calculate_mcd_distance(x, y, path):
        '''
        param path: pairs between x and y
        '''
        pathx = list(map(lambda l: l[0], path))
        pathy = list(map(lambda l: l[1], path))
        x, y = x[pathx], y[pathy]
        frames_tot = x.shape[0]       # length of pairs

        z = x - y
        min_cost_tot = np.sqrt((z * z).sum(-1)).sum()

        return frames_tot, min_cost_tot
    
    if MCD_mode == "plain":
        # print("Calculate plain MCD ...")
        path = []
        # for i in range(num_temp):
        for i in range(len(ref_mcep_vec)):
            path.append((i, i))
    elif MCD_mode == "dtw":
        # print("Calculate MCD-dtw ...")
        _, path = fastdtw(ref_mcep_vec[:, 1:], syn_mcep_vec[:, 1:], dist=euclidean)
    elif MCD_mode == "dtw_sl":
        # print("Calculate MCD-dtw-sl ...")
        cof = len(ref_mcep_vec)/len(syn_mcep_vec) if len(ref_mcep_vec)>len(syn_mcep_vec) else len(syn_mcep_vec)/len(ref_mcep_vec)
        _, path = fastdtw(ref_mcep_vec[:, 1:], syn_mcep_vec[:, 1:], dist=euclidean)

    frames_tot, min_cost_tot = calculate_mcd_distance(ref_mcep_vec, syn_mcep_vec, path)

    if MCD_mode == "dtw_sl":
        mean_mcd = cof * log_spec_dB_const * min_cost_tot / frames_tot
    else:
        mean_mcd = log_spec_dB_const * min_cost_tot / frames_tot

    return mean_mcd

In [10]:
melspec_dir = Path('/home/s1785140/data/VCTK_fairseq/feature_manifest/logmels')
align_dir = Path('/home/s1785140/data/vctk_montreal_alignments_from_trimmed_wavs')
n_mfcc = 13
speakers = ['p225', 'p226', 'p227', 'p228', 'p229']
ref_speaker = 'p225'
words_to_compare = [ 
    'physical',
    'physically',
    'physicists',
    'concern',
    'concerned',
    'concert',
    'change',
    'changed',
    'changes',
]
ref_word = 'change'
# ref_word = 'physicists'

In [11]:
def get_word2melspec(speaker):
    uttid2wordaligns = {}

    for textgrid_file in os.listdir(align_dir/speaker):
        uttid = textgrid_file.split('.')[0]
        textgrid_path = align_dir/speaker/textgrid_file
        uttid2wordaligns[uttid] = get_word_alignments(str(textgrid_path))
        
    #get mel specs for all words we wish to compare distances between
    word2melspec = {}
    for uttid, wordaligns in uttid2wordaligns.items():
        melspec = None
        for wordalign in wordaligns:        
            wordtype = wordalign['wordtype']
            if wordtype in words_to_compare:
                if melspec is None: # only compute melspec for words we wish to compare + only do this once
                    melspec = np.load(str(melspec_dir/uttid) + '.npy')
                wordalign['melspec'] = get_wordlevel_reprs(melspec, wordalign)
                example_no = wordalign['example_no']
                unique_id = uttid + '|' + str(example_no)

                if wordtype not in word2melspec:
                    word2melspec[wordtype] = {}
                word2melspec[wordtype][unique_id] = wordalign['melspec']

    return word2melspec

# get word2melspec for each speaker
speaker2word2melspec = {}
for speaker in speakers:
    speaker2word2melspec[speaker] = get_word2melspec(speaker)

In [12]:
# get melspec for the reference word/speaker
wordtoken_idx = 0 # just choose the first example of the ref word 
ref_word_uniqueid, ref_word_melspec = list(speaker2word2melspec[ref_speaker][ref_word].items())[wordtoken_idx]
ref_word_melspec.shape

(36, 80)

In [13]:
# create pandas dataframe containing distance from each mel spec to a ref mel spec
df = pd.DataFrame(columns=[
    'ref_word',
    # 'ref_speaker',
    # 'speaker',
    'ref_uniqueid',
    'uniqueid',
    'word',
    'melspec_dtw',
    'melspec_dtw_sl',
    'mfcc_dtw',
    'mfcc_dtw_sl',
    'ref_word_frames',
    'word_frames',
])

for speaker in speakers:
    for otherword in words_to_compare:
        word2melspec = speaker2word2melspec[speaker]
        if otherword in word2melspec:
            for wordtoken_uniqueid, melspec in word2melspec[otherword].items():
                # get distance between ref word + other word
                ref_word_melcepstrum = librosa.feature.mfcc(S=ref_word_melspec.T, n_mfcc=n_mfcc).T
                otherword_melcepstrum = librosa.feature.mfcc(S=melspec.T, n_mfcc=n_mfcc).T

                # print(f'{ref_word_melspec.shape=}, {ref_word_melcepstrum.shape=}')
                # print(f'{melspec.shape=}, {otherword_melcepstrum.shape=}')
                
                df.loc[len(df.index)] = {
                    'ref_word': ref_word,
                    'word': otherword,
                    # 'ref_speaker': ref_speaker,
                    # 'speaker': speaker,
                    'ref_uniqueid': ref_word_uniqueid,
                    'uniqueid': wordtoken_uniqueid, 
                    'melspec_dtw': average_mcd(ref_word_melspec, melspec, MCD_mode='dtw'),
                    'melspec_dtw_sl': average_mcd(ref_word_melspec, melspec, MCD_mode='dtw_sl'),
                    'mfcc_dtw': average_mcd(ref_word_melcepstrum, otherword_melcepstrum, MCD_mode='dtw'),
                    'mfcc_dtw_sl': average_mcd(ref_word_melcepstrum, otherword_melcepstrum, MCD_mode='dtw_sl'),
                    'ref_word_frames': ref_word_melspec.shape[0],
                    'word_frames': melspec.shape[0],
                }
        
# add new column
df.insert(len(df.columns),'len_ratio', df['word_frames'] / df['ref_word_frames'])

In [14]:
# df.sort_values(by=['word', 'melspec_dtw'])
df.sort_values(by=['melspec_dtw'])

Unnamed: 0,ref_word,ref_uniqueid,uniqueid,word,melspec_dtw,melspec_dtw_sl,mfcc_dtw,mfcc_dtw_sl,ref_word_frames,word_frames,len_ratio
6,change,p225_045|1,p225_045|1,change,0.0,0.0,0.0,0.0,36,36,1.0
21,change,p225_045|1,p226_339|1,changed,57.999467,157.887438,186.016271,506.377626,36,98,2.722222
1,change,p225_045|1,p225_017|1,physically,72.03939,140.076593,142.799728,277.666138,36,70,1.944444
41,change,p225_045|1,p228_313|1,change,73.201076,146.402152,166.764631,333.529263,36,72,2.0
10,change,p225_045|1,p225_175|1,changed,75.219053,114.917997,74.099442,113.207481,36,55,1.527778
20,change,p225_045|1,p226_190|1,changed,78.366896,158.91065,103.96838,210.82477,36,73,2.027778
50,change,p225_045|1,p229_199|1,changed,79.427203,132.378671,72.937024,121.561707,36,60,1.666667
4,change,p225_045|1,p225_355|1,concerned,80.642439,141.124267,107.8981,188.821676,36,63,1.75
45,change,p225_045|1,p229_017|1,physically,82.279212,166.843958,76.562528,155.251792,36,73,2.027778
23,change,p225_045|1,p227_013|1,physical,85.55989,161.613126,93.03856,175.739502,36,68,1.888889


In [15]:
# df.sort_values(by=['word', 'mfcc_dtw'])
df.sort_values(by=['mfcc_dtw'])

Unnamed: 0,ref_word,ref_uniqueid,uniqueid,word,melspec_dtw,melspec_dtw_sl,mfcc_dtw,mfcc_dtw_sl,ref_word_frames,word_frames,len_ratio
6,change,p225_045|1,p225_045|1,change,0.0,0.0,0.0,0.0,36,36,1.0
50,change,p225_045|1,p229_199|1,changed,79.427203,132.378671,72.937024,121.561707,36,60,1.666667
10,change,p225_045|1,p225_175|1,changed,75.219053,114.917997,74.099442,113.207481,36,55,1.527778
42,change,p225_045|1,p228_040|1,changed,89.671383,146.961434,74.594736,122.252484,36,59,1.638889
45,change,p225_045|1,p229_017|1,physically,82.279212,166.843958,76.562528,155.251792,36,73,2.027778
32,change,p225_045|1,p227_118|1,changed,87.24771,189.036705,80.093067,173.534979,36,78,2.166667
27,change,p225_045|1,p227_101|1,change,94.630404,123.54525,80.896633,105.615048,36,47,1.305556
11,change,p225_045|1,p225_366|1,changes,91.571436,155.162712,81.580191,138.233102,36,61,1.694444
40,change,p225_045|1,p228_280|1,change,93.631121,153.451004,81.928683,134.272008,36,59,1.638889
14,change,p225_045|1,p226_019|1,physicists,89.905771,129.863892,82.013994,118.464658,36,52,1.444444


In [21]:
# Get distances for hubert codes
import pickle
p = '/home/s1785140/data/word2speechreps/vctk_train_km100_word2speechreps.pickle'
with open(p, 'rb') as f:
    (word2speechreps, ids2word_alignments) = pickle.load(f)

In [30]:
num_to_print = 5
for word, uniqueid2speechcodes in list(word2speechreps.items())[:num_to_print]:
    print('\n', word, ':', list(uniqueid2speechcodes.items())[:2])


 ask : [('p225_002|1', [71, 24, 24, 24, 24, 61, 61, 43, 43, 6, 15, 15, 92, 92, 27, 89]), ('p225_191|1', [35, 24, 24, 51, 51, 19, 19, 65, 6, 15, 15, 92, 92, 89, 89])]

 her : [('p225_002|1', [59, 33, 33, 87, 9, 38, 9, 9, 9]), ('p225_002|2', [75, 33, 87, 91, 91, 91, 91, 43, 43, 43, 65])]

 to : [('p225_002|1', [65, 74, 27, 31, 23, 23, 23]), ('p225_009|1', [18, 31, 31, 23, 23, 23])]

 bring : [('p225_002|1', [66, 27, 47, 25, 25, 88, 88, 88, 18]), ('p226_002|1', [66, 63, 47, 52, 52, 25, 25, 25, 88, 88, 88, 88, 88, 18, 18])]

 these : [('p225_002|1', [18, 82, 11, 11, 64, 65, 6, 15, 15]), ('p225_005|1', [3, 82, 11, 45, 64, 65, 6, 15])]


In [32]:
# get all words for a single speaker
speaker = 'p225'
p225_word2speechreps = {}
for word, uniqueid2speechcodes in word2speechreps.items():
    new_uniqueid2speechcodes = {}
    for uniqueid, speechcodes in uniqueid2speechcodes.items():
        if speaker == uniqueid.split('_')[0]:
            new_uniqueid2speechcodes[uniqueid] = speechcodes
    if new_uniqueid2speechcodes:
        p225_word2speechreps[word] = new_uniqueid2speechcodes

# get k means models
```bash
cd ~/fairseq/examples/textless_nlp/gslm/speech2unit/pretrained_models/hubert/
wget https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/km100/km.bin
mv km.bin km100.bin
wget https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/km50/km.bin
mv km.bin km50.bin
wget https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/km200/km.bin
mv km.bin km200.bin
```

In [34]:
# get cont. vectors for seq of discrete codes
import joblib
kmeans_model_path = '~/fairseq/examples/textless_nlp/gslm/speech2unit/pretrained_models/hubert/km100.bin'
kmeans_model = joblib.load(open(kmeans_model_path, "rb"))
kmeans_model.verbose = False
pred = kmeans_model.predict(feats)

FileNotFoundError: [Errno 2] No such file or directory: '~/fairseq/examples/textless_nlp/gslm/speech2unit/pretrained_models/hubert/km100.bin'