In [1]:
import os
import numpy as np
from tqdm import tqdm

# import neurox
import neurox.interpretation.utils as utils

from ttsxai.utils.utils import read_ljs_metadata

In [2]:
# NOTE(review): hard-coded absolute path; consider moving it to a configurable DATA_DIR.
data_activation_dir = "/nas/users/dahye/kw/tts/ttsxai/data_activation/LJSpeech/tacotron2_waveglow"

# Identifiers of the test split; only membership (`in`) is used below.
# Presumably a dict keyed by utterance ID (e.g. 'LJ037-0213') -- confirm against read_ljs_metadata.
keys_to_filter = read_ljs_metadata(mode='test')

# Full paths of the .npz activation files whose identifier is in the test split.
# os.listdir() returns entries in arbitrary order, so sort the names to make the
# order of npz_files (and everything built from it later) reproducible.
npz_files = [
    os.path.join(data_activation_dir, fname)
    for fname in sorted(os.listdir(data_activation_dir))
    # The identifier is the part of the file name before the first '.'.
    if fname.endswith('.npz') and fname.split('.')[0] in keys_to_filter
]

In [3]:
# Per-file token sequences: 'source' holds phone symbols, 'target' holds the
# articulatory-feature labels stored alongside them.
tokens = {'source': [], 'target': []}
# One entry per file: the object stored under the 'activations' key.
dict_activations = []

for file in tqdm(npz_files):
    # np.load on an .npz returns a lazily-read archive that keeps the file open;
    # the context manager closes the handle at the end of every iteration.
    # NOTE(review): allow_pickle=True unpickles file contents -- only load trusted files.
    with np.load(file, allow_pickle=True) as data_dict:
        tokens['source'].append(list(data_dict['phonesymbols']))
        tokens['target'].append(list(data_dict['articulatory_features']))
        # .item() unwraps the stored object (presumably a dict of per-layer arrays,
        # since later cells call .values()/.items() on it).
        dict_activations.append(data_dict['activations'].item())

100%|██████████| 500/500 [00:03<00:00, 160.13it/s]


In [9]:
# Inspect the collected articulatory-feature label sequences (one list per loaded file).
# NOTE(review): this displays every sequence in full; a slice would keep the output short.
tokens['target']

[['Dental',
  'Vowel',
  'Space',
  'Labiodental',
  'Vowel',
  'Alveolar',
  'Bilabial',
  'Alveolar',
  'Space',
  'Vowel',
  'Labiodental',
  'Space',
  'Bilabial',
  'Alveolar',
  'Vowel',
  'Alveolar',
  'Alveolar',
  'Vowel',
  'Alveolar',
  'Space',
  'Alveolar',
  'Vowel',
  'Alveolar',
  'Vowel',
  'Alveolar',
  'Space',
  'Post-Alveolar',
  'Vowel',
  'Alveolar',
  'Space',
  'Bilabial',
  'Vowel',
  'Space',
  'Bilabial',
  'Palatal',
  'Vowel',
  'Alveolar',
  'Vowel',
  'Labiodental',
  'Vowel',
  'Alveolar',
  'Punctuation',
  'Space',
  'Vowel',
  'Alveolar',
  'Alveolar',
  'Space',
  'Dental',
  'Vowel',
  'Alveolar',
  'Space',
  'Dental',
  'Vowel',
  'Alveolar',
  'Space',
  'Vowel',
  'Vowel',
  'Alveolar',
  'Post-Alveolar',
  'Bilabial',
  'Vowel',
  'Alveolar',
  'Alveolar',
  'Space',
  'Vowel',
  'Alveolar',
  'Space',
  'Dental',
  'Vowel',
  'Space',
  'Bilabial',
  'Vowel',
  'Post-Alveolar',
  'Space',
  'Post-Alveolar',
  'Vowel',
  'Alveolar',
  'Space',

In [4]:
# Build one array per file by joining all of that file's per-layer arrays
# along axis 1, in the iteration order of the per-file dict.
activations = []
for layer_outputs in dict_activations:
    per_layer_arrays = list(layer_outputs.values())
    activations.append(np.concatenate(per_layer_arrays, axis=1))

In [7]:
# Build the probing tensors from the token sequences and the concatenated activations.
# 'NN' is passed as the third positional argument (presumably NeuroX's task-specific
# tag -- confirm against the create_tensors documentation).
X, y, mapping = utils.create_tensors(tokens, activations, 'NN')
label2idx, idx2label, src2idx, idx2src = mapping

# Map each column index of the concatenated activation matrix to '<layer_name>__<k>',
# where k is the column's position within that layer. Layer widths are read from
# the first file's activation dict.
neuronidx2name = {}
current_start_index = 0
# The loop variable must NOT be called `activations`: that would rebind the
# notebook-level list used by create_tensors above and break any re-run of this cell.
for layer_name, layer_activations in dict_activations[0].items():
    layer_width = layer_activations.shape[1]
    for offset in range(layer_width):
        neuronidx2name[current_start_index + offset] = f'{layer_name}__{offset}'
    current_start_index += layer_width


Number of tokens:  43250
length of source dictionary:  78
length of target dictionary:  12
43250
Total instances: 43250
['IY2', 'Z', 'N', 'OY1', 'ZH', 'AY0', 'EY0', 'EH0', 'OW1', 'UW0', 'SH', 'AH1', 'T', 'UH0', 'EH1', 'EY1', 'AE2', 'G', 'AH0', 'AA0']
Number of samples:  43250
Stats: Labels with their frequencies in the final set
Glottal 527
Space 7989
Labiodental 1411
Dental 1302
Punctuation 1016
Alveolar 12338
Bilabial 2225
Post-Alveolar 660
Palatal 184
Labio-Velar 790
Velar 1503
Vowel 13305


In [6]:
# Display the tuple returned by utils.create_tensors: (label2idx, idx2label, src2idx, idx2src).
mapping

({'Glottal': 0,
  'Labiodental': 1,
  'Alveolar': 2,
  'Labio-Velar': 3,
  'Punctuation': 4,
  'Space': 5,
  'Dental': 6,
  'Vowel': 7,
  'Bilabial': 8,
  'Palatal': 9,
  'Velar': 10,
  'Post-Alveolar': 11},
 {0: 'Glottal',
  1: 'Labiodental',
  2: 'Alveolar',
  3: 'Labio-Velar',
  4: 'Punctuation',
  5: 'Space',
  6: 'Dental',
  7: 'Vowel',
  8: 'Bilabial',
  9: 'Palatal',
  10: 'Velar',
  11: 'Post-Alveolar'},
 {'P': 0,
  '(': 1,
  'EH1': 2,
  'AW2': 3,
  'JH': 4,
  'K': 5,
  'AH0': 6,
  'W': 7,
  'EH2': 8,
  'EY1': 9,
  'ZH': 10,
  ')': 11,
  'OW2': 12,
  ' ': 13,
  'IH1': 14,
  'AH1': 15,
  'IY2': 16,
  'IH2': 17,
  'IY0': 18,
  '?': 19,
  'IY1': 20,
  'AY0': 21,
  'AE2': 22,
  ',': 23,
  'AO0': 24,
  'EY0': 25,
  'ER0': 26,
  'AY2': 27,
  'S': 28,
  'R': 29,
  'UH2': 30,
  'D': 31,
  'HH': 32,
  'ER1': 33,
  'EH0': 34,
  'AH2': 35,
  'OW1': 36,
  '.': 37,
  'OY1': 38,
  "'": 39,
  'AW1': 40,
  'N': 41,
  'UH1': 42,
  ';': 43,
  'AW0': 44,
  'EY2': 45,
  'AA0': 46,
  'DH': 47,
  'S

In [32]:
# Display the current value of `activations` (the recorded output below is a list of float32 arrays).
# NOTE(review): this dumps the whole object; showing shapes or a small slice would be more readable.
activations

[array([[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          1.5101662e-01,  1.3451192e-04,  5.8807361e-01],
        [ 6.0087655e-02,  0.0000000e+00,  0.0000000e+00, ...,
          1.2817810e-01,  3.0887148e-01,  4.7346628e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          1.9432725e-01,  2.9199934e-01,  1.0133484e-01],
        ...,
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          3.9768615e-01,  1.6183079e-05,  1.1599790e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          5.2805424e-01,  4.3038610e-05, -7.7314451e-02],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          5.8202129e-02,  5.8932543e-01, -6.3348964e-02]], dtype=float32),
 array([[ 0.        ,  0.        ,  0.        , ..., -0.56255037,
          0.00309492, -0.2036153 ],
        [ 0.        ,  0.        ,  0.        , ..., -0.69720167,
          0.05770411, -0.04321938],
        [ 0.        ,  0.        ,  0.        