In [2]:
import os
import glob
import numpy as np
import argparse
from tqdm import tqdm

import torch
from torch.multiprocessing import Pool, set_start_method

from ttsxai.interfaces.prosody_interface import ProsodyInterface
from ttsxai.interfaces.tts_interface import get_text2mel, get_mel2wave
from ttsxai.articulatory_features import get_articulatory_features_for_phoneme


In [3]:
data_activation_dir = "/nas/users/dahye/kw/tts/ttsxai/data_activation/LJSpeech/tacotron2_waveglow"
npz_files = glob.glob(os.path.join(data_activation_dir, '*.npz'))

In [4]:
all_phonesymbols = []
all_articulatory_features = []
all_activations = []
for file in tqdm(npz_files):
    # print('only 10!!! for debugging')
    data_dict = np.load(file, allow_pickle=True)
    activations = data_dict['activations'].item()
    
    # TODO: remove it as we will save it explicitly
    phonesymbols = data_dict['phonesymbols']
    # articulatory_features = get_articulatory_features_for_phoneme(phonesymbols)
    articulatory_features = data_dict['articulatory_features']

    all_phonesymbols.append(phonesymbols)
    all_articulatory_features.append(articulatory_features)
    all_activations.append(activations)
    # break

100%|██████████| 13100/13100 [01:53<00:00, 115.48it/s]


In [5]:
def create_tensors(
    source_tokens,
    target_tokens,
    activations,
    task_specific_tag="Unknown",
    task_type="classification",
    binarized_tag=None,
    balance_data=False,
    dtype=None,
):
    """
    Method to pre-process loaded datasets into tensors that can be used to train
    probes and perform analyis on. The input tokens are represented as list of
    sentences, where each sentence is a list of tokens. Each token also has
    an associated label. All tokens from all sentences are flattened into one
    dimension in the returned tensors. The returned tensors will thus have
    ``total_num_tokens`` rows.

    Parameters
    ----------
    source_tokens: list of arrays
        List of sentences, where each sentence is a list of tokens. 
        e.g., [array(['M', 'ER0', 'IY1', ...], array(['JH', 'AH1', 'S', ...])
    target_tokens: list of arrays
        It depends on task_type. For example, in classification task, 
        articulatory_features can be used as targets.
        e.g., [['Bilabial', 'Vowel', 'Vowel', ...], [...]]
    activations : list of dictionary
        e.g., activations[0] has
        {'conv_0': ..., 'conv_1': ...}
        where each key has *sentence representations*, where each *sentence representation*
        is a numpy matrix of shape.
    task_specific_tag : str
        Label to assign tokens with unseen labels. This is particularly useful
        if some labels are never seen during train, but are present in the dev
        or test set. This is usually set to the majority class in the task.
    task_type : str
        Either "classification" or "regression", indicate the kind of task that
        is being probed.
    binarized_tag : str, optional
            Tag/Label to create binary data. All other labels in the dataset are changed
            to OTHER. Defaults to None in which case the data labels are processed as-is.
    balance_data : bool, optional
        Whether the incoming data should be balanced. Data is balanced using
        utils.balance_binary_class_data for binary data and utils.balance_multi_class_data
        for multi-class data using undersampling. Defaults to False.
    dtype : str, optional
        None if the dtype of the activation tensor should be the same dtype as in the activations input
        e.g. 'float16' or 'float32' to enforce half-precision or full-precision floats

    """
    assert (
        task_type == "classification" or task_type == "regression"
    ), "Invalid model type"
    num_tokens = count_tokens(source_tokens)
    print("Number of tokens: ", num_tokens)

    # Concatenate activations
    concatenated_activations = [np.concatenate(list(d.values()), axis=1) for d in activations]
    num_neurons = concatenated_activations[0].shape[1]

    if task_type == "classification":
        if binarized_tag:
            label2idx = {binarized_tag: 1, "OTHER": 0}
            idx2label = {1: binarized_tag, 0: "OTHER"}
        else:
            label2idx = tok2idx(target_tokens)
            idx2label = idx2tok(label2idx)

    src2idx = tok2idx(source_tokens)
    idx2src = idx2tok(src2idx)
    nidx2name = neuronidx2name(activations[0])

    print("length of source dictionary: ", len(src2idx))
    if task_type == "classification":
        print("length of target dictionary: ", len(label2idx))

    if dtype == None:
        dtype = concatenated_activations[0].dtype
    X = np.zeros((num_tokens, num_neurons), dtype=dtype)
    if task_type == "classification":
        y = np.zeros((num_tokens,), dtype=np.int)
    else:
        y = np.zeros((num_tokens,), dtype=np.float32)

    example_set = set()

    idx = 0
    for instance_idx, instance in enumerate(target_tokens):
        for token_idx, _ in enumerate(instance):
            if idx < num_tokens:
                X[idx] = concatenated_activations[instance_idx][token_idx, :]

            example_set.add(source_tokens[instance_idx][token_idx])
            if task_type == "classification":
                current_target_token = target_tokens[instance_idx][token_idx]
                if binarized_tag and current_target_token != binarized_tag:
                    current_target_token = "OTHER"
                y[idx] = label2idx[current_target_token]
            elif task_type == "regression":
                y[idx] = float(target_tokens[instance_idx][token_idx])

            idx += 1

    print(idx)
    print("Total instances: %d" % (num_tokens))
    print(list(example_set)[:20])

    print("Number of samples: ", X.shape[0])

    # if balance_data:
    #     print("Balancing data ... ")
    #     if binarized_tag:
    #         X, y = balance_binary_class_data(X, y)
    #     else:
    #         X, y = balance_multi_class_data(X, y)
    #     print("Number of samples after balancing: ", X.shape[0])

    labels, freqs = np.unique(y, return_counts=True)

    print("Stats: Labels with their frequencies in the final set")
    for idx, label in enumerate(labels):
        print(idx2label[label], freqs[idx])

    if task_type == "classification":
        return X, y, (label2idx, idx2label, src2idx, idx2src, nidx2name)
    return X, y, (src2idx, idx2src)


def count_tokens(source):
    """
    Utility function to count the total number of tokens in a dataset.
    """
    return sum([len(t) for t in source])


def tok2idx(tokens):
    """
    Utility function to generate unique indices for a set of tokens.
    """
    uniq_tokens = set().union(*tokens)
    return {p: idx for idx, p in enumerate(uniq_tokens)}


def idx2tok(srcidx):
    """
    Utility function to an inverse mapping from a ``tok2idx`` mapping.
    """
    return {v: k for k, v in srcidx.items()}


# Create the mapping for a single activation dictionary
def neuronidx2name(d):
    mapping = {}
    current_start_index = 0
    for layer_name, activations in d.items():
        # Calculate the end index for this activation
        end_index = current_start_index + activations.shape[1] - 1
        for i in range(current_start_index, end_index + 1):
            mapping[i] = f'{layer_name}__{i - current_start_index}'
        current_start_index = end_index + 1
    return mapping


X, y, mapping = create_tensors(
    all_phonesymbols,
    all_articulatory_features,
    all_activations,
    task_type='classification'
)

Number of tokens:  1136550
length of source dictionary:  80
length of target dictionary:  12


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


1136550
Total instances: 1136550
['AH1', 'UW0', 'EH1', ',', 'AW2', 'ER2', 'NG', 'IH2', 'N', 'AY0', 'AE2', 'AE0', 'AE1', 'DH', 'AA1', 'SH', 'OW2', 'HH', 'AO1', 'EY0']
Number of samples:  1136550
Stats: Labels with their frequencies in the final set
Vowel 349861
Labiodental 36608
Glottal 13814
Bilabial 59737
Space 209298
Dental 32913
Punctuation 27053
Alveolar 322942
Velar 41152
Post-Alveolar 18170
Labio-Velar 20573
Palatal 4429


In [6]:
X.shape

(1136550, 2048)

In [74]:
X.shape

(760, 2048)

In [46]:
# Concatenate activations and build the mapping
concatenated_arrays = []
mapping = []

for d in all_activations:
    current_concatenated = []
    current_start_index = 0
    for layer_name, activations in d.items():
        current_concatenated.append(activations)
        
        # Calculate the end index for this activation
        end_index = current_start_index + activations.shape[1] - 1
        for i in range(current_start_index, end_index + 1):
            mapping.append((layer_name, i - current_start_index))
        
        current_start_index = end_index + 1
    
    concatenated_arrays.append(np.concatenate(current_concatenated, axis=1))

result = np.vstack(concatenated_arrays)

# Using the mapping
index_to_check = 3
layer, neuron_index = mapping[index_to_check]
print(f"Neuron at index {index_to_check} came from layer '{layer}' and was neuron number {neuron_index}.")

Neuron at index 3 came from layer 'conv_0' and was neuron number 3.


In [67]:
# Create the mapping for a single activation dictionary
# def create_mapping(d):
def neuronidx2name(d):
    mapping = {}
    # mapping = []
    current_start_index = 0
    for layer_name, activations in d.items():
        # Calculate the end index for this activation
        end_index = current_start_index + activations.shape[1] - 1
        # print(layer_name, current_start_index, end_index)
        for i in range(current_start_index, end_index + 1):
            mapping[i] = f'{layer_name}__{i - current_start_index}'
            # mapping.append((layer_name, i - current_start_index))
        current_start_index = end_index + 1
    return mapping

mapping = neuronidx2name(all_activations[0])

# Concatenate activations
concatenated_arrays = [np.concatenate(list(d.values()), axis=1) for d in all_activations]
result = np.vstack(concatenated_arrays)

In [65]:
mapping[0]

('conv_0', 0)

In [54]:
mapping[2047]

('lstm', 511)

In [50]:
concatenated_arrays[0].shape

(76, 2048)

In [48]:
len(mapping)

20480

In [40]:
concatenated_arrays = np.array([np.concatenate(list(d.values()), axis=1) for d in all_activations])

  """Entry point for launching an IPython kernel.


In [45]:
concatenated_arrays[1].shape

(89, 2048)

In [37]:
all_activations is list of dictionary
e.g., activations[0] is dictionary with
  {'conv_0': array with shape (# tokens, # neurons of conv_0), 'conv_1': array with shape (# tokens, # neurons of conv_1)}
   where each key has *sentence representations*, where each *sentence representation*

I just want to change dictionary to array which concat all values so that activations would be just 2d array

[{'conv_0': array([[0.        , 0.        , 0.        , ..., 0.24294563, 0.305356  ,
          0.        ],
         [0.20261472, 0.        , 0.        , ..., 0.        , 0.        ,
          0.19406359],
         [0.        , 0.        , 0.36955297, ..., 0.        , 0.        ,
          0.18491985],
         ...,
         [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
          0.        ],
         [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
          0.07583992],
         [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
          0.        ]], dtype=float32),
  'conv_1': array([[0.        , 0.        , 0.02536896, ..., 0.        , 0.13122627,
          0.5711919 ],
         [0.        , 0.        , 0.3087739 , ..., 0.        , 0.25864828,
          0.        ],
         [0.        , 0.        , 0.        , ..., 0.        , 0.02876387,
          0.        ],
         ...,
         [0.        , 0.        , 0.        , ..., 0.

In [36]:
sum([activations[0]])

(67, 512)

In [None]:
activations[0].shape[1]

In [30]:
all_activations[0]

{'conv_0': array([[0.        , 0.        , 0.        , ..., 0.24294563, 0.305356  ,
         0.        ],
        [0.20261472, 0.        , 0.        , ..., 0.        , 0.        ,
         0.19406359],
        [0.        , 0.        , 0.36955297, ..., 0.        , 0.        ,
         0.18491985],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.07583992],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]], dtype=float32),
 'conv_1': array([[0.        , 0.        , 0.02536896, ..., 0.        , 0.13122627,
         0.5711919 ],
        [0.        , 0.        , 0.3087739 , ..., 0.        , 0.25864828,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.02876387,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,

In [28]:
count_target_words(all_articulatory_features)

760

In [26]:
phonesymbols

array(['SH', 'IY1', ' ', 'AO1', 'L', 'S', 'OW0', ' ', 'T', 'EH1', 'S',
       'T', 'IH0', 'F', 'AY2', 'D', ' ', 'DH', 'AE1', 'T', ' ', 'DH',
       'IH1', 'S', ' ', 'AH0', 'P', 'IH1', 'R', 'D', ' ', 'T', 'UW1', ' ',
       'B', 'IY1', ' ', 'DH', 'AH0', ' ', 'R', 'IH0', 'V', 'AA1', 'L',
       'V', 'ER0', ' ', 'S', 'IY1', 'N', ' ', 'IH0', 'N', ' ', 'AO1', 'Z',
       'W', 'AO0', 'L', 'D', 'Z', ' ', 'B', 'EH1', 'L', 'T'], dtype='<U3')

In [25]:
activations['conv_0'].shape

(67, 512)

In [15]:
source = phonesymbols
target = articulatory_features
activations

array({'conv_0': array([[0.        , 0.        , 0.        , ..., 0.24294563, 0.305356  ,
        0.        ],
       [0.20261472, 0.        , 0.        , ..., 0.        , 0.        ,
        0.19406359],
       [0.        , 0.        , 0.36955297, ..., 0.        , 0.        ,
        0.18491985],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.07583992],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32), 'conv_1': array([[0.        , 0.        , 0.02536896, ..., 0.        , 0.13122627,
        0.5711919 ],
       [0.        , 0.        , 0.3087739 , ..., 0.        , 0.25864828,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.02876387,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.   

In [14]:
phonesymbols

array(['M', 'ER0', 'IY1', 'N', 'AH0', ' ', 'AO1', 'Z', 'W', 'AO0', 'L',
       'D', ' ', 'G', 'EY1', 'V', ' ', 'AE1', 'Z', ' ', 'HH', 'ER1', ' ',
       'AH0', 'P', 'IH1', 'N', 'Y', 'AH0', 'N', ' ', 'DH', 'AE1', 'T',
       ' ', 'DH', 'AH0', ' ', 'M', 'EY1', 'L', 'ER0', 'D', 'ER0', ' ',
       'K', 'UW1', 'P', 'AO2', 'N', ' ', 'W', 'AA1', 'Z', ' ', 'IH0', 'N',
       ' ', 'AO1', 'Z', 'W', 'AO0', 'L', 'D', 'Z', ' ', 'HH', 'AE1', 'N',
       'D', 'R', 'AY2', 'T', 'IH0', 'NG', '.'], dtype='<U3')