In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# (e.g. the project's `src` package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Set CUDA_VISIBLE_DEVICES to the empty string before the heavier imports in the next cell run.
# NOTE(review): presumably intended to hide all GPUs so the analysis runs on CPU -- confirm.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [42]:

from dataclasses import replace
import itertools
from pathlib import Path
import pickle

import datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial import distance
import seaborn as sns
from sklearn.decomposition import PCA
from tqdm.auto import tqdm

from src.analysis.state_space import prepare_state_trajectory, StateSpaceAnalysisSpec
from src.datasets.speech_equivalence import SpeechEquivalenceDataset
from src.models import get_best_checkpoint
from src.models.integrator import ContrastiveEmbeddingModel, compute_embeddings, load_or_compute_embeddings

In [13]:
# Notebook configuration: every path / option the cells below read.

# Directory passed to `get_best_checkpoint` and to the embedding loader.
model_dir = "outputs/models/w2v2_8/syllable"
# Pickled SpeechEquivalenceDataset to analyse (the commented line is an alternative dataset).
equiv_dataset_path = "data/timit_equiv_phoneme_within_word_prefix_1.pkl"
# equiv_dataset_path = "data/timit_equiv_phoneme_6_1.pkl"
# NOTE(review): `output_dir` is not referenced by any cell visible in this notebook section.
output_dir = "."

# Pickled StateSpaceAnalysisSpec describing the per-syllable frame spans.
state_space_spec_path = "out/state_space_specs/all_syllables.pkl"

# NOTE(review): `metric` is not referenced in the visible cells; the cosine analysis below
# calls `distance.cosine` directly -- confirm whether this setting is meant to drive it.
metric = "cosine"

In [7]:
# Restore the embedding model from the checkpoint that `get_best_checkpoint` selects
# inside `model_dir`.
model = ContrastiveEmbeddingModel.from_pretrained(get_best_checkpoint(model_dir))
# `eval()` is the last expression of the cell, so its return value (the module summary
# shown in the recorded output) is displayed.
# NOTE(review): presumably this also switches the model to inference mode -- confirm.
model.eval()

  return self.fget.__get__(instance, owner)()


ContrastiveEmbeddingModel(
  (rnn): RNNModel(
    (rnn): LSTM(768, 32, batch_first=True)
    (fc): Linear(in_features=32, out_features=8, bias=True)
  )
)

In [14]:
# Deserialize the speech equivalence dataset from disk.
# NOTE: unpickling executes arbitrary code; only load files from a trusted source.
with open(equiv_dataset_path, mode="rb") as dataset_file:
    equiv_dataset: SpeechEquivalenceDataset = pickle.load(dataset_file)

In [20]:
# Deserialize the state-space analysis spec.
# NOTE: unpickling executes arbitrary code; only load files from a trusted source.
with open(state_space_spec_path, mode="rb") as spec_file:
    state_space_spec: StateSpaceAnalysisSpec = pickle.load(spec_file)

# DEV: retain just the samples available in this subset.
# A span is kept only when its end index is strictly below the number of frames
# in the loaded hidden-state dataset.
num_frames = equiv_dataset.hidden_state_dataset.num_frames
spans = []
for label_spans in state_space_spec.target_frame_spans:
    spans.append([(start, end) for start, end in label_spans if end < num_frames])

# Labels whose span list became empty are dropped entirely.
retain_idxs = [idx for idx, label_spans in enumerate(spans) if label_spans]
state_space_spec = replace(
    state_space_spec,
    target_frame_spans=[spans[idx] for idx in retain_idxs],
    labels=[state_space_spec.labels[idx] for idx in retain_idxs],
    total_num_frames=num_frames,
)

# Sanity check: the trimmed spec must agree with the dataset it will index into.
assert state_space_spec.is_compatible_with(equiv_dataset)

In [23]:
# Obtain the model's embeddings for the equivalence dataset.
# NOTE(review): judging by the helper's name and the cache path printed in the recorded
# output, results are presumably cached on disk keyed by `model_dir` and
# `equiv_dataset_path` -- confirm against `src.models.integrator`.
model_representations = load_or_compute_embeddings(model, equiv_dataset, model_dir, equiv_dataset_path)

out/embedding_cache/outputs-models-w2v2_8-syllable-data-timit_equiv_phoneme_within_word_prefix_1.pkl.npy


In [24]:
# Drop rare syllables: keep only labels that have at least `retain_n` frame spans.
retain_n = 5
instance_counts = [len(target_frames) for target_frames in state_space_spec.target_frame_spans]
retain_idxs = [idx for idx, count in enumerate(instance_counts) if count >= retain_n]

# Labels and spans are filtered with the same indices so they stay aligned.
state_space_spec = replace(
    state_space_spec,
    target_frame_spans=[state_space_spec.target_frame_spans[idx] for idx in retain_idxs],
    labels=[state_space_spec.labels[idx] for idx in retain_idxs],
)

In [25]:
# Gather one trajectory array per label; positions beyond an instance's span are
# filled with NaN (`pad=np.nan`).
trajectory = prepare_state_trajectory(model_representations, state_space_spec, pad=np.nan)

# Length of each instance = number of frames whose first feature is not NaN.
# Counting is used instead of `np.isnan(...).argmax(axis=1)` because argmax returns 0
# for a row that contains no NaN at all (an instance that fills the whole padded axis),
# which would report that instance's length as 0.
# Assumes NaN occurs only as trailing padding -- TODO confirm against
# `prepare_state_trajectory`.
lengths = [(~np.isnan(traj_i[:, :, 0])).sum(axis=1) for traj_i in trajectory]

In [26]:
# For every label, pick the representation at index `length - 1` of each instance,
# i.e. the last frame before the padding starts.
final_frames = []
for traj_i, length_i in zip(trajectory, lengths):
    instance_idxs = np.arange(len(traj_i))
    final_frames.append(traj_i[instance_idxs, length_i - 1])

## Compute syllable edit distance

In [27]:
# The spec's labels, in the same order as `trajectory` / `final_frames`; they are
# compared pairwise with `edit_distance` below.
# NOTE(review): presumably each label is a sequence of phonemes -- confirm.
syllables = state_space_spec.labels

In [28]:
def edit_distance(s1, s2):
    """Return the unit-cost edit (Levenshtein) distance between two sequences.

    Uses the Wagner-Fischer dynamic programme, keeping only the previous and
    the current row of the table.

    Args:
        s1: Source sequence (anything supporting ``len`` and iteration, e.g. a
            string or a tuple of phonemes).
        s2: Target sequence of the same kind.

    Returns:
        int: Minimum number of single-element insertions, deletions and
        substitutions needed to turn ``s1`` into ``s2``.
    """
    # Row for the empty prefix of ``s1``: reaching ``s2[:col]`` takes ``col`` insertions.
    previous_row = list(range(len(s2) + 1))

    for row, item1 in enumerate(s1, start=1):
        # Turning ``s1[:row]`` into the empty sequence takes ``row`` deletions.
        current_row = [row]
        for col, item2 in enumerate(s2, start=1):
            substitution_cost = 0 if item1 == item2 else 1
            current_row.append(min(
                previous_row[col] + 1,                      # delete item1
                current_row[col - 1] + 1,                   # insert item2
                previous_row[col - 1] + substitution_cost,  # substitute / match
            ))
        previous_row = current_row

    return previous_row[-1]

In [29]:
# Dense matrix of edit distances between every ordered pair of syllable labels.
num_syllables = len(syllables)
distances = np.zeros((num_syllables, num_syllables))
# Reverse lookup from label to its row/column index.
syllable2idx = {syllable: idx for idx, syllable in enumerate(syllables)}
for (i, s1), (j, s2) in itertools.product(enumerate(syllables), repeat=2):
    distances[i, j] = edit_distance(s1, s2)

In [31]:
# Sanity check: the matrix is square, with one row and column per retained syllable.
distances.shape

(473, 473)

## Prepare regression analysis

In [32]:
# Build the regression dataset: each sample is the concatenation of the final-frame
# representations of two syllable instances, and the target is the edit distance
# between the two syllable labels.
max_num_pairs, max_num_samples = 1000, 100

# Seeded generator so the random subsample (and every score computed from it) is
# reproducible under Restart & Run All.
rng = np.random.default_rng(0)

Xs, y = [], []

# Random subset of unordered syllable pairs.
syllable_pairs = list(itertools.combinations(range(len(syllables)), 2))
rng.shuffle(syllable_pairs)
syllable_pairs = syllable_pairs[:max_num_pairs]

for s1, s2 in tqdm(syllable_pairs):
    # Random subset of (instance of s1, instance of s2) combinations.
    frame_pairs = list(itertools.product(range(len(final_frames[s1])),
                                         range(len(final_frames[s2]))))
    rng.shuffle(frame_pairs)
    frame_pairs = frame_pairs[:max_num_samples]

    for f1, f2 in frame_pairs:
        Xs.append(np.concatenate([final_frames[s1][f1], final_frames[s2][f2]]))
        y.append(distances[s1, s2])

X = np.stack(Xs)
y = np.array(y).astype(float)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [47]:
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.model_selection import cross_val_score, KFold

In [39]:
# Cross-validated R^2 of a small MLP (one hidden layer of 10 units) predicting the
# syllable edit distance from the concatenated frame pair.
# `random_state` is fixed on both the estimator and the shuffled splitter so the fold
# assignment and weight initialisation -- and hence the scores -- are reproducible.
# NOTE(review): this rebinds `model`, which earlier held the embedding model; re-running
# the embedding cell after this one needs the model-loading cell to be re-run first.
model = MLPRegressor(hidden_layer_sizes=(10,), max_iter=1000, random_state=0)
scores = cross_val_score(model, X, y, cv=KFold(5, shuffle=True, random_state=0), scoring="r2")

In [40]:
# Per-fold R^2 of the MLP regressor across the 5 shuffled folds.
scores

array([0.05810032, 0.06242847, 0.06203571, 0.07060511, 0.06393827])

## Reduced regression analysis: cosine distance as metric

In [50]:
# Reduced dataset: the single predictor is the cosine distance between the final-frame
# representations of two syllable instances; the target is the edit distance between
# the two syllable labels.
max_num_pairs, max_num_samples = 1000, 100

# Seeded generator so the random subsample (and every score computed from it) is
# reproducible under Restart & Run All.
rng = np.random.default_rng(0)

Xs, y = [], []

# Random subset of unordered syllable pairs.
syllable_pairs = list(itertools.combinations(range(len(syllables)), 2))
rng.shuffle(syllable_pairs)
syllable_pairs = syllable_pairs[:max_num_pairs]

for s1, s2 in tqdm(syllable_pairs):
    # Random subset of (instance of s1, instance of s2) combinations.
    frame_pairs = list(itertools.product(range(len(final_frames[s1])),
                                         range(len(final_frames[s2]))))
    rng.shuffle(frame_pairs)
    frame_pairs = frame_pairs[:max_num_samples]

    for f1, f2 in frame_pairs:
        Xs.append(distance.cosine(final_frames[s1][f1], final_frames[s2][f2]))
        y.append(distances[s1, s2])

X = np.stack(Xs)
y = np.array(y).astype(float)

# Standardise the predictor to zero mean and unit variance (statistics taken over the
# whole sample, before any cross-validation split).
X -= X.mean()
X /= X.std()

  0%|          | 0/1000 [00:00<?, ?it/s]

In [51]:
# Cross-validated R^2 of a ridge regression from cosine distance to edit distance.
# The inner splitter (RidgeCV's own model selection) and the outer splitter (the
# reported scores) are both shuffled, so both get a fixed `random_state` to make the
# scores reproducible.
# `X[:, None]` turns the 1-D predictor into the 2-D (n_samples, 1) layout sklearn expects.
model = RidgeCV(cv=KFold(5, shuffle=True, random_state=0))
scores = cross_val_score(model, X[:, None], y, cv=KFold(5, shuffle=True, random_state=0), scoring="r2")

In [52]:
# Per-fold R^2 of the ridge regression on the cosine-distance predictor (5 shuffled folds).
scores

array([ 0.00044382, -0.00055969,  0.00010893,  0.00017792,  0.00025267])