In [169]:
import h5py
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeCV
from tqdm.auto import tqdm, trange

In [170]:
hubert_encoding_path = "timit_hubert_encodings.h5"
hubert_agg_fn = "mean"

word_encoding_path = "word_encodings/nce.h5"
out_path = "results/nce.csv"

In [171]:
# Load hubert encodings and transform into same format as word encodings
hubert_encodings, hubert_encoding_ids = [], []
with h5py.File(hubert_encoding_path, "r") as f:
    for dialect in f.keys():
        for speaker in tqdm(f[dialect].keys()):
            for sentence in f[dialect][speaker].keys():
                sentence_encodings = f[dialect][speaker][sentence]["representations"]["word"][hubert_agg_fn]
                for token, encodings in enumerate(sentence_encodings):
                    hubert_encodings.append(encodings)
                    hubert_encoding_ids.append((speaker, sentence, str(token)))
    

  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/77 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

In [172]:
hubert_encodings = np.array(hubert_encodings)
hubert_encoding_ids = np.array(hubert_encoding_ids).astype("S")

In [173]:
with h5py.File("word_encodings/autoencoder.h5", "r") as f:
    word_encodings = f["encodings"][()]
    word_encoding_ids = f["ids"][()]

In [174]:
keep_ids = set([tuple(id_tup) for id_tup in hubert_encoding_ids]) \
    & set([tuple(id_tup) for id_tup in word_encoding_ids])
len(keep_ids)

33887

In [175]:
hubert_encoding_mask = np.array([tuple(id_tup) in keep_ids for id_tup in hubert_encoding_ids])
hubert_encodings = hubert_encodings[hubert_encoding_mask]
hubert_encoding_ids = hubert_encoding_ids[hubert_encoding_mask]

word_encoding_mask = np.array([tuple(id_tup) in keep_ids for id_tup in word_encoding_ids])
word_encodings = word_encodings[word_encoding_mask]
word_encoding_ids = word_encoding_ids[word_encoding_mask]

assert hubert_encodings.shape[0] == word_encodings.shape[0]

## Learn single-layer encoder

In [176]:
# TODO why are there NaN values?
np.isnan(hubert_encodings[:, 1, :]).mean()

0.00014754920766075485

In [177]:
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
pca_dim = 32

In [178]:
clfs = []
alphas = np.logspace(3, 11, 9)
for layer in trange(hubert_encodings.shape[1]):
    X = word_encodings
    X -= X.mean(axis=0)
    X /= X.std(axis=0)

    Y = hubert_encodings[:, layer, :]
    Y[np.isnan(Y)] = 0
    Y -= Y.mean(axis=0)
    Y /= Y.std(axis=0)
    
    Y_pca = PCA(pca_dim).fit_transform(Y)
    X_pca = PCA(pca_dim).fit_transform(X)

    clfs.append(RidgeCV(alphas=alphas, cv=KFold(4, shuffle=True),
                        fit_intercept=False,
                        scoring="neg_mean_squared_error").fit(X_pca, Y_pca))

  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
results = pd.DataFrame([clf.best_score_ for clf in clfs], columns=["mse"])
results.index.name = "layer"
results["mse"] = -results["mse"]
results

Unnamed: 0_level_0,mse
layer,Unnamed: 1_level_1
0,22.048445
1,16.185552
2,13.583222
3,13.89739
4,13.760206
5,12.947767
6,13.376405
7,12.780232
8,14.350254
9,21.650338


In [None]:
results.to_csv(out_path)