In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [3]:
import itertools

import datasets
import transformers
import torch

from src.utils.timit import load_or_prepare_timit_corpus

In [4]:
model_name = "facebook/wav2vec2-base"

equivalence_classer = "phoneme"
num_frames_per_phoneme = 1

layer = 6
output_dim = 32

equiv_dataset_path = f"data/timit_equiv_{equivalence_classer}_{layer}_{num_frames_per_phoneme}.pkl"
output_dir = f"out/ce_model_{equivalence_classer}_{layer}_{output_dim}"

In [5]:
tokenizer = transformers.Wav2Vec2Tokenizer.from_pretrained("charsiu/tokenizer_en_cmu")
feature_extractor = transformers.Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)
processor = transformers.Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.


In [6]:
dataset = load_or_prepare_timit_corpus("data/timit_syllables", "data/timit_raw",
                                       processor)

def add_indices(item, idx):
    item["idx"] = idx
    return item
dataset = dataset.map(add_indices, batched=True, batch_size=2000, with_indices=True)

Map:   0%|          | 0/4620 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

Map:   0%|          | 0/4620 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

Map:   0%|          | 0/4620 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

Map:   0%|          | 0/4620 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

Map:   0%|          | 0/4620 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

Map:   0%|          | 0/4620 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/4620 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/1680 [00:00<?, ? examples/s]

Map:   0%|          | 0/4620 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

In [7]:
dataset["train"][1561]["word_detail"]

{'start': [3898, 6040, 9880, 12280, 12280, 15894, 18046, 21933, 25860, 28702],
 'stop': [6040, 9880, 12280, 12280, 17343, 18046, 21933, 25860, 28702, 34500],
 'utterance': ["don't",
  'ask',
  'me',
  'to',
  'carry',
  'an',
  'oily',
  'rag',
  'like',
  'that']}

In [8]:
dataset["train"][1561]["word_syllable_detail"]

[[{'idx': 0,
   'phoneme_end_idx': 4,
   'phoneme_start_idx': 0,
   'phones': ['D', 'OW', 'N', 'T'],
   'start': 3898,
   'stop': 6040,
   'stress': None}],
 [{'idx': 0,
   'phoneme_end_idx': 3,
   'phoneme_start_idx': 0,
   'phones': ['AE', 'S', 'K'],
   'start': 6040,
   'stop': 9880,
   'stress': None}],
 [{'idx': 0,
   'phoneme_end_idx': 2,
   'phoneme_start_idx': 0,
   'phones': ['M', 'IH'],
   'start': 9880,
   'stop': 12280,
   'stress': None}],
 [],
 [{'idx': 0,
   'phoneme_end_idx': 2,
   'phoneme_start_idx': 0,
   'phones': ['K', 'EH'],
   'start': 12280,
   'stop': 14907,
   'stress': None},
  {'idx': 1,
   'phoneme_end_idx': 4,
   'phoneme_start_idx': 2,
   'phones': ['R', 'IH'],
   'start': 14907,
   'stop': 17343,
   'stress': None}],
 [{'idx': 0,
   'phoneme_end_idx': 2,
   'phoneme_start_idx': 0,
   'phones': ['IH', 'N'],
   'start': 15894,
   'stop': 18046,
   'stress': None}],
 [{'idx': 0,
   'phoneme_end_idx': 1,
   'phoneme_start_idx': 0,
   'phones': ['OY'],
   'st

In [9]:
dataset["train"][1561]["word_phonemic_detail"]

[[{'idx_in_syllable': 0,
   'phone': 'D',
   'start': 3898,
   'stop': 4410,
   'stress': None,
   'syllable_idx': 0,
   'syllable_phones': ['D', 'OW', 'N', 'T'],
   'syllable_start': 3898,
   'syllable_stop': 6040},
  {'idx_in_syllable': 1,
   'phone': 'OW',
   'start': 4410,
   'stop': 5560,
   'stress': None,
   'syllable_idx': 0,
   'syllable_phones': ['D', 'OW', 'N', 'T'],
   'syllable_start': 3898,
   'syllable_stop': 6040},
  {'idx_in_syllable': 2,
   'phone': 'N',
   'start': 5560,
   'stop': 6040,
   'stress': None,
   'syllable_idx': 0,
   'syllable_phones': ['D', 'OW', 'N', 'T'],
   'syllable_start': 3898,
   'syllable_stop': 6040},
  {'idx_in_syllable': 3,
   'phone': 'T',
   'start': 5560,
   'stop': 6040,
   'stress': None,
   'syllable_idx': 0,
   'syllable_phones': ['D', 'OW', 'N', 'T'],
   'syllable_start': 3898,
   'syllable_stop': 6040}],
 [{'idx_in_syllable': 0,
   'phone': 'AE',
   'start': 6040,
   'stop': 8146,
   'stress': None,
   'syllable_idx': 0,
   'syllabl