In [None]:
!pip3 install snac
!pip install soundfile
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install tqdm

In [1]:
import torch, torchaudio
from speech_tokenizer import SpeechTokenizer
import numpy as np
from tqdm import tqdm
import itertools
from pathlib import Path
import os

In [2]:
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
print(f"using device: {device}")

def batch_list(lst, batch_size):
    it = iter(lst)
    return iter(lambda: list(itertools.islice(it, batch_size)), [])

Path('./data').mkdir(parents=True, exist_ok=True)

using device: mps


In [3]:
tokenizer = SpeechTokenizer(device=device)

In [4]:
seconds_per_batch = 3
batch_size = 2
print("batch size:", batch_size)

for audio_path in sorted(os.listdir('./adventures_sherlock_holmes_rg_librivox')):
    print("processing: ", audio_path)
    waves = []
    waveform, sample_rate = torchaudio.load(f'./adventures_sherlock_holmes_rg_librivox/{audio_path}', backend='soundfile')

    # Resample to 24kHz if necessary
    if sample_rate != tokenizer.sample_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=tokenizer.sample_rate)
        waveform = resampler(waveform)

    # Convert to mono by averaging the channels if the audio is stereo
    if waveform.size(0) > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    i = 0
    while 10*(i+1)*tokenizer.sample_rate < waveform.shape[-1]:
        waves.append(waveform[:, tokenizer.sample_rate*seconds_per_batch*i : tokenizer.sample_rate*seconds_per_batch*(i+1)])
        i+=1
    waves.append(waveform[:, tokenizer.sample_rate*seconds_per_batch*i : ])
    
    batches = list(batch_list(waves, batch_size))
    # batches = batch_list(waves, batch_size)

    single_doc = []
    for batch in tqdm(batches[:-1]):
        encoded_batch = tokenizer.encode(batch)
        for x in encoded_batch:
            single_doc.extend(x[:-1])

    if audio_path.split('_')[1] == '01':
        split = 'val'
    else:
        split = 'train'
    np.save(f"./data/sherlock_{split}_{audio_path.split('_')[1]}", single_doc)

batch size: 2
processing:  adventuresholmes_01_doyle_64kb.mp3


100%|██████████| 83/83 [00:16<00:00,  5.08it/s]


processing:  adventuresholmes_02_doyle_64kb.mp3


100%|██████████| 117/117 [00:22<00:00,  5.27it/s]


processing:  adventuresholmes_03_doyle_64kb.mp3


100%|██████████| 109/109 [00:20<00:00,  5.26it/s]


processing:  adventuresholmes_04_doyle_64kb.mp3


100%|██████████| 96/96 [00:18<00:00,  5.14it/s]


processing:  adventuresholmes_05_doyle_64kb.mp3


100%|██████████| 79/79 [00:15<00:00,  5.26it/s]


processing:  adventuresholmes_06_doyle_64kb.mp3


100%|██████████| 84/84 [00:15<00:00,  5.27it/s]


processing:  adventuresholmes_07_doyle_64kb.mp3


100%|██████████| 121/121 [00:22<00:00,  5.30it/s]


processing:  adventuresholmes_08_doyle_64kb.mp3


100%|██████████| 99/99 [00:16<00:00,  5.97it/s]


processing:  adventuresholmes_09_doyle_64kb.mp3


100%|██████████| 93/93 [00:17<00:00,  5.31it/s]


processing:  adventuresholmes_10_doyle_64kb.mp3


100%|██████████| 73/73 [00:13<00:00,  5.28it/s]


processing:  adventuresholmes_11_doyle_64kb.mp3


100%|██████████| 119/119 [00:22<00:00,  5.32it/s]


processing:  adventuresholmes_12_doyle_64kb.mp3


100%|██████████| 98/98 [00:18<00:00,  5.27it/s]


processing:  adventuresholmes_13_doyle_64kb.mp3


100%|██████████| 88/88 [00:16<00:00,  5.29it/s]


processing:  adventuresholmes_14_doyle_64kb.mp3


100%|██████████| 103/103 [00:19<00:00,  5.33it/s]


processing:  adventuresholmes_15_doyle_64kb.mp3


100%|██████████| 108/108 [00:20<00:00,  5.35it/s]


processing:  adventuresholmes_16_doyle_64kb.mp3


100%|██████████| 120/120 [00:22<00:00,  5.30it/s]


processing:  adventuresholmes_17_doyle_64kb.mp3


100%|██████████| 93/93 [00:17<00:00,  5.32it/s]


processing:  adventuresholmes_18_doyle_64kb.mp3


100%|██████████| 88/88 [00:16<00:00,  5.32it/s]


processing:  adventuresholmes_19_doyle_64kb.mp3


100%|██████████| 105/105 [00:19<00:00,  5.30it/s]


processing:  adventuresholmes_20_doyle_64kb.mp3


100%|██████████| 92/92 [00:17<00:00,  5.31it/s]


processing:  adventuresholmes_21_doyle_64kb.mp3


100%|██████████| 104/104 [00:19<00:00,  5.31it/s]


processing:  adventuresholmes_22_doyle_64kb.mp3


100%|██████████| 120/120 [00:22<00:00,  5.34it/s]


processing:  adventuresholmes_23_doyle_64kb.mp3


100%|██████████| 96/96 [00:18<00:00,  5.29it/s]


processing:  adventuresholmes_24_doyle_64kb.mp3


100%|██████████| 129/129 [00:21<00:00,  5.95it/s]


In [8]:
# 1.3M tokens for roughly 12hrs
total_count = 0
for x in os.listdir('./data'):
    data = np.load(f'./data/{x}').shape[0]
    print(f'./data/{x}', data, 'tokens')
    total_count += data
print(total_count)

./data/sherlock_train_02.npy 67392 tokens
./data/sherlock_train_16.npy 69120 tokens
./data/sherlock_train_17.npy 53568 tokens
./data/sherlock_train_03.npy 62784 tokens
./data/sherlock_train_15.npy 62208 tokens
./data/sherlock_train_14.npy 59328 tokens
./data/sherlock_train_10.npy 42048 tokens
./data/sherlock_train_04.npy 55296 tokens
./data/sherlock_train_05.npy 45504 tokens
./data/sherlock_train_11.npy 68544 tokens
./data/sherlock_train_07.npy 69696 tokens
./data/sherlock_train_13.npy 50688 tokens
./data/sherlock_train_12.npy 56448 tokens
./data/sherlock_train_06.npy 48384 tokens
./data/sherlock_val_01.npy 47808 tokens
./data/sherlock_train_23.npy 55296 tokens
./data/sherlock_train_22.npy 69120 tokens
./data/sherlock_train_20.npy 52992 tokens
./data/sherlock_train_08.npy 57024 tokens
./data/sherlock_train_09.npy 53568 tokens
./data/sherlock_train_21.npy 59904 tokens
./data/sherlock_train_19.npy 60480 tokens
./data/sherlock_train_24.npy 74304 tokens
./data/sherlock_train_18.npy 50688 t

In [18]:
print(np.load(f'./data/sherlock_val_01.npy')[:128].tolist())

[9999, 547, 426, 2825, 1441, 2209, 1300, 161, 9999, 1646, 3418, 1667, 874, 2156, 1337, 883, 9999, 717, 4081, 1667, 2030, 2110, 429, 953, 9999, 717, 1029, 429, 4009, 690, 2486, 3909, 9999, 3718, 1404, 1667, 3203, 1604, 1441, 883, 9999, 3919, 2209, 3830, 3741, 2918, 2202, 2175, 9999, 456, 318, 2544, 2024, 1874, 3246, 1747, 9999, 2962, 3069, 940, 685, 511, 3978, 484, 9999, 4056, 2450, 1926, 105, 782, 518, 2308, 9999, 1212, 863, 3777, 731, 3343, 3764, 1216, 9999, 1792, 124, 2744, 251, 3822, 1900, 125, 9999, 2439, 455, 3684, 2570, 915, 158, 1234, 9999, 3278, 2085, 2068, 2531, 2842, 2388, 3666, 9999, 1482, 1635, 801, 2920, 1978, 2866, 2934, 9999, 3563, 1713, 1069, 3393, 3740, 3070, 583, 9999, 2622, 500, 2265, 2101, 1723, 1286, 3685]
