In [1]:
import torch
import torch.nn as nn

In [2]:
import os
# Point the Hugging Face datasets cache at a local directory. This runs before
# `datasets` is imported in a later cell. `setdefault` keeps a value that was
# already exported in the environment, so the notebook does not clobber a
# machine-specific cache location; "F:/cache" is only the fallback.
os.environ.setdefault("HF_DATASETS_CACHE", "F:/cache")


In [1]:
from datasets import load_dataset
# Fetch the TIMIT subsample from the Hugging Face Hub (served from the local
# cache on later runs); the result is indexed by split name below.
ds = load_dataset(path="m-aliabbas/idrak_timit_subsample1")


  from .autonotebook import tqdm as notebook_tqdm
Downloading metadata: 100%|██████████| 1.15k/1.15k [00:00<00:00, 4.86kB/s]
Downloading data: 100%|██████████| 29.5M/29.5M [00:02<00:00, 10.2MB/s]
Downloading data: 100%|██████████| 1.88M/1.88M [00:00<00:00, 4.48MB/s]
Generating train split: 100%|██████████| 1296/1296 [00:00<00:00, 7397.14 examples/s]
Generating test split: 100%|██████████| 324/324 [00:00<00:00, 9919.52 examples/s]


In [15]:
# Work with the training split only from here on.
data = ds["train"]

In [32]:
# Display the first row of the training split to inspect its fields.
data[0]

{'audio': {'path': None,
  'array': array([2.58134114e-05, 1.75004868e-06, 1.89941275e-05, ...,
         2.67780015e-05, 9.33140836e-05, 9.90217522e-05]),
  'sampling_rate': 22050},
 'transcription': 'don t ask me to carry an oily rag like that'}

In [24]:
from IPython.display import Audio
# Index the row first, then the column: `data['audio'][0]` would read the whole
# audio column just to pick one clip. The playback rate comes from the sample
# itself instead of a hard-coded 22050.
sample_audio = data[0]["audio"]
Audio(sample_audio["array"], rate=sample_audio["sampling_rate"])

In [27]:
# Row-first indexing reads a single clip instead of the whole audio column.
data[0]["audio"]["array"].shape

(49392,)

In [26]:
# Transcription text of the first training row.
data[0]["transcription"]

'don t ask me to carry an oily rag like that'

In [33]:
from torch.utils.data import Dataset
import torch

class TTSDataset(Dataset):
    """Map-style dataset pairing tokenized transcriptions with fixed-length waveforms.

    Each row of ``data`` must provide ``row["transcription"]`` (text) and
    ``row["audio"]["array"]`` (waveform samples). Text is padded/truncated to
    ``max_text_length`` by ``text_tokenizer``; audio is truncated or zero-padded
    on the right to exactly ``max_audio_samples`` along its first dimension, so
    every item has the same shape.

    Args:
        data: Sized, indexable collection of rows (``len(data)``, ``data[idx]``).
        text_tokenizer: Callable invoked as ``text_tokenizer(text,
            padding="max_length", truncation=True, max_length=...,
            return_tensors="pt")``. It must return a mapping holding
            ``"input_ids"`` and ``"attention_mask"`` tensors with a leading
            dimension of size 1.
        max_audio_samples: Number of samples in every returned waveform.
        max_text_length: ``max_length`` passed to the tokenizer.
    """

    def __init__(self, data, text_tokenizer, max_audio_samples=44000, max_text_length=100):
        super().__init__()
        self.data = data
        self.tokenizer = text_tokenizer
        self.max_audio_samples = max_audio_samples
        self.max_text_length = max_text_length

    def __len__(self):
        """Return the number of rows in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized text and fixed-length audio for row ``idx``.

        Returns:
            dict with
            ``"input_ids"``: token ids with the leading size-1 dimension removed,
            ``"attention_mask"``: the tokenizer's mask for those ids (needed to
            tell real tokens from the ``max_length`` padding),
            ``"audio"``: float32 tensor of ``max_audio_samples`` samples.
        """
        row = self.data[idx]

        encoded = self.tokenizer(
            row["transcription"],
            padding="max_length",
            truncation=True,
            max_length=self.max_text_length,
            return_tensors="pt",
        )

        # Converts to float32; the array's dimensionality is NOT changed here.
        # NOTE(review): assumes mono (1D) audio -- confirm for other datasets.
        waveform = torch.tensor(row["audio"]["array"], dtype=torch.float32)

        # Clip to the target length, then zero-pad on the right if still short.
        waveform = waveform[: self.max_audio_samples]
        missing = self.max_audio_samples - waveform.shape[0]
        if missing > 0:
            waveform = torch.nn.functional.pad(waveform, (0, missing))

        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            # Previously computed and then discarded.
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "audio": waveform,
        }


In [None]:
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dataset = TTSDataset(data, tokenizer)
loader = DataLoader(dataset, batch_size=2)

# TTSDataset pads/truncates every item to the same fixed sizes, so a single
# batch is enough to check the collated shapes. Looping over the whole loader
# would decode every clip and print the same two lines for each batch.
batch = next(iter(loader))
print(batch["input_ids"].shape)
print(batch["audio"].shape)

In [None]:
from Audiotokenizer import AudioTokenizer
from transformer import Transformer

# Hyperparameters for the audio tokenizer, collected in one plain dict.
audio_tokenizer_config = dict(
    strides=[2, 4, 4],
    input_channels=1,
    hidden_channels=64,
    latent_channels=128,
    kernel=3,
    num_codebook=2,
    codebook_size=1024,
    codebook_dim=128,
)


# Attention geometry is declared once so that head_dim is derived from it
# rather than re-typed as a separate literal that could drift out of sync.
_hidden_dims = 256
_num_heads_q = 8
# The integer division below would silently truncate if this did not hold.
assert _hidden_dims % _num_heads_q == 0, "hidden_dims must be divisible by num_heads_q"

# Hyperparameters for the transformer, nested under the "model" key.
transformer_config = {
    "model": {
        "num_blocks": 2,
        "hidden_dims": _hidden_dims,
        "num_heads_q": _num_heads_q,
        "num_heads_kv": 4,
        "seq_length": 4096,
        "ffn_multiplier": 4,
        # NOTE(review): confirm this matches the vocabulary the model actually
        # consumes (the text tokenizer loaded earlier may use a different size).
        "vocab_size": 52000,
        "eps": 1e-5,
        "head_dim": _hidden_dims // _num_heads_q,
    }
}

