In [None]:
from datasets.speaker_audio_dataset import SpeakerAudioDataset
from model.layers.lstmp import LSTMPCell

In [34]:


model_params = {
    'input_size': 80,
    'hidden_size': 257,
    'projection_size': 256,
    'embedding_size': 256,
    'num_layers': 3
}

In [3]:
dataset = SpeakerAudioDataset('../data/utterance_corpuses/LibriTTS/dev-clean', sample_rate, mel_params)
test_Y, test_X = dataset[0]

In [None]:
import os
import numpy as np
import librosa
from torch.utils.data import Dataset

class SpeakerAudioDataset(Dataset):
    def __init__(self, root_dir, sample_rate, mel_params):
        self.root_dir = root_dir
        self.sample_rate = sample_rate
        self.mel_params = mel_params
        self.utterances = []
        
        for root, dirs, files in os.walk(root_dir):
            for file in files:
                if file[-3:] == 'wav':
                    info = file.split('_')
                    if len(info) == 4:
                        self.utterances.append([
                            info[0], info[1], info[2]+'_'+info[3]
                        ])
 
        # audio
        # | speaker_id | chapter_id | utterance_id | frame_id | ... 80 | 
        
        # text
        # | speaker_id | chapter_id | utterance_id | char_id | char_embed |
        
    def __len__(self):
        return len(self.utterances)
        # give length of all samples
        
    def __getitem__(self, idx):
        utterance = self.utterances[idx]
        y, _ = librosa.load(f'{self.root_dir}/{utterance[0]}/{utterance[1]}/{"_".join(utterance)}')
        mel_spec = librosa.feature.melspectrogram(y, sr=self.sample_rate, **self.mel_params)
        return utterance[0], mel_spec.swapaxes(0, 1)

In [62]:
import torch
from torch import nn
from torch.nn import functional as func

class SpeakerVerificationLSTMEncoder(nn.Module):
    def __init__(self, 
                 input_size, 
                 hidden_size, 
                 projection_size, 
                 embedding_size,
                 num_layers
                ):
        super(SpeakerVerificationLSTMEncoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.projection_size = projection_size
        self.embedding_size = embedding_size
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(
            self.input_size, 
            self.hidden_size, 
            self.num_layers, 
            proj_size=self.projection_size,
            batch_first=True
        )
        
        self.linear = nn.Linear(
            in_features=self.projection_size, 
            out_features=self.embedding_size
        )
        
        self.relu = nn.ReLU()
        
    def forward(self, x):
        # (64, 636, 80)
        
        # lstm with projection
        _, (hx, cx) = self.lstm(x)
        
        # linear layer w/ relu
        x = self.relu(self.linear(hx[-1]))
        
        # l2 normalize
        x = func.normalize(x, p=2, dim=1)
        
        return x

In [61]:
# generate (xj, (xk1, ...xkM))
# if j = k (speakers), positive
# else negative
# generate pos/neg alternatively
# compute l2 norm response from lstm
# (ej, (ek1, ...ekM))
# compute centroid of (ek1, ...ekM), ckM

x = model(torch.randn(64, 636, 80))
# compute centroids of each row
x, x.shape

(tensor([[0.0318, 0.0099, 0.0000,  ..., 0.0507, 0.0000, 0.0421],
         [0.0324, 0.0092, 0.0000,  ..., 0.0505, 0.0000, 0.0423],
         [0.0307, 0.0107, 0.0000,  ..., 0.0530, 0.0000, 0.0432],
         ...,
         [0.0356, 0.0106, 0.0000,  ..., 0.0518, 0.0000, 0.0440],
         [0.0327, 0.0093, 0.0000,  ..., 0.0526, 0.0000, 0.0429],
         [0.0339, 0.0103, 0.0000,  ..., 0.0515, 0.0000, 0.0410]],
        grad_fn=<DivBackward0>),
 torch.Size([64, 256]))

In [3]:
from datasets import SpeakerAudioDataSet
from dataloaders import SpeakerAudioDataLoader
from models import SpeakerVerificationLSTMEncoder
from transforms import Mel_Spec, Clip_Shuffle
import transforms.transform_utils

sample_rate = 22050
mel_params = {
    'sample_rate': sample_rate,
    'n_fft': int(1024 * (sample_rate / 16000)),
    'hop_length': int(256 * (sample_rate / 16000)),
    'win_length': int(1024 * (sample_rate / 16000)),
    'n_mels': 80
}

train_params = {
    'N_speakers': 64,
    'M_utterances': 10,
    'sources': {
        'LibriTTS': {
            'version': 'dev-clean'
        }
    }
}

model_params = {
    'input_size': 80,
    'hidden_size': 257,
    'projection_size': 256,
    'embedding_size': 256,
    'num_layers': 3
}

batch_size = train_params['N_speakers'] / train_params['M_utterances']

ImportError: cannot import name 'SpeakerAudioDataSet' from 'datasets' (unknown location)

In [63]:
dataset = SpeakerAudioDataset(
        root='../data/utterance_corpuses',
        transform=[
            Mel_Spec(mel_params),
            Clip_Shuffle(clip_params),
            transform_utils.To_Tensor(),
        ]
)

dataloader = SpeakerAudioDataLoader(
    dataset, 
    train_params['N_speakers'], 
    train_params['M_utterances'], 
    train_params['sources'], 
    shuffle=True
)

# transforms ToTensor

epochs = 3
total_speakers, total_samples = len(dataset)
n_iterations = math.ceil(total_samples / batch_size)

model = SpeakerVerificationLSTMEncoder(**model_params)

for epoch in range(epochs):
      
    cks = torch.Tensor(256,                                                                                                                                 
    [
        torch.init_some_shit_here(embedding_dims)
        for speaker in total_speakers
    ]
                                                                                                                                       
    for i, (speaker, audio) in enumerate(dataloader):
        embed = model(audio)
                                                                                                                                       
        
        # get batch_size examples
        # forward/backwards + update
        # if i % 5 == 0 (ex):
        #     print/visualize/progress
        #     e.g. epoch, step, input_size, graph, convergence
        

SyntaxError: invalid syntax (1397870347.py, line 51)

In [10]:
import os

os.path.join('../data/utterance_corpuses', 'LibriTTS', 'dev-clean')

'../data/utterance_corpuses/LibriTTS/dev-clean'