In [16]:
import torch
import torch.nn as nn

from argparse import Namespace

    This is a simple implementation of the TDNN without subsampling described in the paper "A time delay neural network architecture for efficient modeling of long temporal contexts" by Peddinti et al. (2015), and used in "Time delay deep neural network-based universal background models for speaker recognition" by Snyder et al. (2015).

In [17]:
# Notebook-wide settings, gathered in one namespace object.
param = Namespace(
    T=23,         # the number of time frames
    dim_mfcc=40,  # the number of mfcc features
)

In [18]:
def norm_2(x):
    """Return the L2 norm of ``x`` taken along dimension 1.

    The elements are squared, summed over dimension 1 and square-rooted.
    Dimension 1 is kept in the result with size 1, so the output has the
    same number of dimensions as the input.

    Args:
        x: a tensor with at least two dimensions.

    Returns:
        A tensor shaped like ``x`` except that dimension 1 has size 1.
    """
    return x.pow(2).sum(dim=1, keepdim=True).sqrt()

In [19]:
class TDNN(nn.Module):
    """Stack of four single-channel 1-D convolutions, each followed by ``norm_2``.

    The convolutions use stride 1, no padding, and kernel sizes 5, 4, 7 and 10,
    so each one shortens the sequence by ``kernel_size - 1`` (22 frames in
    total). A 23-frame input therefore leaves a single frame at the output.

    Args:
        input_dim: accepted for interface compatibility; not used by the
            current implementation.
        output_dim: accepted for interface compatibility; not used by the
            current implementation.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Every layer maps 1 input channel to 1 output channel; only the
        # temporal context (kernel size) differs from layer to layer.
        self.layer1 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=5, stride=1)
        self.layer2 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=4, stride=1)
        self.layer3 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=7, stride=1)
        self.layer4 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=10, stride=1)

    def forward(self, x):
        """Run ``x`` through the four conv + ``norm_2`` stages and flatten.

        Args:
            x: tensor of shape (batch, 1, frames).

        Returns:
            A 2-D tensor whose second dimension is the product of the last
            two dimensions of the final stage's output.
        """
        for conv in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = norm_2(conv(x))
        # Merge the channel and time axes into one feature axis.
        flat_width = x.size(1) * x.size(2)
        return x.view(-1, flat_width)

In [20]:
# Smoke test: build the network and push one random input through it.
tdnn = TDNN(input_dim=param.dim_mfcc, output_dim=10)

# Random stand-in for MFCC features, shaped (dim_mfcc, 1, T) = (40, 1, 23).
# Each of the 40 coefficients sits on the batch axis as its own
# single-channel sequence of 23 time frames.
x = torch.randn(param.dim_mfcc, 1, param.T)

output = tdnn(x)

In [21]:
# Report the tensor shapes going into and coming out of the network.
print("input shape:", x.shape)
print("output shape:", output.shape)

input shape: torch.Size([40, 1, 23])
output shape: torch.Size([40, 1])


    The output of this TDNN is a vector featuring all the information in a temporal context window of 23 time steps.