In [1]:
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
import librosa as rs

import numpy as np
import matplotlib.pyplot as plt   
import IPython.display as ipd

import pdb

In [75]:
# Print the first 20 coefficients of the 400-sample Hamming window
# (torch default arguments) so they can be compared by eye with another
# implementation.
w = torch.hamming_window(400)
for coeff in w[:20]:
    print(coeff.numpy())

0.08000001
0.08005676
0.08022702
0.080510676
0.08090773
0.08141804
0.0820415
0.08277798
0.083627254
0.084589124
0.08566338
0.08684972
0.08814788
0.0895575
0.09107831
0.09270987
0.094451755
0.09630361
0.0982649
0.10033521


In [50]:
class PreEmphasis(torch.nn.Module):
    """First-order pre-emphasis filter: ``y[t] = x[t] - coef * x[t-1]``.

    The filter is applied with ``F.conv1d`` after one sample of reflect
    padding on the left, so the output has the same length as the input.
    Because of the reflect padding, the first output sample is
    ``x[0] - coef * x[1]``.

    Args:
        coef: pre-emphasis coefficient (default 0.97).
    """

    def __init__(self, coef: float = 0.97):
        super(PreEmphasis, self).__init__()
        self.coef = coef
        # PyTorch's conv1d computes cross-correlation (no kernel flip), so the
        # kernel is stored already flipped: [-coef, 1] pairs -coef with x[t-1]
        # and 1 with x[t]. Shape is (out_channels=1, in_channels=1, width=2).
        self.register_buffer(
            'flipped_filter',
            torch.tensor([-self.coef, 1.], dtype=torch.float32)
            .unsqueeze(0).unsqueeze(0)
        )

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        """Apply pre-emphasis along the last axis.

        Args:
            inputs: 2-D tensor; the filter runs along dim 1. The reflect
                padding of width 1 needs at least 2 samples along that axis.

        Returns:
            Tensor with the same shape as ``inputs``.

        Raises:
            AssertionError: if ``inputs`` is not 2-dimensional.
        """
        assert inputs.dim() == 2, 'The number of dimensions of inputs tensor must be 2!'
        # Insert a channel axis for conv1d: (N, T) -> (N, 1, T).
        inputs = inputs.unsqueeze(1)
        # Reflect-pad one sample on the left so that in/out lengths match.
        inputs = F.pad(inputs, (1, 0), 'reflect')
        return F.conv1d(inputs, self.flipped_filter).squeeze(1)


class Mel_Spectrogram(nn.Module):
    """Log-mel feature extractor with per-bin mean removal over time.

    Pipeline in ``forward``: pre-emphasis -> torchaudio ``MelSpectrogram``
    (Hamming window) -> ``log(x + 1e-6)`` -> subtract the mean taken over the
    last (time) axis.

    Args:
        sample_rate: sample rate passed to ``MelSpectrogram``.
        n_fft: FFT size.
        win_length: analysis window length in samples.
        hop_length: hop between frames in samples.
        n_mels: number of mel bins.
        coef: pre-emphasis coefficient passed to ``PreEmphasis``.
        f_min: lower edge of the mel filterbank (previously fixed at 20).
        f_max: upper edge of the mel filterbank (previously fixed at 7600).
        verbose: when True (default, matching the previous behaviour),
            ``forward`` prints the feature shape and the per-bin mean that is
            subtracted. Pass False to silence this diagnostic output.
        **kwargs: accepted and ignored.
    """

    def __init__(self, sample_rate=16000, n_fft=512, win_length=400, hop_length=160, n_mels=80, coef=0.97,
                 f_min=20, f_max=7600, verbose=True, **kwargs):
        super(Mel_Spectrogram, self).__init__()
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.win_length = win_length
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.f_min = f_min
        self.f_max = f_max
        self.verbose = verbose

        self.pre_emphasis = PreEmphasis(coef)
        self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(
            sample_rate=self.sample_rate,
            n_fft=self.n_fft,
            win_length=self.win_length,
            hop_length=self.hop_length,
            n_mels=self.n_mels,
            f_min=self.f_min,
            f_max=self.f_max,
            window_fn=torch.hamming_window,
        )

        # Power spectrogram with the same STFT settings. forward() does not
        # use it; it is kept so existing attribute access and saved module
        # state remain compatible.
        self.spec = torchaudio.transforms.Spectrogram(
            n_fft=self.n_fft,
            win_length=self.win_length,
            hop_length=self.hop_length,
            pad=0,
            window_fn=torch.hamming_window,
            power=2.0,
            normalized=False,
            center=True,
            pad_mode="reflect",
            onesided=True,
        )

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): (batch, time)
        Returns:
            x (torch.Tensor): (batch, n_mels, time)
        """
        # Feature extraction is run without gradient tracking.
        with torch.no_grad():
            x = self.pre_emphasis(x)
            # Small offset keeps log() finite where a mel bin is exactly zero.
            x = self.mel_spectrogram(x) + 1e-6
            x = torch.log(x)

            # Mean over the last axis, kept as a size-1 axis for broadcasting.
            m_val = torch.mean(x, dim=-1, keepdim=True)
            x = x - m_val

            if self.verbose:
                # Diagnostics: subtraction does not change the shape of x, so
                # this prints the same values, in the same order, as before.
                print(x.shape)
                print(m_val.shape)
                print(m_val)

        return x

In [51]:
# Show 7 decimals so printed tensors can be compared against the C++ dump.
torch.set_printoptions(precision=7)

# Load the test utterance at 16 kHz and add a leading batch axis.
x, _ = rs.load("../data/female_1.wav", sr=16000)
x = torch.from_numpy(x).unsqueeze(0)

# Run the Python front end and keep the result as a NumPy array for the
# comparison cell further down.
m = Mel_Spectrogram()
y_py = m(x).numpy()
print(y_py.shape)
print(y_py[0, :, 0])

torch.Size([1, 80, 184])
torch.Size([1, 80, 1])
tensor([[[-9.8945370],
         [-9.4767332],
         [-9.1656551],
         [-7.5483580],
         [-5.5288100],
         [-4.3522944],
         [-3.3532805],
         [-3.5186572],
         [-4.6613145],
         [-5.8296981],
         [-6.2588892],
         [-5.7242770],
         [-4.4462571],
         [-3.7130642],
         [-4.1355147],
         [-4.5175509],
         [-4.7907228],
         [-5.3501396],
         [-4.6665592],
         [-4.3967118],
         [-4.2235179],
         [-4.7060695],
         [-5.3965516],
         [-5.8272638],
         [-5.6241918],
         [-4.9747667],
         [-5.4461174],
         [-6.1674986],
         [-5.5837674],
         [-4.9603219],
         [-5.2170663],
         [-5.7967763],
         [-4.9944015],
         [-4.8510447],
         [-5.1082530],
         [-5.0128889],
         [-4.5400772],
         [-4.6370921],
         [-4.6607661],
         [-4.2540245],
         [-4.2397151],
         

In [52]:
# Compare the Python features (y_py) with the text dump in feature.txt.
# Each line of the file is one frame, values separated by whitespace.
with open("../SpeakerNet/feature.txt", "r") as f:
    y = f.readlines()

print("python  |  C++")
# Only the first 12 frames (indices 0..11) are inspected.
for i, line in enumerate(y[:12]):
    val = np.array(line.split(), dtype=float)

    # Difference of the summed absolute values of frame i.
    val_py = np.sum(np.abs(y_py[0, :, i]))
    val_cpp = np.sum(np.abs(val))
    print(f"diff {val_py - val_cpp} | {y_py[0,:,i].shape} {val.shape}")

    # Spot-check the first five and last five of the 80 bins.
    for j in list(range(5)) + list(range(75, 80)):
        print(f"value[{j}] {y_py[0,j,i]} {val[j]}")
    

python  |  C++
diff -5.629862473567982 | (80,) (80,)
value[0] -3.7723093032836914 -3.8179747247509
value[1] -3.9566612243652344 -3.9942975989971
value[2] -3.9447498321533203 -4.0026751105114
value[3] -5.320125579833984 -5.4087255736698
value[4] -7.076686382293701 -7.1971992280292
value[75] -2.621640205383301 -2.6379256810957
value[76] -2.2075462341308594 -2.2157474500469
value[77] -2.2985315322875977 -2.3148815643788
value[78] -3.786057949066162 -3.800559997656
value[79] -3.147352695465088 -3.1655578661813
diff -5.629887927727282 | (80,) (80,)
value[0] -3.9111785888671875 -3.9568445638112
value[1] -4.262996673583984 -4.3006321232528
value[2] -4.401649475097656 -4.4595741392067
value[3] -6.159921646118164 -6.248519821263
value[4] -8.10093879699707 -8.2214517298791
value[75] -3.066802978515625 -3.083087321364
value[76] -2.5777111053466797 -2.5859127989554
value[77] -2.841947555541992 -2.8582995800272
value[78] -3.0456433296203613 -3.0601481666645
value[79] -3.083024501800537 -3.101229377