# Installing wavencoder

In [1]:
import torchaudio

In [2]:
# Append the directory two levels above the working directory to the import
# path — presumably so the local `wavencoder` checkout is importable; confirm.
import sys
sys.path.append("../../")

# Select the "soundfile" audio backend. The legacy-interface flag is assigned
# before the backend is chosen, so keep this order.
# NOTE(review): torchaudio warns that USE_SOUNDFILE_LEGACY_INTERFACE is
# deprecated and will be removed in 0.9.0 (see this cell's output), so this
# setup may need revisiting on newer torchaudio releases.
import torchaudio
torchaudio.USE_SOUNDFILE_LEGACY_INTERFACE = False
torchaudio.set_audio_backend("soundfile")

  '"torchaudio.USE_SOUNDFILE_LEGACY_INTERFACE" flag is deprecated and will be removed in 0.9.0. '


# Wav2Vec pretrained feature extractor

In [3]:
import torch
import wavencoder


  return torch._C._cuda_getDeviceCount() > 0


In [4]:
# Push one random 16000-sample waveform through the Wav2Vec encoder and show
# the shape of the resulting feature map.
x = torch.randn(1, 16000) # [1, 16000]
# NOTE(review): pretrained=False presumably builds the model without loading
# pretrained weights — confirm against the wavencoder documentation.
encoder = wavencoder.models.Wav2Vec(pretrained=False)
z = encoder(x) # [1, 512, 98]
# Last expression of the cell, so the shape is rendered as the cell output.
z.shape

torch.Size([1, 512, 98])

# SincNet pretrained feature extractor

In [5]:
from wavencoder.models import SincNet
# Encode one random 3200-sample waveform with SincNet and print the output
# shape. This rebinds `encoder`, `x` and `z` from the Wav2Vec cell above.
# .eval() is called before the forward pass (evaluation mode, assuming SincNet
# is a regular torch nn.Module — confirm).
encoder = SincNet(pretrained=False).eval()
x = torch.randn(1, 3200)  # [1, 3200]
z = encoder(x)  # [1, 2048]
print(z.shape)

torch.Size([1, 2048])


In [6]:
# Display the current value of `z`; the grad_fn in the repr shows that
# autograd is still tracking the encoder output.
z

tensor([[-1.7592e-05,  4.9438e-05, -6.8345e-06,  ...,  1.7641e-05,
          1.1897e-04,  4.3041e-05]], grad_fn=<LeakyReluBackward0>)

# RawNet

In [11]:
import torch
import wavencoder

# Run one random waveform of 59049 (= 3**10) samples through RawNet2 and show
# the shape of what it returns.
x = torch.randn(1, 59049) # [1, 59049]
# NOTE(review): with return_code=True the forward pass yields a [1, 1024]
# tensor (see the cell output) — presumably the embedding rather than the
# class_dim=100 class scores; confirm against the wavencoder documentation.
rawnet_encoder = wavencoder.models.RawNet2Model(pretrained=False, return_code=True, class_dim=100)
z = rawnet_encoder(x) # [1, 1024]
z.shape

torch.Size([1, 1024])

# Audio Classifier
- wav2vec encoder `[1, 16000] -> [1, 512, 98]`
- mean of features along time axis `[1, 512, 98] -> [1, 512]`
- ANN Classifier `[1, 512] -> [1, 2]`


In [12]:
import torch
import torch.nn as nn
import wavencoder

class AudioClassifier(nn.Module):
    """Two-output audio classifier on top of a Wav2Vec encoder.

    Pipeline: encoder features ``[B, 512, T]`` -> mean over the last (time)
    axis ``[B, 512]`` -> linear layer ``[B, 2]``.
    """

    def __init__(self):
        """Build the Wav2Vec encoder (``pretrained=False``) and the linear head."""
        super().__init__()
        # Registration order (encoder first, then classifier) is kept as-is.
        self.encoder = wavencoder.models.Wav2Vec(pretrained=False)
        self.classifier = nn.Linear(in_features=512, out_features=2)

    def forward(self, x):
        """Return the classifier output for a batch of raw waveforms ``x``."""
        features = self.encoder(x)
        # Collapse axis 2 (time) so every clip becomes one 512-dim vector.
        pooled = features.mean(dim=2)
        return self.classifier(pooled)

# Smoke-test the classifier on one random 16000-sample waveform and print the
# shape of its output.
model = AudioClassifier()
x = torch.randn(1, 16000)  # [1, 16000]
y_hat = model(x)  # [1, 2]
print(y_hat.shape)

torch.Size([1, 2])


- SincNet encoder `[1, 3200] -> [1, 2048]`
- ANN Classifier `[1, 2048] -> [1, 2]`

In [17]:
import torch
import torch.nn as nn
import wavencoder

class SincNetAudioClassifier(nn.Module):
    """Two-output audio classifier on top of a SincNet encoder.

    Pipeline: SincNet features ``[B, 2048]`` -> linear layer ``[B, 2]``.
    """

    def __init__(self):
        """Build the SincNet encoder (``pretrained=False``) and the linear head."""
        super().__init__()
        # Resolve SincNet through the `wavencoder` module imported in this
        # cell. The bare name `SincNet` only exists if an earlier cell's
        # `from wavencoder.models import SincNet` has been run, which makes
        # the cell fail when executed on its own.
        self.encoder = wavencoder.models.SincNet(pretrained=False)
        self.classifier = nn.Linear(2048, 2)

    def forward(self, x):
        """Return the classifier output for a batch of raw waveforms ``x``."""
        z = self.encoder(x)
        out = self.classifier(z)
        return out

# Smoke-test the SincNet classifier on a batch of two random 3200-sample
# waveforms and print the shape of its output.
model = SincNetAudioClassifier()
x = torch.randn(2, 3200)  # [2, 3200]
y_hat = model(x)  # [2, 2]
print(y_hat.shape)

torch.Size([2, 2])


# LSTM Attention Classifier

In [18]:
import torch
import torch.nn as nn
import wavencoder

# Chain the Wav2Vec encoder with wavencoder's LSTM attention classifier.
# NOTE(review): the positional arguments (512, 64, 2) are presumably the input
# feature size, the LSTM hidden size and the number of classes — confirm
# against the wavencoder API.
model = nn.Sequential(
        wavencoder.models.Wav2Vec(pretrained=False),
        wavencoder.models.LSTM_Attn_Classifier(512, 64, 2, return_attn_weights=True, attn_type='soft')
)

# Batch of five random 16000-sample waveforms.
x = torch.randn(5, 16000)  # [5, 16000]
# The model returns two values here (presumably because
# return_attn_weights=True): predictions and attention weights.
y_hat, attn_weights = model(x)  # [5, 2], [5, 98]

print(y_hat.shape, attn_weights.shape)

torch.Size([5, 2]) torch.Size([5, 98])
