In [1]:
from datasets import load_from_disk
from tqdm import tqdm

In [2]:
ds = load_from_disk('language-identification')
ds

Dataset({
    features: ['audio_id', 'language', 'audio', 'raw_text', 'normalized_text', 'gender', 'speaker_id', 'is_gold_transcript', 'accent'],
    num_rows: 300
})

In [3]:
from transformers import Wav2Vec2Model, Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-base-960h')
wav_model = Wav2Vec2Model.from_pretrained('facebook/wav2vec2-base-960h')

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
import torch

def process_wav(sample):
    enc = processor(sample['audio']['array'], sampling_rate=16_000, padding=True, return_tensors='pt')
    with torch.no_grad():
        out = wav_model(**enc).last_hidden_state.squeeze().mean(axis=0)
    return out, sample['language']

encodings = []
for sample in tqdm(ds):
    encodings.append(process_wav(sample))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [02:52<00:00,  1.74it/s]


In [9]:
from torch import nn


class CNN(nn.Module):
    def __init__(self, input_dim, num_classes, input_length=768):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(
            in_channels=input_dim, out_channels=128, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(
            in_channels=128, out_channels=256, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(256 * input_length // 4, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = nn.functional.relu(self.conv1(x))
        x = nn.functional.max_pool1d(x, kernel_size=2)
        x = nn.functional.relu(self.conv2(x))
        x = nn.functional.max_pool1d(x, kernel_size=2)
        x = x.reshape(x.shape[0], -1)
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = CNN(1, 3)

In [11]:
X = torch.stack([e[0] for e in encodings])
X.shape

torch.Size([300, 768])

In [24]:
model(X[0, :2].reshape(2, 1, 768))

RuntimeError: shape '[2, 1, 768]' is invalid for input of size 2

In [29]:
y = torch.tensor([e[1] for e in encodings])
label_mappping = {6:0, 0:1, 3:2}
y = y.apply_(lambda x: label_mappping[x])

In [32]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1)

In [35]:
from torch import optim

torch.manual_seed(10)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(x_train.unsqueeze(1))

    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    model.eval()
    with torch.no_grad():
        val_output = model(x_val.unsqueeze(1))  # val_labels
        val_loss = criterion(val_output, y_val)
        print(epoch, ": ", val_loss)

0 :  tensor(2.8556)
1 :  tensor(0.6247)
2 :  tensor(0.5391)
3 :  tensor(1.0249)
4 :  tensor(0.9489)
5 :  tensor(0.6490)
6 :  tensor(0.5006)
7 :  tensor(0.4826)
8 :  tensor(0.5102)
9 :  tensor(0.4913)
10 :  tensor(0.4503)
11 :  tensor(0.4404)
12 :  tensor(0.4538)
13 :  tensor(0.4617)
14 :  tensor(0.4544)
15 :  tensor(0.4465)
16 :  tensor(0.4418)
17 :  tensor(0.4361)
18 :  tensor(0.4262)
19 :  tensor(0.4197)
20 :  tensor(0.4203)
21 :  tensor(0.4252)
22 :  tensor(0.4209)
23 :  tensor(0.4100)
24 :  tensor(0.4027)
25 :  tensor(0.4010)
26 :  tensor(0.4034)
27 :  tensor(0.4039)
28 :  tensor(0.3979)
29 :  tensor(0.3915)
30 :  tensor(0.3887)
31 :  tensor(0.3883)
32 :  tensor(0.3881)
33 :  tensor(0.3830)
34 :  tensor(0.3756)
35 :  tensor(0.3752)
36 :  tensor(0.3774)
37 :  tensor(0.3720)
38 :  tensor(0.3665)
39 :  tensor(0.3670)
40 :  tensor(0.3677)
41 :  tensor(0.3615)
42 :  tensor(0.3566)
43 :  tensor(0.3571)
44 :  tensor(0.3529)
45 :  tensor(0.3477)
46 :  tensor(0.3493)
47 :  tensor(0.3435)
48

In [26]:
y_act = y.copy()

tensor([0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6,
        0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6,
        0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6,
        0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6,
        0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6,
        0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6,
        0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6,
        0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6,
        0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6,
        0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6,
        0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6,
        0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6,
        0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3,