# **Problem 1: Speaker Verification**

In [None]:
import pickle
import librosa
import random
import pandas as pd
import numpy as np
import torch
import itertools
import torch.nn as nn
import matplotlib.pyplot as plt
from torchvision import datasets, transforms
from torch import nn, optim
import torch.nn.functional as F
from torchvision.datasets import MNIST
import torch.utils.data as utils
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Load the pickled train/test waveform arrays.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
# The context managers make sure both file handles are closed after loading.
with open('/content/drive/MyDrive/Colab Notebooks/HW4/hw4_trs.pkl', 'rb') as train_fh:
    train_files = pickle.load(train_fh)
with open('/content/drive/MyDrive/Colab Notebooks/HW4/hw4_tes.pkl', 'rb') as test_fh:
    test_files = pickle.load(test_fh)

# print shape
print(train_files.shape, test_files.shape)

(500, 16180) (200, 22631)


In [None]:
# Magnitude STFT (n_fft=1024, hop=512) of every training signal; each
# spectrogram is transposed (.T) before being stacked into one array.
train_stft = np.array([
    np.abs(librosa.stft(signal, n_fft=1024, hop_length=512).T)
    for signal in train_files
])

# print shape
print(train_stft.shape)

(500, 32, 513)


In [None]:
# Magnitude STFT (n_fft=1024, hop=512) of every test signal; each
# spectrogram is transposed (.T) before being stacked into one array.
test_stft = np.array([
    np.abs(librosa.stft(signal, n_fft=1024, hop_length=512).T)
    for signal in test_files
])

# print shape
print(test_stft.shape)

(200, 45, 513)


In [None]:
# Group the utterances by speaker: the training array is cut into 50
# equal-sized chunks along the first axis, the test array into 20.
split_train = np.split(train_stft, indices_or_sections=50)
split_test = np.split(test_stft, indices_or_sections=20)

# print shape
for groups in (split_train, split_test):
    print(len(groups), groups[0].shape)

50 (10, 32, 513)
20 (10, 45, 513)


In [None]:
# training data: same-speaker (positive) and cross-speaker (negative) pairs
positive_examples, negative_examples = [], []
for i, single_speaker in enumerate(split_train):
    # every unordered pair of this speaker's utterances
    positive = list(itertools.combinations(single_speaker, 2))
    L = len(positive)
    positive_examples.append(np.array(positive))

    # pool of utterances from all other speakers (this speaker's 10 rows removed)
    start = 10 * i
    utter490 = np.concatenate((train_stft[:start], train_stft[start + 10:]))
    # draw L random (other speaker, this speaker) pairs, sampling with replacement
    utt1 = utter490[np.random.choice(utter490.shape[0], L, replace=True)]
    utt2 = single_speaker[np.random.choice(single_speaker.shape[0], L, replace=True)]
    # print(utt1.shape, utt2.shape)

    negative_examples.append(np.array([(other, own) for other, own in zip(utt1, utt2)]))

# print length
print("shape of positive-negative examples")
print(len(positive_examples), len(negative_examples))
print(positive_examples[0].shape, negative_examples[0].shape)

# per speaker: all positive pairs first, then all negative pairs
train_list = [
    pair
    for pos, neg in zip(positive_examples, negative_examples)
    for pair in np.concatenate((pos, neg), axis=0)
]

print("\nshape of train_list")
train_list = np.array(train_list)
print(train_list.shape)

shape of positive-negative examples
50 50
(45, 2, 32, 513) (45, 2, 32, 513)

shape of train_list
(4500, 2, 32, 513)


In [None]:
# testing data: same-speaker (positive) and cross-speaker (negative) pairs
positive_test, negative_test = [], []
for i, single_speaker in enumerate(split_test):
    # every unordered pair of this speaker's utterances
    positive = list(itertools.combinations(single_speaker, 2))
    L = len(positive)
    positive_test.append(np.array(positive))

    # pool of utterances from all other speakers (this speaker's 10 rows removed)
    start = 10 * i
    utter180 = np.concatenate((test_stft[:start], test_stft[start + 10:]))
    # draw L random (other speaker, this speaker) pairs, sampling with replacement
    utt1 = utter180[np.random.choice(utter180.shape[0], L, replace=True)]
    utt2 = single_speaker[np.random.choice(single_speaker.shape[0], L, replace=True)]
    # print(utt1.shape, utt2.shape)

    negative_test.append(np.array([(other, own) for other, own in zip(utt1, utt2)]))

# print length
print("shape of positive-negative examples")
print(len(positive_test), len(negative_test))
print(positive_test[0].shape, negative_test[0].shape)

# per speaker: all positive pairs first, then all negative pairs
test_list = [
    pair
    for pos, neg in zip(positive_test, negative_test)
    for pair in np.concatenate((pos, neg), axis=0)
]

print("\nshape of test_list")
test_list = np.array(test_list)
print(test_list.shape)

shape of positive-negative examples
20 20
(45, 2, 45, 513) (45, 2, 45, 513)

shape of test_list
(1800, 2, 45, 513)


In [None]:
# Label pattern for one speaker: L ones (positive pairs) followed by L zeros
# (negative pairs). L is the pair count left over from the pair-building loops.
pos_labels = torch.ones(L)
neg_labels = torch.zeros(L)
labs = torch.cat([pos_labels, neg_labels], dim=0)
print(labs.shape)

# train labels: the per-speaker pattern tiled once per training speaker
train_labels = labs.repeat(len(positive_examples))
print(train_labels.shape)

# test labels: the per-speaker pattern tiled once per test speaker
test_labels = labs.repeat(len(positive_test))
print(test_labels.shape)

torch.Size([90])
torch.Size([4500])
torch.Size([1800])


In [None]:
# Use the first CUDA device when one is available, otherwise the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# create a data loader for train data (pairs and labels are moved to `device` up front)
train_inputs = torch.Tensor(train_list).to(device)
train_targets = torch.Tensor(train_labels).to(device)
trainSet = TensorDataset(train_inputs, train_targets)
train_loader = DataLoader(trainSet, batch_size=30, shuffle=True)

# sanity check: number of batches and the shape of one batch
print(len(train_loader))
a, b = next(iter(train_loader))
print(a.shape, b.shape)

150
torch.Size([30, 2, 32, 513]) torch.Size([30])


In [None]:
# create a data loader for test data (pairs and labels are moved to `device` up front)
test_inputs = torch.Tensor(test_list).to(device)
test_targets = test_labels.to(device)
testSet = TensorDataset(test_inputs, test_targets)
test_loader = DataLoader(testSet, batch_size=20, shuffle=True)

# sanity check: number of batches and the shape of one batch
print(len(test_loader))
a, b = next(iter(test_loader))
print(a.shape, b.shape)

90
torch.Size([20, 2, 45, 513]) torch.Size([20])


In [None]:
class SiameseNet(nn.Module):
    """Siamese encoder that embeds both inputs of a pair with shared weights.

    Each input is run through a 2-layer LSTM (513 -> 513); the output at the
    last time step is projected by three linear layers (513 -> 256 -> 128 -> 50).
    ``forward`` returns the two 50-dimensional embeddings and leaves the
    comparison between them to the caller.
    """

    def __init__(self):
        super(SiameseNet, self).__init__()
        self.lstm = nn.LSTM(input_size=513, hidden_size=513, num_layers=2, batch_first=True)
        self.fc1 = nn.Linear(in_features=513, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=128)
        # fc3 consumes fc2's output, so its input width must equal fc2's 128 features
        self.fc3 = nn.Linear(in_features=128, out_features=50)

    def forward_once(self, x):
        """Embed one batch of sequences.

        Args:
            x: tensor of shape (batch, time, 513) - batch-first, as the LSTM expects.

        Returns:
            Tensor of shape (batch, 50).
        """
        x, _ = self.lstm(x)
        # keep only the LSTM output at the final time step
        x = x[:, -1, :]
        # NOTE: there is no activation between these layers, so together they
        # form a single affine map of the LSTM output.
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x

    def forward(self, ip1, ip2):
        """Embed both members of a pair with the same weights.

        Returns:
            Tuple ``(out1, out2)`` holding the embeddings of ``ip1`` and ``ip2``.
        """
        # passing input 1 into the network
        out1 = self.forward_once(ip1)
        # passing input 2 into the network
        out2 = self.forward_once(ip2)
        return out1, out2

# Build the model on the selected device, then the loss and the optimiser.
net = SiameseNet().to(device)
# BCEWithLogitsLoss takes raw logits (it applies the sigmoid itself)
err_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=net.parameters(), lr=3e-5)
print(net)

SiameseNet(
  (lstm): LSTM(513, 513, num_layers=2, batch_first=True)
  (fc1): Linear(in_features=513, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=50, bias=True)
)


In the previous assignment we used LSTMs to denoise audio. Since the results were really good, I decided to use an LSTM for this assignment as well.

In [None]:
# Train the Siamese network: the dot product of the two embeddings is the
# similarity logit, optimised with the binary cross-entropy loss `err_func`.
epochs = 100
accuracy_list = []  # training accuracy (%) per epoch
for e in range(1, epochs + 1):
    # make sure the model is in training mode even if an evaluation ran earlier
    net.train()
    train_loss = 0
    correct = 0
    for data, target in train_loader:
        # each sample holds a pair of utterances along dim 1
        ip1 = data[:, 0]
        ip2 = data[:, 1]

        optimizer.zero_grad()

        out1, out2 = net(ip1, ip2)
        # batched dot product: (B, 1, D) x (B, D, 1) -> (B, 1, 1) -> (B,)
        result = torch.bmm(out1.unsqueeze(1), out2.unsqueeze(2)).flatten()

        # threshold the whole batch at once instead of one scalar at a time
        with torch.no_grad():
            preds = (torch.sigmoid(result) >= 0.5).to(target.dtype)
        correct += (preds == target).sum().item()

        loss = err_func(result, target)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()

    acc = correct/len(train_loader.dataset) * 100
    accuracy_list.append(acc)
    if e%5 == 0:
        print("EPOCH:{eps}   |   train_loss:{tls}   |   acc:{accu}    |   correct:{corr}".
              format(eps = e, tls=round(train_loss/len(train_loader),4), accu=round(acc,3), corr=correct))

EPOCH:5   |   train_loss:0.5504   |   acc:74.267    |   correct:3342
EPOCH:10   |   train_loss:0.5068   |   acc:77.556    |   correct:3490
EPOCH:15   |   train_loss:0.4721   |   acc:80.467    |   correct:3621
EPOCH:20   |   train_loss:0.4227   |   acc:82.111    |   correct:3695
EPOCH:25   |   train_loss:0.3794   |   acc:83.889    |   correct:3775
EPOCH:30   |   train_loss:0.3251   |   acc:86.644    |   correct:3899
EPOCH:35   |   train_loss:0.2937   |   acc:88.111    |   correct:3965
EPOCH:40   |   train_loss:0.2405   |   acc:90.111    |   correct:4055
EPOCH:45   |   train_loss:0.2042   |   acc:91.467    |   correct:4116
EPOCH:50   |   train_loss:0.1749   |   acc:93.133    |   correct:4191
EPOCH:55   |   train_loss:0.1382   |   acc:94.578    |   correct:4256
EPOCH:60   |   train_loss:0.1146   |   acc:95.422    |   correct:4294
EPOCH:65   |   train_loss:0.0916   |   acc:96.244    |   correct:4331
EPOCH:70   |   train_loss:0.0637   |   acc:97.489    |   correct:4387
EPOCH:75   |   train_

In [None]:
# Evaluate on the test pairs: a pair is predicted "same speaker" when the
# sigmoid of the embedding dot product is at least 0.5.
correct = 0
with torch.no_grad():
    net.eval()
    for data, target in test_loader:
        # each sample holds a pair of utterances along dim 1
        ip1 = data[:, 0]
        ip2 = data[:, 1]

        out1, out2 = net(ip1, ip2)
        # batched dot product: (B, 1, D) x (B, D, 1) -> (B, 1, 1) -> (B,)
        result = torch.bmm(out1.unsqueeze(1), out2.unsqueeze(2)).flatten()

        # same decision rule (>= 0.5) as the training-accuracy computation,
        # applied to the whole batch at once
        preds = (torch.sigmoid(result) >= 0.5).to(target.dtype)
        correct += (preds == target).sum().item()

print("Test Accuracy: {acc}".format(acc= round((correct / len(test_loader.dataset))*100, 3)))

Test Accuracy: 69.667


After trying out various architectures and tuning various hyperparameters, these were the best results I got. No matter what I tried, my test accuracy did not go above 70%.