In [1]:
import os
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle
import math

In [2]:
import sys
from collections import Counter
import pickle

In [3]:
sys.path.append('../')

from enroll import load_model
from SR_Dataset import ToTensorInput, ToTensorTestInput

In [4]:
def compute_accuracy(predictions, labels):
    
    with open('/cas/DeepLearn/elperu/tmp/speech_datasets/SpeakerRecognition_tutorial/model_saved/spk_to_idx.pkl', 'rb') as f:
        spk_to_index = pickle.load(f)
        
    # Invert the order
    index_to_spk = {y:x for x,y in spk_to_index.items()}
        
    predictions_as_speakers = []
    acc = 0
    for pred, lb in zip(predictions, labels):
        idx = spk_to_index[lb]
        
        predictions_as_speakers.append(index_to_spk[pred])
        
        if pred == idx:
            acc += 1
            
    return acc / len(predictions), predictions_as_speakers

In [5]:
def to_tesnor(inp):
    t = torch.Tensor(inp)
    tens_new = t[(None,)*2]
    
    return tens_new.permute(0, 1, 3, 2)

In [6]:
def load_embeddings(path):
    speakers = os.listdir(path)
    
    features = []
    labels = []
    for speaker in speakers:
        files = os.listdir(os.path.join(path, speaker))
        
        for file in files:
            with open(os.path.join(path, speaker, file), 'rb') as f:
                feat_and_lab = pickle.load(f)
                
                feat = feat_and_lab['feat']
                lb = feat_and_lab['label']
                features.append(torch.Tensor(feat))
                labels.append(lb)
                
    return features, labels
        

In [7]:
def get_embeddings(input, model):

    tot_segments = math.ceil(len(input)/test_frames) # total number of segments with 'test_frames' 
    predictions = []
    
    with torch.no_grad():
        for i in range(tot_segments):
            # Divide the input in sub_inputs of length test_frames
            temp_input = input[i*test_frames:i*test_frames+test_frames]
            
            temp_input = to_tesnor(temp_input) # size:(1, 1, n_dims, n_frames)

            if use_cuda:
                temp_input = temp_input.cuda()
                
                
            _, output = model(temp_input) # out size:(batch size, #classes), for softmax

            # calculate accuracy of predictions in the current batch
            pred =  torch.argmax(output, 1).cpu().numpy()[0]
            predictions.append(pred)

    tmp = Counter(predictions)
    final_prediction = tmp.most_common()[0][0]
                
    return final_prediction, predictions

In [8]:
def get_model(use_cuda, log_dir, embedding_size, cp_num, n_classes, test_frames):

    model = load_model(use_cuda, log_dir, cp_num, embedding_size, n_classes)
    
    return model

    

In [9]:
def main(data, model):
    
    preds = []
    for x_i in data:
        
        most_common, all_preds = get_embeddings(x_i, model)
        
        preds.append(most_common)
        
    
    return preds
        
    

In [10]:
# HParams

use_cuda = True
log_dir = '/cas/DeepLearn/elperu/tmp/speech_datasets/SpeakerRecognition_tutorial/model_saved/'
embedding_size = 128
cp_num = 50 
n_classes = 200
test_frames = 200
data_path = '/cas/DeepLearn/elperu/tmp/speech_datasets/LibriSpeech/train_test_split/identification_test/'

In [11]:
# Load model and embeddings
model = get_model(use_cuda, log_dir, embedding_size, cp_num, n_classes, test_frames)

data, labels = load_embeddings(data_path)

=> loading checkpoint


In [12]:
predictions = main(data, model)

In [13]:
acc, pred_as_spks = compute_accuracy(predictions, labels)

print(f'Accuracy of identification system: {acc:.3f}%')

Accuracy of identification system: 0.948%
