In [59]:
import torch.nn as nn
import pickle
import torch
import torchvision.models as models
import torch.nn.functional as F
from config import trained_model_dir, device, pickle_file
from os import path
from models.attention import DotProductAttention
from utils import parse_args, extract_feature_train
from tqdm import tqdm
import os
import numpy as np
import matplotlib.pyplot as plt

In [2]:
models_ids_lst = ["1650981494_cnnattend_soft_random_data", "1651066062_cnnattend_soft_random_algo", 
                  "1651066062_cnnattend_soft_random_algo", "1651066000_cnnattend_soft_random_algo",
                  "1651066000_cnnattend_soft_random_algo"]

In [3]:
def pad(feature):
    max_input_len = 800
    input_length = feature.shape[0]
    input_dim = feature.shape[1]
    padded_input = np.zeros((max_input_len, input_dim), dtype=np.float32)
    length = min(input_length, max_input_len)
    padded_input[:length, :] = feature[:length, :]

    padded_input = np.transpose(padded_input, (1, 0))

    return padded_input, input_length

In [4]:
args = parse_args
with open(pickle_file, "rb") as f:
    data = pickle.load(f)
VOCAB = data["VOCAB_soft"]
samples = data["dev"]
num_samples = len(samples)

sample = samples[0] 
wave = sample["wave"]
key = os.path.basename(wave).split(".")[0]
# print(key)
gt_trn = [i for i in sample["trn"] if i in VOCAB]
# target_dur = [(start_end, dur, tok) for (start_end, dur, tok) in sample["dur"] if  tok.casefold() in VOCAB]
feature = extract_feature_train(input_file=wave, feature='mfcc', dim=13, cmvn=True, delta=True, delta_delta=True)
# feature = (feature - feature.mean()) / feature.std()
padded_input, input_length = pad(feature)
padded_input = torch.from_numpy(padded_input).unsqueeze(0).to(device)
input_length = torch.tensor([input_length]).to(device)

In [5]:
print(padded_input.shape)
print(input_length)

torch.Size([1, 39, 800])
tensor([801], device='cuda:0')


In [6]:
def load_all_models(models_ids_lst):
    all_models = []
    all_conv_feat_model = []
    for model_id in models_ids_lst:
        checkpoint_path = path.join(trained_model_dir, model_id, "BEST_checkpoint.tar")
        checkpoint = torch.load(checkpoint_path, map_location="cpu")
        model = checkpoint["model"].to(device)
        all_conv_feat_model.append(nn.Sequential(*list(model.children()))[0])
        all_models.append(model)       
    return all_models, all_conv_feat_model

def make_trainable_false(all_models, all_conv_feat):
    for model in all_models:
        model.eval()
        for param in model.parameters():
            param.requires_grad = False
    for model in all_conv_feat:
        model.eval()
        for param in model.parameters():
            param.requires_grad = False
    return all_models, all_conv_feat

In [7]:
all_models_trainable, all_conv_feat_model_trainable = load_all_models(models_ids_lst)
all_models_not_trainable, all_conv_feat_model_not_trainable = make_trainable_false(all_models_trainable, all_conv_feat_model_trainable)



In [8]:
all_conv_feat_model_not_trainable[0](padded_input).shape

torch.Size([1, 1000, 800])

In [9]:
all_models_not_trainable[0](padded_input)[0].shape

torch.Size([67])

In [10]:
all_models_not_trainable[0](padded_input)[1].shape

torch.Size([1, 67, 800])

In [11]:
# model = MyEnsemble(all_models_not_trainable)

In [34]:
# class EnsembleCNNAttend(nn.Module):

#     def __init__(self, vocab_size, embed_size, all_models, all_conv_feat_models):
#         super(EnsembleCNNAttend, self).__init__()

#         # Ensemble inputs
#         self.input_layers = get_multi_headed_input(all_models)
#         # Full model
#         self.all_models = all_models
#         # Convolutional module
#         self.all_conv_feat_models = all_conv_feat_models

#         # Embedding module
#         self.embed = embed_queries(embed_size, vocab_size)
        
#         # Attention module
#         self.attention_module = DotProductAttention()
        
        
        
#         # MLP module
#         self.mlp_module = nn.Sequential(
#             nn.Linear(1000, 512),
#             nn.ReLU(),
#             nn.Linear(512, 1),
#         )
    
#     def forward(self, x):
#         print('x: ', x.shape)
#         ensemble_inputs = [input_layer(x) for input_layer in self.input_layers]
#         ensemble_outputs = [model(x) for model in self.all_models]
#         ensemble_conv_feats = [conv_feat_model(x) for conv_feat_model in self.all_conv_feat_models]
#         detect_out = [out[0] for out in ensemble_outputs]
#         out = torch.cat(detect_out) #5x67
#         conv_feats = torch.cat(ensemble_conv_feats) #5x1000x800
#         print("Conv_feat shape: ", conv_feats.shape)
#         print('embed shape: ', self.embed.shape)
        
#         context_vector, attention_weights = self.attention_module(self.embed.cuda(), conv_feats)
#         print('context_vector shape', context_vector.shape)
# # #         context_vector = torch.flatten(context_vector)
# #         context_vector = self.dense_1(context_vector)
# #         context_vector = F.relu(context_vector)
# #         output = self.dense_2(context_vector)
#         output = self.mlp_module(context_vector)
#         output = torch.mean(output, dim = 0).squeeze()
#         attention_weights = torch.mean(attention_weights, dim = 0)
#         print("Output: ", output.shape)
#         print("attention_weights: ", attention_weights.shape)
#         return output, attention_weights

# def embed_queries(embed_size, vocab_size):

#     q_embed = torch.zeros(vocab_size, embed_size)
#     embeddings = nn.Embedding(vocab_size, embed_size)
#     for i in range(vocab_size):
#         lookup_tensor = torch.tensor([i], dtype=torch.long)
#         embed = embeddings(lookup_tensor)
        
#         q_embed[i, :] = embed
        
#     return q_embed

# def get_multi_headed_input(all_models):
    
#     ensemble_models = [nn.Sequential(*list(model.children())[0]) for model in all_models]
#     ensemble_inputs = [ensemble_input[0] for ensemble_input in ensemble_models]
    
#     return ensemble_inputs


In [51]:
class EnsembleCNNAttend(nn.Module):

    def __init__(self, vocab_size, embed_size, all_models, all_conv_feat_models):
        super(EnsembleCNNAttend, self).__init__()

        # Ensemble inputs
        self.input_layers = get_multi_headed_input(all_models)
        # Full model
        self.all_models = all_models
        # Convolutional module
        self.all_conv_feat_models = all_conv_feat_models

        # Embedding module
        self.embed = embed_queries(embed_size, vocab_size)
        
        # Attention module
        self.attention_module = DotProductAttention()
        
        
        
        # MLP module
        self.mlp_module = nn.Sequential(
            nn.Linear(335, 512),
            nn.ReLU(),
            nn.Linear(512, 67),
        )
    
    def forward(self, x):
        print('x: ', x.shape)
        ensemble_inputs = [input_layer(x) for input_layer in self.input_layers]
        ensemble_outputs = [model(x) for model in self.all_models]
        ensemble_conv_feats = [conv_feat_model(x) for conv_feat_model in self.all_conv_feat_models]
        detect_out = [out[0] for out in ensemble_outputs]
        out = torch.cat(detect_out) #5x67
        conv_feats = torch.cat(ensemble_conv_feats) #5x1000x800
        print("Conv_feat shape: ", conv_feats.shape)
        print('embed shape: ', self.embed.shape)
        
        context_vector, attention_weights = self.attention_module(self.embed.cuda(), conv_feats)
        print('context_vector shape', context_vector.shape)

        output = self.mlp_module(out)
#         output = torch.mean(output, dim = 0).squeeze()
        attention_weights = torch.mean(attention_weights, dim = 0)
        print("Output: ", output.shape)
        print("attention_weights: ", attention_weights.shape)
        return output, attention_weights

def embed_queries(embed_size, vocab_size):

    q_embed = torch.zeros(vocab_size, embed_size)
    embeddings = nn.Embedding(vocab_size, embed_size)
    for i in range(vocab_size):
        lookup_tensor = torch.tensor([i], dtype=torch.long)
        embed = embeddings(lookup_tensor)
        
        q_embed[i, :] = embed
        
    return q_embed

def get_multi_headed_input(all_models):
    
    ensemble_models = [nn.Sequential(*list(model.children())[0]) for model in all_models]
    ensemble_inputs = [ensemble_input[0] for ensemble_input in ensemble_models]
    
    return ensemble_inputs


In [52]:
model = EnsembleCNNAttend(67, 1000, all_models_not_trainable, all_conv_feat_model_not_trainable).to(device)

In [54]:
out, attention_weight = model(padded_input)

x:  torch.Size([1, 39, 800])
Conv_feat shape:  torch.Size([5, 1000, 800])
embed shape:  torch.Size([67, 1000])
context_vector shape torch.Size([5, 67, 1000])
Output:  torch.Size([67])
attention_weights:  torch.Size([67, 800])


In [57]:
F.sigmoid(out)

tensor([0.3983, 0.5787, 0.3388, 0.6601, 0.5761, 0.4613, 0.6132, 0.1628, 0.6192,
        0.7172, 0.2977, 0.2564, 0.0983, 0.4015, 0.7439, 0.6216, 0.4193, 0.2686,
        0.3893, 0.5928, 0.1900, 0.2723, 0.7151, 0.4519, 0.7079, 0.3063, 0.6007,
        0.4782, 0.5449, 0.8484, 0.7223, 0.1655, 0.6226, 0.5478, 0.3525, 0.4244,
        0.5485, 0.7773, 0.3444, 0.4673, 0.5368, 0.1594, 0.2824, 0.5200, 0.3643,
        0.3572, 0.5015, 0.6881, 0.5866, 0.6449, 0.4972, 0.7838, 0.2931, 0.1076,
        0.5564, 0.7423, 0.5796, 0.1802, 0.4626, 0.5411, 0.1661, 0.6574, 0.3553,
        0.6071, 0.1497, 0.2439, 0.5995], device='cuda:0',
       grad_fn=<SigmoidBackward>)

In [63]:
# plt.plot(attention_weight.detach().cpu().numpy())

In [148]:
torch.cat([x(padded_input) for x in all_conv_feat_model_not_trainable])[0]

tensor([[-0.1722, -0.1963, -0.2076,  ..., -0.0741, -0.0669, -0.0509],
        [ 0.0045, -0.0409, -0.0283,  ..., -0.0427, -0.0281, -0.0163],
        [-0.0676, -0.0577, -0.0653,  ...,  0.0078,  0.0222,  0.0196],
        ...,
        [-0.0162, -0.0669, -0.0678,  ..., -0.0022, -0.0048, -0.0231],
        [ 0.0682,  0.0870,  0.0719,  ...,  0.0424,  0.0407,  0.0604],
        [ 0.0767, -0.0054, -0.0587,  ..., -0.0135,  0.0382,  0.0683]],
       device='cuda:0')

In [140]:
all_conv_feat_model_not_trainable[0]

Sequential(
  (0): Conv1d(39, 96, kernel_size=(9,), stride=(1,), padding=(4,))
  (1): LeakyReLU(negative_slope=0.01)
  (2): Conv1d(96, 96, kernel_size=(11,), stride=(1,), padding=(5,))
  (3): ReLU()
  (4): Conv1d(96, 96, kernel_size=(11,), stride=(1,), padding=(5,))
  (5): LeakyReLU(negative_slope=0.01)
  (6): Conv1d(96, 96, kernel_size=(11,), stride=(1,), padding=(5,))
  (7): LeakyReLU(negative_slope=0.01)
  (8): Conv1d(96, 96, kernel_size=(11,), stride=(1,), padding=(5,))
  (9): LeakyReLU(negative_slope=0.01)
  (10): Conv1d(96, 1000, kernel_size=(11,), stride=(1,), padding=(5,))
)

In [133]:
torch.matmul(torch.rand(67, 1000), torch.rand(5, 1000, 800)).shape

torch.Size([5, 67, 800])

In [64]:
# Define multi headed input

def get_multi_headed_input(all_models):
    ensemble_models = [nn.Sequential(*list(model.children())[0]) for model in all_models]
    ensemble_inputs = [ensemble_input[0] for ensemble_input in ensemble_models]
    return ensemble_inputs

def get_model_outputs

In [65]:
ensemble_inputs = get_multi_headed_input(all_models_not_trainable)

In [231]:
for i, (data) in enumerate(train_loader):
    # Move to GPU, if available
    optimizer.zero_grad()
    target = None
    padded_input, bow_target, soft_target, _, input_lengths = data
    # print("padded input: ", padded_input[0][0][:2])
    padded_input = padded_input.to(device)
    input_lengths = input_lengths.to(device)
    if target_type == 'bow':
        target = bow_target.to(device)
        continue
    elif target_type == 'soft':
        target = soft_target.to(device)
    else:
        print("Incorrect supervision's target. Choose either 'bow' or 'soft'.")
        break

    # Forward prop.
    out, attention_Weights = model(padded_input)
    # loss = criterion(torch.sigmoid(out), target)
    # print("Out Shape: ", out.shape)
    # print("target shape: ", target.shape)
    loss = criterion(out, target)

    # Back prop.
    loss.backward(retain_graph=True)

    # update weights
    optimizer.step()

    # Keep track of metrics
    losses.update(loss.item())

    # Print status
    if i % print_freq == 0:
        logger.info('Epoch: [{0}][{1}/{2}]\t'
        'Loss {loss.val:.4f} ({loss.avg:.4f})'.format(epoch, i, len(train_loader), loss=losses))

    return losses.avg


    def valid(valid_loader, model, logger, threshold):
    model.eval()
    losses = AverageMeter()
    n_tp = 0  
    n_tp_fp = 0 # (tp + fp)
    n_tp_fn = 0 # (tp + fn)

    # Create loss function
    # criterion = nn.BCELoss()
    criterion = nn.BCEWithLogitsLoss()

    # Batches
    for data in tqdm(valid_loader):
    # Move to GPU, if available
    padded_input, bow_target, _, __, input_lengths = data
    # padded_input = torch.transpose(padded_input, 2, 1)
    padded_input = padded_input.to(device)
    input_lengths = input_lengths.to(device)
    target = bow_target.to(device)
    # Forward prop.
    out, attention_weights = model(padded_input)
    # loss = criterion(torch.sigmoid(out), target)
    loss = criterion(out, target)

    # Keep track of metrics
    losses.update(loss.item())
    sigmoid_out = torch.sigmoid(out).cpu()
    # sigmoid_out = out.cpu()
    sigmoid_out_thresholded = torch.ge(sigmoid_out, threshold).float()
    n_tp += torch.sum(sigmoid_out_thresholded * target.cpu()).numpy()
    n_tp_fp += torch.sum(sigmoid_out_thresholded).numpy()
    n_tp_fn += torch.sum(target.cpu()).numpy()

    precision = n_tp / n_tp_fp
    recall = n_tp / n_tp_fn
    fscore = 2 * precision * recall / (precision + recall)

    # Print status
    logger.info('\nValidation Loss {loss.val:.4f} ({loss.avg:.4f})\n'.format(loss=losses))
    logger.info('\nValidation Precision: {precision:.4f}\n'.format(precision=precision))
    logger.info('\nValidation Recall: {recall:.4f}\n'.format(recall=recall))
    logger.info('\nValidation F-score: {fscore:.4f}\n'.format(fscore=fscore))

    return losses.avg, precision, recall, fscore