In [2]:
# system
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.autograd import Variable
from torchvision import models

from tqdm import tqdm
from collections import OrderedDict

In [3]:
# extracted feature dataset from mel-spectrogram
from preprocessing import load_ausil_feature_dataset

In [4]:
# Pick the first CUDA device when one is visible, otherwise run on the CPU.
if torch.cuda.is_available():
    device_num = torch.device('cuda:0')
else:
    device_num = torch.device('cpu')

# 1. load extracted features (by pre-trained CNN)

In [5]:
# Load the pre-extracted features. The loader's result is unpacked into three
# collections (cs, ns, labels) that the following cells slice and convert to
# tensors.
cs, ns, labels = load_ausil_feature_dataset()

  0%|          | 79/32000 [00:00<00:40, 781.86it/s]



100%|██████████| 32000/32000 [00:40<00:00, 786.75it/s]


In [5]:
# Sanity check: show the first entry of cs.
print(cs[0])

[[0.2972077  0.49398693 0.         ... 0.         0.0179748  0.        ]]


In [6]:
# Keep only the first bsz * 10 samples and turn each collection into a tensor.
small_size = 320
cs, ns, labels = (
    torch.tensor(full[:small_size]) for full in (cs, ns, labels)
)

#cs = cs.view(-1, 1, last_pad_length, num_feature)
#ns = ns.view(-1, 1, last_pad_length, num_feature)

# Display the resulting tensor shapes as the cell output.
cs.shape, ns.shape

(torch.Size([320, 8, 2528]), torch.Size([320, 8, 2528]))

# 2. PCA and attention

In [7]:
# class PCA_layer(object):
#     def __init__(self, dims=2528):
#         with tf.variable_scope('PCA'):
#             self.mean = tf.get_variable('mean_sift', dtype=tf.float32, trainable=False, shape=(dims,) )
#             self.weights = tf.get_variable('weights', dtype=tf.float32, trainable=False, shape=(dims,dims))

#     def __call__(self, logits):
#         logits = logits - self.mean
#         logits = tf.tensordot(logits, self.weights, axes=1)
#         return logits

class PCA_layer(nn.Module):
    """Fixed (non-trainable) PCA projection: ``(x - mean) . weights``.

    PyTorch port of the TensorFlow ``PCA_layer`` kept in the comment above.

    Args:
        dims: size of the feature axis. ``mean`` has shape ``(dims,)`` and
            ``weights`` has shape ``(dims, dims)``.

    ``mean`` and ``weights`` are randomly initialised and registered as
    buffers, so they follow ``.to(device)`` / ``.cuda()`` and are stored in
    ``state_dict()``, but never receive gradients.
    """

    def __init__(self, dims=2528):
        super(PCA_layer, self).__init__()
        # Buffers instead of the deprecated autograd ``Variable``: they are
        # non-trainable yet still move with the module and get checkpointed.
        self.register_buffer('mean', torch.randn(dims, dtype=torch.float32))
        self.register_buffer('weights', torch.randn(dims, dims, dtype=torch.float32))

    def forward(self, logits):
        """Centre ``logits`` and project them with ``weights``.

        Args:
            logits: tensor whose last axis has size ``dims``.

        Returns:
            Tensor of the same shape as ``logits``.
        """
        centered = logits - self.mean
        # Contract only the last axis of the input with the first axis of the
        # weights (``axes=1`` in the TensorFlow reference).
        return torch.tensordot(centered, self.weights, dims=1)

    # Backward-compatible alias for the original (misspelled) method name.
    foward = forward
    
# class Attention_layer(object):

#     def __init__(self, dims=2528):
#         with tf.variable_scope('attention_layer'):
#             self.context_vector = tf.get_variable('context_vector', dtype=tf.float32,
#                                                   trainable=False, shape=(dims, 1))

#     def __call__(self, logits):
#         weights = tf.tensordot(logits, self.context_vector, axes=1) / 2.0 + 0.5
#         return tf.multiply(logits, weights), weights

class Attention_layer(nn.Module):
    """Weight each feature vector by its projection on a fixed context vector.

    PyTorch port of the TensorFlow ``Attention_layer`` kept in the comment
    above.

    Args:
        dims: size of the feature axis; ``context_vector`` has shape
            ``(dims, 1)``.

    ``context_vector`` is randomly initialised and registered as a buffer, so
    it follows ``.to(device)`` and is stored in ``state_dict()`` but is not
    trained.
    """

    def __init__(self, dims=2528):
        super(Attention_layer, self).__init__()
        # Buffer instead of the deprecated autograd ``Variable``.
        self.register_buffer('context_vector', torch.randn(dims, 1, dtype=torch.float32))

    def forward(self, logits):
        """Apply the attention weighting.

        Args:
            logits: tensor whose last axis has size ``dims``.

        Returns:
            Tuple ``(weighted, weights)`` where ``weights`` has the shape of
            ``logits`` with the last axis reduced to size 1 and ``weighted``
            is ``logits * weights`` (broadcast over the last axis).
        """
        # Dot product with the context vector, then the affine map x / 2 + 0.5.
        weights = torch.tensordot(logits, self.context_vector, dims=1) / 2.0 + 0.5
        return torch.mul(logits, weights), weights

    # Backward-compatible alias for the original (misspelled) method name.
    foward = forward

# 3. AuSiL

In [8]:
class AuSiL(nn.Module):
    """Similarity network over a frame-to-frame similarity matrix.

    Pipeline implemented by ``forward``:
      1. L2-normalise, PCA-project and attention-weight both inputs.
      2. Build the pairwise dot-product similarity matrix between them.
      3. Refine that matrix with a small CNN and clamp it to ``[-1, 1]``.
      4. Reduce it to one score per pair with Chamfer similarity.

    Args:
        model_dir: accepted for interface compatibility; currently unused.
    """

    def __init__(self, model_dir=None):
        super(AuSiL, self).__init__()

        # PCA and attention applied before the similarity network
        self.pca_layer = PCA_layer()
        self.att_layer = Attention_layer()

        # CNN applied to the similarity matrix
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1)
        self.fconv = nn.Conv2d(in_channels=128, out_channels=1, kernel_size=1, stride=1)

    @staticmethod
    def _pad(sim):
        """Replicate-pad the two spatial axes by one pixel on every side."""
        return F.pad(sim, (1, 1, 1, 1), mode='replicate')

    def forward(self, cs, ns, labels=None):
        """Score the similarity between each ``cs[i]`` / ``ns[i]`` pair.

        Args:
            cs: tensor of shape ``(batch, frames_c, dims)``.
            ns: tensor of shape ``(batch, frames_n, dims)``.
            labels: accepted for interface compatibility; not used here.

        Returns:
            Tensor of shape ``(batch,)`` with values in ``[-1, 1]``.
        """
        # PCA & self-attention
        cs_emb = self.extract_features(cs)
        ns_emb = self.extract_features(ns)

        # Similarity matrix: (batch, frames_c, frames_n), plus a channel axis
        # so it can be fed to Conv2d as (batch, 1, frames_c, frames_n).
        sim = torch.matmul(cs_emb, ns_emb.transpose(-1, -2))
        sim = sim.unsqueeze(1)

        # NOTE(review): the original only had a bare `#pad` placeholder. One
        # pixel of replicate padding before every 3x3 convolution is used here
        # so that short inputs (e.g. 8 frames) are not shrunk to nothing by the
        # two poolings - confirm against the reference implementation.
        sim = F.max_pool2d(F.relu(self.conv1(self._pad(sim))), 2, stride=2)
        sim = F.max_pool2d(F.relu(self.conv2(self._pad(sim))), 2, stride=2)
        sim = F.relu(self.conv3(self._pad(sim)))
        sim = self.fconv(sim)

        sim = torch.clamp(sim, min=-1, max=1)

        # Chamfer similarity over the two spatial axes of (batch, 1, H, W),
        # then drop the singleton channel axis.
        sim = self.chamfer_similarity(sim, max_axis=3, mean_axis=2)
        return sim.squeeze(1)

    # Backward-compatible alias for the original (misspelled) method name.
    foward = forward

    def extract_features(self, features):
        """L2-normalise, PCA-project, re-normalise and attention-weight.

        Args:
            features: tensor whose last axis is the feature axis.

        Returns:
            The attention-weighted features (the attention weights themselves
            are discarded).
        """
        # PCA
        features = F.normalize(features, dim=-1, p=2)
        features = self.pca_layer(features)
        features = F.normalize(features, dim=-1, p=2)
        # Attention
        features, weights = self.att_layer(features)
        return features

    def chamfer_similarity(self, sim, max_axis=1, mean_axis=0):
        """Max along ``max_axis`` followed by mean along ``mean_axis``.

        Both reduced axes are removed from the result, so a 2-D matrix with
        the default axes yields a 0-dim tensor.

        Args:
            sim: similarity tensor.
            max_axis: axis reduced with ``max``.
            mean_axis: axis reduced with ``mean``.

        Returns:
            ``sim`` with ``max_axis`` and ``mean_axis`` reduced away.
        """
        ndim = sim.dim()
        max_axis = max_axis % ndim
        mean_axis = mean_axis % ndim
        # torch.max over a dim returns (values, indices); keep the values only.
        sim, _ = torch.max(sim, dim=max_axis, keepdim=True)
        sim = torch.mean(sim, dim=mean_axis, keepdim=True)
        # Squeeze the higher axis first so the lower index stays valid.
        for axis in sorted({max_axis, mean_axis}, reverse=True):
            sim = torch.squeeze(sim, axis)
        return sim