In [None]:
%matplotlib inline

In [None]:
!pip uninstall torch torchtext torchvision
!pip install torch==1.7.0
!pip install torchaudio==0.7.0

Uninstalling torch-1.7.0:
  Would remove:
    /usr/local/bin/convert-caffe2-to-onnx
    /usr/local/bin/convert-onnx-to-caffe2
    /usr/local/lib/python3.7/dist-packages/caffe2/*
    /usr/local/lib/python3.7/dist-packages/torch-1.7.0.dist-info/*
    /usr/local/lib/python3.7/dist-packages/torch/*
Proceed (y/n)? y
  Successfully uninstalled torch-1.7.0
Uninstalling torchtext-0.9.1:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/torchtext-0.9.1.dist-info/*
    /usr/local/lib/python3.7/dist-packages/torchtext/*
Proceed (y/n)? y
  Successfully uninstalled torchtext-0.9.1
Uninstalling torchvision-0.9.1+cu101:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/torchvision-0.9.1+cu101.dist-info/*
    /usr/local/lib/python3.7/dist-packages/torchvision.libs/libcudart.c740f4ef.so.10.1
    /usr/local/lib/python3.7/dist-packages/torchvision.libs/libjpeg.ceea7512.so.62
    /usr/local/lib/python3.7/dist-packages/torchvision.libs/libpng16.7f72a3c5.so.16
    /usr/local/lib/python3.7/



# 다 직접 구현한 버전

* [https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial6/Transformers_and_MHAttention.html](https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial6/Transformers_and_MHAttention.html)
*[https://github.com/pytorch/fairseq/blob/c36294ea4fd35eac757f417de9668b32c57d4b3d/fairseq/modules/vggblock.py#L22](https://github.com/pytorch/fairseq/blob/c36294ea4fd35eac757f417de9668b32c57d4b3d/fairseq/modules/vggblock.py#L22)

## ConvolutionEncoder
* positional embedding 대신에 conv layer를 거친 입력을 사용한다.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# def infer_conv_output_dim(conv_op, input_dim, sample_inchannel):
#     sample_seq_len = 200
#     sample_bsz = 10
#     x = torch.randn(sample_bsz, sample_inchannel, sample_seq_len, input_dim)
#     # N x C x H x W
#     # N: sample_bsz, C: sample_inchannel, H: sample_seq_len, W: input_dim
#     x = conv_op(x)
#     # N x C x H x W
#     x = x.transpose(1, 2)
#     # N x H x C x W
#     bsz, seq = x.size()[:2]
#     per_channel_dim = x.size()[3]
#     # bsz: N, seq: H, CxW the rest
#     return x.contiguous().view(bsz, seq, -1).size(-1), per_channel_dim

'''
Two 2-D convolutional blocks, each with two conv. layers with kernel size=3, max-pooling kernel=2. The first block has 64 feature maps while the second has 128
'''
class ConvEncoder(nn.Module):
    def __init__(self, 
                 in_channels, 
                 out_channels,
                 conv_kernel_size,
                 pooling_kernel_size,
                #  input_dim, 
                 conv_stride=1,
                 padding=1,
                 layer_norm=True
                 ):
        super().__init__()
        self.layers = nn.ModuleList()

        # input_channels=1 : mfcc는 2d이므로 채널을 1로 봐야함.
        # kernel size 3
        conv_op = nn.Conv2d(in_channels=in_channels, out_channels=int(out_channels/2), kernel_size=conv_kernel_size, padding=padding)
        self.layers.append(conv_op)
        if layer_norm:
            self.layers.append(nn.LayerNorm(out_channels))
        self.layers.append(nn.ReLU())

        conv_op = nn.Conv2d(in_channels=int(out_channels/2), out_channels=out_channels, kernel_size=conv_kernel_size, padding=padding)
        self.layers.append(conv_op)
        if layer_norm:
            self.layers.append(nn.LayerNorm(out_channels))
        self.layers.append(nn.ReLU())

        # max-pooling kernel=2
        # ceil_mode : when True, will use ceil instead of floor to compute the output shape
        self.maxpool = nn.MaxPool2d(kernel_size=pooling_kernel_size, ceil_mode=True)
        
    def forward(self, x):
        for i, _ in enumerate(self.layers):
            x = self.layers[i](x)
        return x
    

In [None]:
ConvEncoder(in_channels=1, 
            out_channels=64,
            conv_kernel_size=3,
            pooling_kernel_size=2
        #  ,input_dim=30
            )

ConvEncoder(
  (layers): ModuleList(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (5): ReLU()
  )
  (maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
)

## SelfAttentionEncoder

In [None]:
def scaled_dot_product(q, k, v, mask=None):
    d_k = q.size()[-1]
    attn_logits = torch.matmul(q, k.transpose(-2, -1))
    attn_logits = attn_logits / math.sqrt(d_k)
    if mask is not None:
        attn_logits = attn_logits.masked_fill(mask == 0, -9e15)
    attention = F.softmax(attn_logits, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention

In [None]:
class MultiheadAttention(nn.Module):

    def __init__(self, input_dim, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be 0 modulo number of heads."

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # Stack all weight matrices 1...h together for efficiency
        # Note that in many implementations you see "bias=False" which is optional
        self.qkv_proj = nn.Linear(input_dim, 3*embed_dim)
        self.o_proj = nn.Linear(embed_dim, embed_dim)

        self._reset_parameters()


    def _reset_parameters(self):
        # Original Transformer initialization, see PyTorch documentation
        nn.init.xavier_uniform_(self.qkv_proj.weight)
        self.qkv_proj.bias.data.fill_(0)
        nn.init.xavier_uniform_(self.o_proj.weight)
        self.o_proj.bias.data.fill_(0)


    def forward(self, x, mask=None, return_attention=False):
        batch_size, seq_length, embed_dim = x.size()
        qkv = self.qkv_proj(x)

        # Separate Q, K, V from linear output
        qkv = qkv.reshape(batch_size, seq_length, self.num_heads, 3*self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3) # [Batch, Head, SeqLen, Dims]
        q, k, v = qkv.chunk(3, dim=-1)

        # Determine value outputs
        values, attention = scaled_dot_product(q, k, v, mask=mask)
        values = values.permute(0, 2, 1, 3) # [Batch, SeqLen, Head, Dims]
        values = values.reshape(batch_size, seq_length, embed_dim)
        o = self.o_proj(values)

        if return_attention:
            return o, attention
        else:
            return o

In [None]:
MultiheadAttention(input_dim=200
                   ,embed_dim=512
                   , num_heads=8)

MultiheadAttention(
  (qkv_proj): Linear(in_features=200, out_features=1536, bias=True)
  (o_proj): Linear(in_features=512, out_features=512, bias=True)
)

In [None]:
'''
(2) 10 encoder transformer blocks all with transformer dim=1024, 16 heads, intermediate ReLU layer size=2048,
'''
class SelfAttentionEncoder(nn.Module):
    def __init__(self, input_dim, embed_dim, num_heads=8, dropout=0.15):
        super().__init__()
        # input_dim - Dimensionality of the input
        self.mha = MultiheadAttention(input_dim, embed_dim, num_heads)
        self.dropout = nn.Dropout(dropout)

        self.layerNorm1 = nn.LayerNorm(embed_dim) # 앞선 것과 같은 dim
        self.layerNorm2 = nn.LayerNorm(embed_dim) # 앞선 것과 같은 dim
        
        self.fc = nn.Linear(embed_dim, embed_dim)

        self.relu = nn.ReLU(inplace=True)

        
    def forward(self, x):

        x1 = self.mha(x)
        x1 = self.dropout(x1)

        x = torch.cat((x, x1), dim=1)
        x = self.layerNorm1(x)

        x2 = self.fc(x)
        x2 = self.relu(x2)
        x2 = self.fc(x)
        x2 = self.dropout(x2)

        x = torch.cat((x, x2), dim=1)
        x = self.layerNorm2(x)

        return x



In [None]:
sae = SelfAttentionEncoder(input_dim=200, 
                           embed_dim=512)
sae

SelfAttentionEncoder(
  (mha): MultiheadAttention(
    (qkv_proj): Linear(in_features=200, out_features=1536, bias=True)
    (o_proj): Linear(in_features=512, out_features=512, bias=True)
  )
  (dropout): Dropout(p=0.15, inplace=False)
  (layerNorm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (layerNorm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (fc): Linear(in_features=512, out_features=512, bias=True)
  (relu): ReLU(inplace=True)
)

## AudioEncoder 최종

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from typing import Any, Dict, Optional, Tuple, Union

In [None]:
class BasicAttention(nn.Module):
    """
    Implements simple/classical attention.
    """

    def __init__(
        self,
        dim: int = 1,
        attn: str = 'cosine',
        residual: bool = False,
        get_weights: bool = True,
    ):
        super().__init__()
        if attn == 'cosine':
            self.cosine = nn.CosineSimilarity(dim=dim)
        self.attn = attn
        self.dim = dim
        self.get_weights = get_weights
        self.residual = residual

    def forward(
        self,
        xs: torch.Tensor,
        ys: torch.Tensor,
        mask_ys: Optional[torch.Tensor] = None,
        values: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Compute attention.
        Attend over ys with query xs to obtain weights, then apply weights to
        values (ys if yalues is None)
        Args:
            xs: B x query_len x dim (queries)
            ys: B x key_len x dim (keys)
            mask_ys: B x key_len (mask)
            values: B x value_len x dim (values); if None, default to ys
        """
        bsz = xs.size(0)
        y_len = ys.size(1)
        x_len = xs.size(1)
        if self.attn == 'cosine':
            l1 = self.cosine(xs, ys).unsqueeze(self.dim - 1)
        else:
            l1 = torch.bmm(xs, ys.transpose(1, 2))
            if self.attn == 'sqrt':
                d_k = ys.size(-1)
                l1 = l1 / math.sqrt(d_k)
        if mask_ys is not None:
            attn_mask = (mask_ys == 0).view(bsz, 1, y_len)
            attn_mask = attn_mask.repeat(1, x_len, 1)
            l1.masked_fill_(attn_mask, neginf(l1.dtype))
        l2 = F.softmax(l1, dim=self.dim, dtype=torch.float).type_as(l1)
        if values is None:
            values = ys
        lhs_emb = torch.bmm(l2, values)

        # # add back the query
        if self.residual:
            lhs_emb = lhs_emb.add(xs)

        if self.get_weights:
            return lhs_emb.squeeze(self.dim - 1), l2
        else:
            return lhs_emb.squeeze(self.dim - 1)

In [None]:
class PolyBasicAttention(BasicAttention):
    """
    Override basic attention to account for edge case for polyencoder.
    """

    def __init__(self, poly_type, n_codes, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.poly_type = poly_type
        self.n_codes = n_codes

    def forward(self, *args, **kwargs):
        """
        Forward pass.
        Account for accidental dimensionality reduction when num_codes is 1 and the
        polyencoder type is 'codes'
        """
        lhs_emb = super().forward(*args, **kwargs)
        if self.poly_type == 'codes' and self.n_codes == 1 and len(lhs_emb.shape) == 2:
            lhs_emb = lhs_emb.unsqueeze(self.dim - 1)
        return lhs_emb

In [None]:
"""Near infinity, useful as a large penalty for scoring when inf is bad."""
NEAR_INF = 1e20
NEAR_INF_FP16 = 65504

def neginf(dtype: torch.dtype) -> float:
    """
    Return a representable finite number near -inf for a dtype.
    """
    if dtype is torch.float16:
        return -NEAR_INF_FP16
    else:
        return -NEAR_INF

In [None]:
class AudioEncoder(nn.Module):
    def __init__(self,
                 poly_n_codes, # m, the number of global contxt features
                #  poly_attention_type,
                 poly_attention_num_heads,
                #  codes_attention_type,
                 codes_attention_num_heads,
                 embed_dim, 

                 num_conv_layers, 
                 in_channels, 
                 conv_kernel_size=3, 
                 pooling_kernel_size=2,
                 num_att_layers=6, 
                 dropout=0.15):
        super(AudioEncoder, self).__init__()

        self.n_codes = poly_n_codes
        self.attention_num_heads = poly_attention_num_heads
        self.codes_attention_num_heads = codes_attention_num_heads

        # the codes
        codes = torch.empty(self.n_codes, embed_dim)
        codes = torch.nn.init.uniform_(codes)
        self.codes = torch.nn.Parameter(codes)

        # attention for the codes
        self.code_attention = PolyBasicAttention(poly_type='codes', n_codes=self.n_codes, dim=2, attn='basic', get_weights=False)

        # The final attention (the one that takes the candidate as key)
        self.attention = MultiheadAttention(embed_dim, embed_dim, self.attention_num_heads)

        self.encoder = nn.ModuleList()

        # two 2d-conv blocks
        # in_channels=1, 
        # out_channels=64,
        # conv_kernel_size=3,
        # pooling_kernel_size=2,
        for i in range(num_conv_layers):
            self.encoder.append(
                ConvEncoder(
                    in_channels=in_channels, 
                    out_channels=embed_dim,
                    conv_kernel_size=conv_kernel_size,
                    pooling_kernel_size=pooling_kernel_size
                    )
                )
        
        # SelfAttentionEncoder * num_att_layers
        for i in range(num_att_layers):
            self.encoder.append(
                SelfAttentionEncoder(
                    input_dim=embed_dim,
                    embed_dim=embed_dim, 
                    num_heads=self.attention_num_heads, 
                    dropout=dropout)
                )
            
    def attend(self, attention_layer, queries, keys, values, mask):
        """
        Apply attention.
        :param attention_layer:
            nn.Module attention layer to use for the attention
        :param queries:
            the queries for attention
        :param keys:
            the keys for attention
        :param values:
            the values for attention
        :param mask:
            mask for the attention keys
        :return:
            the result of applying attention to the values, with weights computed
            wrt to the queries and keys.
        """
        if keys is None:
            keys = values
        if isinstance(attention_layer, PolyBasicAttention):
            return attention_layer(queries, keys, mask_ys=mask, values=values)
        elif isinstance(attention_layer, MultiHeadAttention):
            return attention_layer(queries, keys, values, mask)[0]
        else:
            raise Exception('Unrecognized type of attention')

                
    def forward(self, x, label=None):
        # x = [ctxt, next]
        # next: candidate
        ctxt, next = x
        
        # cand mfcc를 conv->selfattention encoder를 거친 emb
        cand_emb = self.encoder(next)

        # ctxt mfcc를 conv->self attention encoder를 거친 emb
        ctxt_out = self.encoder(next)

        # m개 만큼 context code를 반복
        # ctxt_out 값과 code를 내적한 값들의 softmax한 벡터 (w_1,...,w_m)를 이전 레이어 결과값(ctxt_out)과 곱해서 합한다.
        # 이 값이 m개의 global context features
        bsz = cand_emb.size(0)
        global_ctxts = self.attend(attention_layer=self.code_attention , 
                                   queries=self.codes.repeat(bsz, 1, 1), 
                                   keys=ctxt_out,
                                   values=ctxt_out, 
                                   mask=None)        

        # m개의 global context features를 cand_emb와 내적한 값을 softmax한 벡터를 (w_1,...,w_m)라 할 때, 이 가중치 값과 global contxt features를 곱해서 합한다.
        # 이 값이 최종 ctxt_emb
        ctxt_emb = self.attend(attention_layer=self.attention ,
                               queries=cand_emb,
                               keys=global_ctxts,
                               values=global_ctxts,
                               mask=None)        

        # score: cand_emb와 ctxt_emb 간 cosine similarity값 (반환값)
        if label is None:
            scores = torch.sum(ctxt_emb * cand_emb, 2)
            return scores
        else:
            # we are recycling responses for faster training
            # we repeat responses for batch_size times to simulate test phase
            # so that every context is paired with batch_size responses
            cand_emb = cand_emb.permute(1, 0, 2) # [1, bs, dim]
            cand_emb = cand_emb.expand(bsz, bsz, cand_emb.shape[2]) # [bs, bs, dim]
            ctxt_emb = self.attend(attention_layer=self.attention, 
                                   queries=cand_emb, 
                                   keys=global_ctxts,
                                   values=global_ctxts,
                                   mask=None)        
            ctxt_emb = (cand_emb, embs, embs).squeeze() # [bs, bs, dim]
            dot_product = torch.sum(ctxt_emb * cand_emb, 2)
            loss = F.log_softmax(dot_product, dim=-1)
            loss = (-loss.sum(dim=1)).mean()
            return loss


In [None]:
AudioEncoder(poly_n_codes=16, # m, the number of global contxt features / 16, 64, 360
            #  poly_attention_type, # MultiHeadAttention으로 고정
            poly_attention_num_heads=8,
            #  codes_attention_type, # PolyBasicAttention으로 고정
            codes_attention_num_heads=2,
            embed_dim=512, 
            num_conv_layers=2, 
            in_channels=1, 
            conv_kernel_size=3, 
            pooling_kernel_size=2,
            num_att_layers=6, 
            dropout=0.15)


AudioEncoder(
  (code_attention): PolyBasicAttention()
  (attention): MultiheadAttention(
    (qkv_proj): Linear(in_features=512, out_features=1536, bias=True)
    (o_proj): Linear(in_features=512, out_features=512, bias=True)
  )
  (encoder): ModuleList(
    (0): ConvEncoder(
      (layers): ModuleList(
        (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (2): ReLU()
        (3): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (4): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (5): ReLU()
      )
      (maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
    )
    (1): ConvEncoder(
      (layers): ModuleList(
        (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (2): ReLU()
        (3): Conv2d(256, 512, kernel_si

# hyperparameters
* [원논문](https://arxiv.org/pdf/1904.11660.pdf)
* For model optimization, we use the AdaDelta algorithm with fixed learning rate=1.0 and gradient clipping at 10.0. 
* 80 epochs

