In [1]:
# import sys
# import os

In [2]:
# sys.path.append('/home/super/waterffle/playlist')

In [3]:
# print(sys.path)

In [4]:
# from preprocessing import load_poly_encoder_dataset
# now_mfcc_list, next_mfcc_list, label_list = load_poly_encoder_dataset(2800)

In [5]:
# bsz * 10 개만 할당
# now_mfcc_list = torch.tensor(now_mfcc_list[:320])
# next_mfcc_list = torch.tensor(next_mfcc_list[:320])
# label_list = torch.tensor(label_list[:320])

In [6]:
# gpu 메모리 일부로 제한
# torch.cuda.set_per_process_memory_fraction(0.5,1)

In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models

In [2]:
from preprocessing import load_poly_encoder_dataset

In [3]:
device_num = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

In [4]:
last_pad_length = 938
num_feature = 128

In [5]:
cs, ns, labels = load_poly_encoder_dataset(last_pad_length, 128)

  0%|          | 2/32000 [00:00<30:34, 17.45it/s]



100%|██████████| 32000/32000 [01:15<00:00, 422.54it/s] 


In [6]:
# bsz * 10 개만 할당
small_size = 3200
cs = torch.tensor(cs[:small_size])
ns = torch.tensor(ns[:small_size])
labels = torch.tensor(labels[:small_size])

In [7]:
cs = cs.view(-1, 1, last_pad_length, num_feature)
ns = ns.view(-1, 1, last_pad_length, num_feature)

In [8]:
cs.size(), ns.size()

(torch.Size([3200, 1, 938, 128]), torch.Size([3200, 1, 938, 128]))

In [9]:
cs[0].size()

torch.Size([1, 938, 128])

-----------------------------

In [12]:
import logging

logger = logging.getLogger(__name__)

In [13]:
def batchify(data, bsz):
    cs_rep, ns_rep, labels = data
    length = cs_rep.size()[0]
    for i in range(0, length, bsz):
        yield cs_rep[i:i+bsz], ns_rep[i:i+bsz], labels[i:i+bsz]

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from typing import Any, Dict, Optional, Tuple, Union
from __future__ import absolute_import, division, print_function, unicode_literals

from collections.abc import Iterable
from itertools import repeat

"""Near infinity, useful as a large penalty for scoring when inf is bad."""
NEAR_INF = 1e20
NEAR_INF_FP16 = 65504

def _pair(v):
    if isinstance(v, Iterable):
        assert len(v) == 2, "len(v) != 2"
        return v
    return tuple(repeat(v, 2))


def infer_conv_output_dim(conv_op, input_dim, sample_inchannel):
    sample_seq_len = 2000
    sample_bsz = 32
    x = torch.randn(sample_bsz, sample_inchannel, sample_seq_len, input_dim)
    # N x C x H x W
    # N: sample_bsz, C: sample_inchannel, H: sample_seq_len, W: input_dim
    x = conv_op(x)
    # N x C x H x W
    x = x.transpose(1, 2)
    # N x H x C x W
    bsz, seq = x.size()[:2]
    per_channel_dim = x.size()[3]
    # bsz: N, seq: H, CxW the rest
    return x.contiguous().view(bsz, seq, -1).size(-1), per_channel_dim


'''
Two 2-D convolutional blocks, each with two conv. layers with kernel size=3, max-pooling kernel=2. The first block has 64 feature maps while the second has 128
'''
class ConvEncoder(nn.Module):
    """
    VGG motibated cnn module https://arxiv.org/pdf/1409.1556.pdf
    code : https://github.com/pytorch/fairseq/blob/c36294ea4fd35eac757f417de9668b32c57d4b3d/fairseq/modules/vggblock.py#L38
    Args:
        in_channels: (int) number of input channels (typically 1)
        out_channels: (int) number of output channels
        conv_kernel_size: convolution channels
        pooling_kernel_size: the size of the pooling window to take a max over
        num_conv_layers: (int) number of convolution layers
        input_dim: (int) input dimension
        conv_stride: the stride of the convolving kernel.
            Can be a single number or a tuple (sH, sW)  Default: 1
        padding: implicit paddings on both sides of the input.
            Can be a single number or a tuple (padH, padW). Default: None
        layer_norm: (bool) if layer norm is going to be applied. Default: False
    Shape:
        Input: BxCxTxfeat, i.e. (batch_size, input_size, timesteps, features)
        Output: BxCxTxfeat, i.e. (batch_size, input_size, timesteps, features)
    """
    def __init__(self, 
                 in_channels, 
                 out_channels,
                 conv_kernel_size,
                 num_conv_layers,
                 pooling_kernel_size,
                 input_dim=None, 
                 conv_stride=1,
                 padding=None,
                 layer_norm=False
                 ):
        assert (
            input_dim is not None
        ), "Need input_dim for LayerNorm and infer_conv_output_dim"
        super(ConvEncoder, self).__init__()
        
        conv_kernel_size = _pair(conv_kernel_size)
        pooling_kernel_size = _pair(pooling_kernel_size)
        padding = (
            tuple(e // 2 for e in conv_kernel_size)
            if padding is None
            else _pair(padding)
        )
        conv_stride = _pair(conv_stride)
        
        self.layers = nn.ModuleList()

        # input_channels=1 : mfcc는 2d이므로 채널을 1로 봐야함.
        # kernel size 3
        for layer in range(num_conv_layers):
            conv_op = nn.Conv2d(
                in_channels if layer == 0 else out_channels,
                out_channels,
                conv_kernel_size,
                stride=conv_stride,
                padding=padding,
            )
            self.layers.append(conv_op)
            if layer_norm:
                self.conv_output_dim, per_channel_dim = infer_conv_output_dim(
                    conv_op, input_dim, in_channels if layer == 0 else out_channels
                )
                self.layers.append(nn.LayerNorm(per_channel_dim))
                input_dim = per_channel_dim
            #self.layers.append(nn.Dropout(p=0.2))
            self.layers.append(nn.ReLU())
        
        if pooling_kernel_size is not None:
            # ceil_mode : when True, will use ceil instead of floor to compute the output shape
            pool_op = nn.MaxPool2d(kernel_size=pooling_kernel_size, ceil_mode=True)
            self.layers.append(pool_op)
            self.total_output_dim, self.output_dim = infer_conv_output_dim(
                pool_op, input_dim, out_channels
            )
                
    def forward(self, x):
        for i, _ in enumerate(self.layers):
            x = self.layers[i](x)
        return x


class BasicAttention(nn.Module):
    """
    Implements simple/classical attention.
    """

    def __init__(
        self,
        dim: int = 1,
        attn: str = 'cosine',
        residual: bool = False,
        get_weights: bool = True,
    ):
        super().__init__()
        if attn == 'cosine':
            self.cosine = nn.CosineSimilarity(dim=dim)
        self.attn = attn
        self.dim = dim
        self.get_weights = get_weights
        self.residual = residual

    def forward(
        self,
        xs: torch.Tensor,
        ys: torch.Tensor,
        mask_ys: Optional[torch.Tensor] = None,
        values: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Compute attention.
        Attend over ys with query xs to obtain weights, then apply weights to
        values (ys if yalues is None)
        Args:
            xs: B x query_len x dim (queries)
            ys: B x key_len x dim (keys)
            mask_ys: B x key_len (mask)
            values: B x value_len x dim (values); if None, default to ys
        """
        bsz = xs.size(0)
        y_len = ys.size(1)
        x_len = xs.size(1)
        if self.attn == 'cosine':
            l1 = self.cosine(xs, ys).unsqueeze(self.dim - 1)
        else:
            l1 = torch.bmm(xs, ys.transpose(1, 2))
            if self.attn == 'sqrt':
                d_k = ys.size(-1)
                l1 = l1 / math.sqrt(d_k)
        if mask_ys is not None:
            attn_mask = (mask_ys == 0).view(bsz, 1, y_len)
            attn_mask = attn_mask.repeat(1, x_len, 1)
            l1.masked_fill_(attn_mask, neginf(l1.dtype))
        l2 = F.softmax(l1, dim=self.dim, dtype=torch.float).type_as(l1)
        if values is None:
            values = ys
        lhs_emb = torch.bmm(l2, values)

        # # add back the query
        if self.residual:
            lhs_emb = lhs_emb.add(xs)
        
        res = lhs_emb.squeeze(self.dim - 1)
        if self.get_weights:
            return res, l2
        else:
            return res
        

class PolyBasicAttention(BasicAttention):
    """
    Override basic attention to account for edge case for polyencoder.
    """

    def __init__(self, poly_type, n_codes, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.poly_type = poly_type
        self.n_codes = n_codes

    def forward(self, *args, **kwargs):
        """
        Forward pass.
        Account for accidental dimensionality reduction when num_codes is 1 and the
        polyencoder type is 'codes'
        """
        lhs_emb = super().forward(*args, **kwargs)
        if self.poly_type == 'codes' and self.n_codes == 1 and len(lhs_emb.shape) == 2:
            lhs_emb = lhs_emb.unsqueeze(self.dim - 1)
        return lhs_emb
    

def neginf(dtype: torch.dtype) -> float:
    """
    Return a representable finite number near -inf for a dtype.
    """
    if dtype is torch.float16:
        return -NEAR_INF_FP16
    else:
        return -NEAR_INF
    

class AudioEncoder(nn.Module):
    '''
    code : https://github.com/pytorch/fairseq/blob/c36294ea4fd35eac757f417de9668b32c57d4b3d/examples/speech_recognition/models/vggtransformer.py#L271
    '''
    def __init__(self,
                 poly_n_codes, # m, the number of global contxt features
                 poly_attention_num_heads,
                 num_att_layers,
                 codes_attention_num_heads,
                 embed_dim, 
                
                 input_feat_per_channel,
                 num_conv_block,
                 num_conv_layers, 
                 in_channels, 
                 out_channels,
                 conv_kernel_size=3, 
                 pooling_kernel_size=2,
                 layer_norm=False,
                 dropout=0.1,
                 reduction_type='first' # first, avg, max
                ):
        super(AudioEncoder, self).__init__()
                
        self.in_channels = in_channels
        self.input_dim = input_feat_per_channel
        self.reduction_type = reduction_type
        
        self.conv_encoder_block = nn.ModuleList()
        
        for i in range(num_conv_block):
            self.conv_encoder_block.append(
                ConvEncoder(
                    in_channels=in_channels, 
                    out_channels=out_channels,
                    conv_kernel_size=conv_kernel_size,
                    num_conv_layers=num_conv_layers,
                    pooling_kernel_size=pooling_kernel_size,
                    input_dim=input_feat_per_channel,
                    layer_norm=layer_norm
                    )
                )
            in_channels = out_channels
            input_feat_per_channel = self.conv_encoder_block[-1].output_dim
        
        self.conv_encoder_block = nn.Sequential(*self.conv_encoder_block)
        
        # conv_output_dim is the output dimension of conv encoder
        conv_output_dim = self.infer_conv_output_dim(self.in_channels, self.input_dim)
        
        self.n_codes = poly_n_codes
        self.attention_num_heads = poly_attention_num_heads
        self.codes_attention_num_heads = codes_attention_num_heads

        # the codes
        codes = torch.empty(self.n_codes, embed_dim)
        codes = torch.nn.init.uniform_(codes)
        self.codes = torch.nn.Parameter(codes)

        # attention for the codes
        self.code_attention = PolyBasicAttention(poly_type='codes', n_codes=self.n_codes, dim=2, attn='basic', get_weights=False)

        # The final attention (the one that takes the candidate as key)
        self.attention = nn.MultiheadAttention(embed_dim=embed_dim, 
                                               num_heads=self.attention_num_heads)
                                               #dropout=dropout)

        
        self.att_encoder_block = nn.ModuleList()
        self.sigmoid = nn.Sigmoid()
        
        # conv encoder를 거쳐나온 데이터의 dim과 embed_dim이 다를 경우 맞춰준다
        if conv_output_dim != embed_dim:
            self.att_encoder_block.append(nn.Linear(conv_output_dim, embed_dim))
        
        # SelfAttentionEncoder * num_att_layers
        for i in range(num_att_layers):
            self.att_encoder_block.append(
                torch.nn.TransformerEncoderLayer(d_model=embed_dim, 
                                                 nhead=self.attention_num_heads, 
                                                 dim_feedforward=embed_dim*4, 
                                                 dropout=dropout, 
                                                 activation='gelu')
            )
                            
        self.att_encoder_block = nn.Sequential(*self.att_encoder_block)
            
    def attend(self, attention_layer, queries, keys, values, mask):
        """
        Apply attention.
        :param attention_layer:
            nn.Module attention layer to use for the attention
        :param queries:
            the queries for attention
        :param keys:
            the keys for attention
        :param values:
            the values for attention
        :param mask:
            mask for the attention keys
        :return:
            the result of applying attention to the values, with weights computed
            wrt to the queries and keys.
        """
        if keys is None:
            keys = values
        if isinstance(attention_layer, PolyBasicAttention):
            return attention_layer(queries, keys, values=values, mask_ys=mask)
        elif isinstance(attention_layer, nn.MultiheadAttention):
            return attention_layer(query=queries, key=keys, value=values, attn_mask=mask)[0]

        else:
            raise Exception('Unrecognized type of attention')

    def encode(self, x_raw):
            # x_raw = [current song, next song]
            # next: candidate
            cs, ns, label = x_raw

            # padded tensor
            # B, C, T, F
            bsz, in_channels, max_seq_len, _ = ns.size()
        
            # cand mfcc를 conv encoder를 거친 emb
            cand_emb = self.conv_encoder_block(ns)
                        
            bsz, _, output_seq_len, _ = cand_emb.size()

            # (B, C, T, F) -> (B, T, C, F) -> (B, T, C*F)
            cand_emb = cand_emb.transpose(1, 2)
            cand_emb = cand_emb.contiguous().view(bsz, output_seq_len, -1)

            # transformer encoder
            cand_emb = self.att_encoder_block(cand_emb)
            
            # reduction : first, avg, max
            if self.reduction_type=='first':
                cand_emb = cand_emb[:,0,:]
            elif self.reduction_type == 'avg':
                cand_emb = torch.mean(cand_emb, dim=1)
            elif self.reduction_type == 'max':
                cand_emb = torch.max(cand_emb, dim=1).values
            else:
                raise KeyError('Not Registered reduction_type. Capable options : first, avg, and max')
            #print(f'cand_emb : {cand_emb.size()}')
            cand_emb = cand_emb.view(cand_emb.size()[0], 1, cand_emb.size()[1])
            
            # ctxt mfcc를 conv encoder를 거친 emb
            ctxt_out = self.conv_encoder_block(cs)
            b, c, t, f = ctxt_out.size()
            
            # (B, C, T, F) -> (B, T, C, F) -> (B, T, C*F)
            ctxt_out = ctxt_out.transpose(1, 2)
            ctxt_out = ctxt_out.contiguous().view(b, t, -1)
            
            # transformer encoder
            ctxt_out = self.att_encoder_block(ctxt_out)
            
            return ctxt_out, cand_emb, label
                
    def forward(self, 
                x_raw=None, 
                x_rep=None
               ):
        '''
        encoding 과정과 그 이후 과정을 나눈 이유는 먼저 계속 사용하는 피쳐를 처리해놓고 재사용하는 과정을 거치기 위해서이다.
        '''
        if x_raw is not None:
            
            return self.encode(x_raw)
        elif x_rep is not None:
            ctxt_out, cand_emb = x_rep

            # m개 만큼 context code를 반복
            # ctxt_out 값과 code를 내적한 값들의 softmax한 벡터 (w_1,...,w_m)를 이전 레이어 결과값(ctxt_out)과 곱해서 합한다.
            # 이 값이 m개의 global context features
            bsz = cand_emb.size(0)
            # global_ctxts = [b, poly_m, dim]
            global_ctxts = self.attend(attention_layer=self.code_attention , 
                                       queries=self.codes.repeat(bsz, 1, 1), 
                                       keys=ctxt_out,
                                       values=ctxt_out, 
                                       mask=None) 

            global_ctxts = global_ctxts.transpose(0,1)
            # cand_emb = [b,1,m] -> [1,b,m]
            cand_emb = cand_emb.transpose(0,1)
            
            # m개의 global context features를 cand_emb와 내적한 값을 softmax한 벡터를 (w_1,...,w_m)라 할 때, 이 가중치 값과 global contxt features를 곱해서 합한다.
            # 이 값이 최종 ctxt_emb
            # ctxt_emb = cand_emb와 같은 shape
            ctxt_emb = self.attend(attention_layer=self.attention ,
                                   queries=cand_emb,
                                   keys=global_ctxts,
                                   values=global_ctxts,
                                   mask=None)        
            
            #print('cand', cand_emb.size())
            #print('ctxt', ctxt_emb.size())
            
            # score: cand_emb와 ctxt_emb 간 cosine similarity값 (반환값)
            # scores = [1, bs]
            scores = torch.sum(ctxt_emb * cand_emb, -1)
            #print('score', scores.size())
            
            return scores
        else:
            raise Exception('Unsupported operation')
    
    def infer_conv_output_dim(self, in_channels, input_dim):
        sample_seq_len = 200
        sample_bsz = 10
        x = torch.randn(sample_bsz, in_channels, sample_seq_len, input_dim)
        for i, _ in enumerate(self.conv_encoder_block):
            x = self.conv_encoder_block[i](x)
        # (B, C, T, F) -> (B, T, C, F) -> (B, T, C*F)
        x = x.transpose(1, 2)
        mb, seq = x.size()[:2]
        return x.contiguous().view(mb, seq, -1).size(-1)


In [16]:
model = AudioEncoder(
                     poly_n_codes=64, # m, the number of global contxt features
                     poly_attention_num_heads=8, 
                     codes_attention_num_heads=2,
                     num_att_layers=6,
                     embed_dim=512, 
                     input_feat_per_channel=num_feature, # feature vector dimension
                     num_conv_block=2,
                     num_conv_layers=2, 
                     in_channels=1, 
                     out_channels=32,
                     conv_kernel_size=3, 
                     pooling_kernel_size=2,
                     layer_norm=True,
                     reduction_type='first'
)
print(model)

AudioEncoder(
  (conv_encoder_block): Sequential(
    (0): ConvEncoder(
      (layers): ModuleList(
        (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (2): ReLU()
        (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (4): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (5): ReLU()
        (6): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=True)
      )
    )
    (1): ConvEncoder(
      (layers): ModuleList(
        (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (2): ReLU()
        (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (4): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (5): ReLU()
        (6): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=

In [17]:
from sklearn.metrics import accuracy_score

In [18]:
def eval_model(model, dataloader, criterion):
    # layer 중 dropout layer와 같이 학습 시에는 사용하는데 inference할 때는 사용하지 않는 경우를 구분해주기 위함
    model.eval()
    
    n_steps = 0
    eval_loss = 0.0
    acc = 0.0

    # autograd engine을 끔(gradient 계산하지 않음) --> 메모리 사용량을 줄이고 연산 속도를 높이기 위함
    with torch.no_grad():
        # input은 이미 1차 임베딩은 되어 있는 것을 가져왔다고 가정한다.
        for idx, (batch_cs, batch_ns, batch_label) in enumerate(dataloader):
            #print('batch idx', idx)
            '''
            n_step+=1
            batch_labels.append(label) 
            scores.append(model(x_rep=[cs_rep, ns_rep]))

            if (idx+1) % batch_size == 0:
                mx_i = np.argmax(scores)
                predict_batch = [0]*batch_size
                predict_batch[mx_i] = 1
                predicts.extend(predict_batch)
                labels.extend(batch_labels)
                for b in range(batch_size):
                    if b == mx_i:
                        loss = criterion(1, batch_labels[b])
                    else:
                        loss = criterion(0, batch_labels[b])
                    
                    eval_loss += loss
            '''
            n_steps+=1
            # validation set to mfcc encoder
            cs_rep, ns_rep, label = model(x_raw=[batch_cs, batch_ns, batch_label])
            
            batch_label = torch.LongTensor([torch.argmax(label).item()]).to(device_num)

            score = model(x_rep=[cs_rep, ns_rep])

            #print('score', score)
            
            # cross entropy          
            loss = criterion(score, batch_label)
            
            eval_loss += loss.item()
            
            predict = torch.argmax(score, axis=1)
            #print(predict)
            #print(batch_label)
            if predict == batch_label:
                acc += 1


    results = {
        'eval_accuracy': acc / n_steps,
        'eval_loss': eval_loss / n_steps
    }
    return results
    

In [19]:
import time

def train(model, raw_data, optimizer, epochs, batch_size, output_dir, val_ratio=0.2, eval_freq=10, fp16=False, fp16_opt_level='O1'):
    # fp16=True면 amp를 통한 mixed preicision training을 한다는 의미
    # 사용 조건 : Volta 이상의 nvidia 그래픽 카드(v100, rtx2080ti, 등)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)
    
    def train_batch(data, bsz, shuffle=True):
        cs_rep, ns_rep, labels = data
        length = cs_rep.size()[0]
        
        for i in range(0, length, bsz):
            cs_batch = cs_rep[i:i+bsz]
            ns_batch = ns_rep[i:i+bsz]
            label_batch = labels[i:i+bsz]
            
            if shuffle:
                indexes = torch.randperm(bsz)              
                yield [cs_batch[indexes], ns_batch[indexes], label_batch[indexes]]
            else:
                yield [cs_batch, ns_batch, label_batch]
            #yield model(x_raw=[cs_rep[i:i+bsz], ns_rep[i:i+bsz], labels[i:i+bsz]])
            
    state_save_path = os.path.join(output_dir, '{}_{}_{}_pytorch_model.bin'.format('polyencoder', '64', 'dh'))
            
    best_eval_loss = float('inf')
    
    cs, ns, labels = raw_data

    start_time = time.time()
    # 재사용 가능한 임베딩들은 미리 뽑기
    trainset = None
    valset = None
    if val_ratio > 0:
        train_len = int((1-val_ratio)*len(labels))
        val_len = int(val_ratio*len(labels))
        trainset = [cs[:train_len], ns[:train_len], labels[:train_len]]
        valset = [cs[train_len:], ns[train_len:], labels[train_len:]]
        #trainset = [cs[val_len:], ns[val_len:], labels[val_len:]]
        #valset = [cs[:val_len], ns[:val_len], labels[:val_len]]
        print('train', trainset[0].shape, trainset[1].shape, trainset[2].shape)
        print('val', valset[0].shape, valset[1].shape, valset[2].shape)
    else:
        trainset = raw_data
    
    batch_labels = []
    scores = []
    global_step = 0
    #TODO : 나중에 tqdm으로 바꾸기
    
    for epoch in range(1, epochs+1):
        model.zero_grad()
        start_time = time.time()
        train_batches = train_batch(trainset, batch_size)
            
        total_loss = 0.0
        accuracy = 0.0
        local_step = 0
        # batch == 1set (data 32)
        for idx, (batch_cs, batch_ns, batch_label) in enumerate(train_batches):
            model.train()
            local_step += 1
            # mfcc embedding encoder
            cs_rep, ns_rep, label = model(x_raw=[batch_cs, batch_ns, batch_label])

            # label = [32]
            #label = torch.argmax(label).item().long()
            batch_label = torch.LongTensor([torch.argmax(label).item()]).to(device_num)

            # poly encoder
            # scores = [1,32] 
            scores = model(x_rep=[cs_rep, ns_rep])
            #print('scores', scores)
            
            # cross entropy          
            loss = criterion(scores, batch_label)
            
            total_loss += loss.item()
            
            if fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            
            optimizer.step()
            optimizer.zero_grad()
            
            predict = torch.argmax(scores, axis=1)
            #print(predict)
            #print(batch_label)
            if predict == batch_label:
                accuracy += 1

            '''
            # Scheduling
            if global_step and global_step % eval_freq == 0:
                print('score', scores)
                if valset is not None:
                    val_batches = train_batch(valset, batch_size)
                eval_results = eval_model(model, val_batches, criterion)
                # torch.optim.lr_scheduler.ReduceLROnPlateau는 val_loss를 입력으로 받음
                # 성능 향상이 없을 때 학습율을 낮춤.
                scheduler.step(eval_results['eval_loss'])
                print(f'Global Step {global_step} Eval loss : {eval_results["eval_loss"]}, Eval accuracy : {eval_results["eval_accuracy"]}')
                if eval_results["eval_loss"] < best_eval_loss:
                    best_eval_loss = eval_results['eval_loss']
                    torch.save(model.state_dict(), state_save_path)
                
            global_step += 1
            '''
        total_loss = total_loss / local_step
        accuracy = accuracy / local_step
        with open('log_32000_210903.txt', 'w') as f:
            print(f'\n[Epoch {epoch}] | total_Loss {total_loss} | accuracy {accuracy} | time {time.time()-start_time}\n')
            f.write(f'\n[Epoch {epoch}] | total_Loss {total_loss} | accuracy {accuracy} | time {time.time()-start_time}\n')
        
            if valset is not None:
                val_batches = train_batch(valset, batch_size, shuffle=True)
                eval_results = eval_model(model, val_batches, criterion)
                print(f'Eval loss : {eval_results["eval_loss"]}, Eval accuracy : {eval_results["eval_accuracy"]}')
                f.write(f'Eval loss : {eval_results["eval_loss"]}, Eval accuracy : {eval_results["eval_accuracy"]}')

#         if eval_results["eval_loss"] < best_eval_loss:
#             best_eval_loss = eval_results['eval_loss']
#             # save model.state_dict()
#             #torch.save(model.state_dict(), state_save_path)
# #             print(f'[Saving at] {state_save_path}')
        print('---------------------------------\n')
    
        '''
            # eval로 scheduler를 조절해야 하는가?
            #scheduler.step()
            # 해줘야 하나
            #model.zero_grad()
            
            global_step += 1
            
            eval_results = eval_model(val_batches)
            
            predict = torch.argmax(scores)
            if predict == 0:
                print('OK')
                accuracy += 1
            
            if eval_results['eval_loss'] < best_eval_loss:
                # save model.state_dict()
                torch.save(model.state_dict(), state_save_path)
                logger.info(f'Epoch {epoch} | Loss {loss.item()} | [Saving at] {state_save_path}')
        '''
                

        # save model.state_dict()
#         torch.save(model.state_dict(), state_save_path)
#         logger.info(f'Epoch {epoch} | Loss {loss.item()} | [Saving at] {state_save_path}')


In [20]:
# RuntimeError: Expected 4-dimensional input for 4-dimensional weight [256, 1, 3, 3], but got 3-dimensional input of size [320, 40, 2800] instead
# tdata = [now_mfcc_list.to(1), next_mfcc_list.to(1), label_list.to(1)]
tdata = [cs.to(device_num), ns.to(device_num), labels.to(device_num)]
# batch_size, C, H, W = 32, 1, 28, 28
# tdata = [torch.randn(batch_size, C, H, W).to(1),torch.randn(batch_size, C, H, W).to(1), torch.randn(batch_size).to(1)]

# Hyperparameters
epochs = 100 # epoch
batch_size=32 # batch size for training
lr=5e-5 # learning rate

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.1, threshold=0.01, patience=5)

train(model=model.to(device_num), 
      raw_data=tdata, 
      optimizer=optimizer, 
      epochs=epochs, 
      batch_size=batch_size,
      output_dir='.',
      val_ratio=0.2, 
      eval_freq=10,
      fp16=False, 
      fp16_opt_level='O1')

train torch.Size([2560, 1, 938, 128]) torch.Size([2560, 1, 938, 128]) torch.Size([2560])
val torch.Size([640, 1, 938, 128]) torch.Size([640, 1, 938, 128]) torch.Size([640])

[Epoch 1] | total_Loss 6.421015283465385 | accuracy 0.0625 | time 31.40020728111267

Eval loss : 3.67012619972229, Eval accuracy : 0.0
---------------------------------


[Epoch 2] | total_Loss 4.931183034740388 | accuracy 0.05 | time 31.41671371459961

Eval loss : 3.6161760807037355, Eval accuracy : 0.0
---------------------------------


[Epoch 3] | total_Loss 4.508502745628357 | accuracy 0.025 | time 31.405723810195923

Eval loss : 3.621930170059204, Eval accuracy : 0.0
---------------------------------


[Epoch 4] | total_Loss 4.010581113398075 | accuracy 0.0875 | time 31.424458503723145

Eval loss : 3.759340834617615, Eval accuracy : 0.0
---------------------------------


[Epoch 5] | total_Loss 3.741730653261766 | accuracy 0.0875 | time 31.43504524230957

Eval loss : 4.400401079654694, Eval accuracy : 0.0
---

Eval loss : 29.2113347530365, Eval accuracy : 0.0
---------------------------------


[Epoch 48] | total_Loss 0.362681138484383 | accuracy 0.9125 | time 31.447262048721313

Eval loss : 25.047244119644166, Eval accuracy : 0.0
---------------------------------


[Epoch 49] | total_Loss 0.3481323160866814 | accuracy 0.9375 | time 31.447887182235718

Eval loss : 16.748266649246215, Eval accuracy : 0.0
---------------------------------


[Epoch 50] | total_Loss 0.4836579773665365 | accuracy 0.9 | time 31.447824478149414

Eval loss : 14.943757912516594, Eval accuracy : 0.05
---------------------------------


[Epoch 51] | total_Loss 0.095834329180337 | accuracy 0.9625 | time 31.449638843536377

Eval loss : 29.85351159572601, Eval accuracy : 0.0
---------------------------------


[Epoch 52] | total_Loss 0.20720743866793345 | accuracy 0.95 | time 31.45248532295227

Eval loss : 8.283193755149842, Eval accuracy : 0.0
---------------------------------


[Epoch 53] | total_Loss 0.0833971527784823

Eval loss : 78.74414196014405, Eval accuracy : 0.0
---------------------------------


[Epoch 95] | total_Loss 0.6860778550336999 | accuracy 0.925 | time 31.448648691177368

Eval loss : 30.604440021514893, Eval accuracy : 0.0
---------------------------------


[Epoch 96] | total_Loss 0.7822447508747633 | accuracy 0.8875 | time 31.442970275878906

Eval loss : 76.2183967590332, Eval accuracy : 0.0
---------------------------------


[Epoch 97] | total_Loss 0.13747063485123245 | accuracy 0.975 | time 31.440403938293457

Eval loss : 80.17519359588623, Eval accuracy : 0.0
---------------------------------


[Epoch 98] | total_Loss 0.14170609021100794 | accuracy 0.9625 | time 31.44185447692871

Eval loss : 75.18725085258484, Eval accuracy : 0.0
---------------------------------


[Epoch 99] | total_Loss 0.03651561323729435 | accuracy 0.975 | time 31.444196462631226

Eval loss : 76.35005462169647, Eval accuracy : 0.0
---------------------------------


[Epoch 100] | total_Loss 0.009493711554

--------------------------------------------

In [1]:
!nvidia-smi

Sun Sep 12 15:08:14 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
| 30%   39C    P8    33W / 350W |     23MiB / 24268MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  On   | 00000000:02:00.0 Off |                  N/A |
| 30%   35C    P8    28W / 350W |      5MiB / 24268MiB |      0%      Defaul

In [62]:
a = torch.randn(1,10)
m = nn.Sigmoid()
print(a)
print(m(a))

tensor([[ 0.7460,  0.4928,  0.2707, -1.0255, -1.4475, -0.3103,  1.4932,  1.5739,
          0.8782, -0.8915]])
tensor([[0.6783, 0.6208, 0.5673, 0.2640, 0.1904, 0.4230, 0.8166, 0.8283, 0.7065,
         0.2908]])
