# Autotagging

In [1]:
import torch
import torchaudio
import IPython.display as ipd
from pathlib import Path
import pandas as pd

- Download Small MTAT if you need


In [None]:
!pip install gdown==4.4.0
!gdown 15e9E3oZdudErkPKwb0rCAiZXkPxdZkV6
!unzip -q mtat_8000.zip

In [6]:
class MTATDataset:
  def __init__(self, dir_path, split='train', num_max_data=6000, sr=16000):
    self.dir = Path(dir_path)
    self.labels = pd.read_csv(self.dir / "meta.csv", index_col=[0])
    self.sr = sr

    if split=="train":
      sub_dir_ids = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c']
    elif split=='valid':
      sub_dir_ids = ['d']
    else: #test
      sub_dir_ids = ['e', 'f', 'g']

    is_in_set = [True if x[0] in sub_dir_ids else False for x in self.labels['mp3_path'].values.astype('str')]
    self.labels = self.labels.iloc[is_in_set]
    self.labels = self.labels[:num_max_data]
    self.vocab = self.labels.columns.values[1:-1]
    self.label_tensor = self.convert_label_to_tensor()
  
  def convert_label_to_tensor(self):
    return torch.LongTensor(self.labels.values[:, 1:-1].astype('bool'))

  def __len__(self):
    return len(self.labels)

data_dir = Path('MTAT_SMALL')
dataset= MTATDataset(data_dir)

## Make Dataset

In [7]:
class OnFlyDataset(MTATDataset):
  def __init__(self, dir_path, split='train', num_max_data=6000, sr=16000):
    super().__init__(dir_path, split, num_max_data, sr)
#     self.resampler = torchaudio.transforms.Resample(orig_freq=22050, new_freq=self.sr)
  
  def __getitem__(self, idx):
    audio_path = self.labels['mp3_path'].iloc[idx]
    y, sr = torchaudio.load(self.dir / audio_path)
    
    if sr != self.sr:
      y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=self.sr)
    
    label = self.label_tensor[idx]
    return y[0], label
    

trainset = OnFlyDataset(data_dir)

## Make Model

In [8]:
import torch.nn as nn

class SpecModel(nn.Module):
  def __init__(self, sr, n_fft, hop_length, n_mels):
    super().__init__()
    self.mel_converter = torchaudio.transforms.MelSpectrogram(sample_rate=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    self.db_converter = torchaudio.transforms.AmplitudeToDB()
  def forward(self, x):
    mel_spec = self.mel_converter(x)
    return self.db_converter(mel_spec)

class AudioModel(nn.Module):
  def __init__(self, sr, n_fft, hop_length, n_mels, hidden_size, num_output):
    super().__init__()
    self.sr = sr
    self.spec_converter = SpecModel(sr, n_fft, hop_length, n_mels)
    self.conv_layer = nn.Sequential(
      nn.Conv1d(n_mels, out_channels=hidden_size, kernel_size=3),
      nn.MaxPool1d(3),
      nn.ReLU(),
      nn.Conv1d(hidden_size, out_channels=hidden_size, kernel_size=3),
      nn.MaxPool1d(3),
      nn.ReLU(),     
      nn.Conv1d(hidden_size, out_channels=hidden_size, kernel_size=3),
      nn.MaxPool1d(3),
      nn.ReLU(),
    )
    self.final_layer = nn.Linear(hidden_size, num_output)

  def get_spec(self, x):
    '''
    Get result of self.spec_converter
    x (torch.Tensor): audio samples (num_batch_size X num_audio_samples)
    '''
    return self.spec_converter(x)
  
  def forward(self, x):
    spec = self.get_spec(x) # num_batch X num_mel_bins X num_time_bins
    out = self.conv_layer(spec)
    out = torch.max(out, dim=-1)[0] # select [0] because torch.max outputs tuple of (value, index)
    out = self.final_layer(out)
    out = torch.sigmoid(out)
    return out
  
model = AudioModel(16000, 1024, 512, 40, 128, 50)

In [63]:
n_mels = 40
hidden_size = 64

seq_conv_layer = nn.Sequential(
      nn.Conv1d(n_mels, out_channels=hidden_size, kernel_size=5),
      nn.MaxPool1d(4),
      nn.ReLU(),
      nn.Conv1d(hidden_size, out_channels=hidden_size, kernel_size=3),
      nn.MaxPool1d(3),
      nn.ReLU(),     
      nn.Conv1d(hidden_size, out_channels=hidden_size, kernel_size=3),
      nn.MaxPool1d(3),
      nn.ReLU(),
    )



In [64]:
audio, label = trainset[0]

In [84]:
mel_spec = model.spec_converter(audio)

In [85]:
mel_spec.shape # freq_bin (channel) x time_bin  

torch.Size([40, 911])

In [86]:
model_out = seq_conv_layer(mel_spec) 
# model_out, 
model_out.shape

torch.Size([64, 24])

In [91]:
model_out.shape

torch.Size([64, 24])

In [101]:
final_test = nn.Linear(64, 50, bias=False)
test_out = final_test(model_out.T)
test_out, test_out.shape

(tensor([[-1.9713, -1.7584,  3.0312,  ..., -2.3761, -3.1079,  1.4732],
         [-1.4117, -0.7950,  0.9067,  ..., -1.1927, -1.9066,  0.0404],
         [-1.6910, -0.9139,  1.3879,  ..., -1.1623, -1.5955,  0.2827],
         ...,
         [-0.9937, -0.2435,  1.1672,  ..., -1.3833, -1.3047,  0.1060],
         [-1.0243, -0.3593,  1.2652,  ..., -1.3761, -1.3017,  0.0996],
         [-1.4774, -0.4496,  1.2401,  ..., -1.6750, -1.4967,  0.1884]],
        grad_fn=<MmBackward0>),
 torch.Size([24, 50]))

In [97]:
final_test.weight.shape

torch.Size([50, 64])

In [104]:
final_conv = nn.Conv1d(64, 50, kernel_size=1, bias=False)
final_conv.weight.data[:,:,0] = final_test.weight.data
test_out = final_conv(model_out)
test_out.T, test_out.T.shape

(tensor([[-1.9713, -1.7584,  3.0312,  ..., -2.3761, -3.1079,  1.4732],
         [-1.4117, -0.7950,  0.9067,  ..., -1.1927, -1.9066,  0.0404],
         [-1.6910, -0.9139,  1.3879,  ..., -1.1623, -1.5955,  0.2827],
         ...,
         [-0.9937, -0.2435,  1.1672,  ..., -1.3833, -1.3047,  0.1060],
         [-1.0243, -0.3593,  1.2652,  ..., -1.3761, -1.3017,  0.0996],
         [-1.4774, -0.4496,  1.2401,  ..., -1.6750, -1.4967,  0.1884]],
        grad_fn=<PermuteBackward0>),
 torch.Size([24, 50]))

In [98]:
final_conv.weight.shape

torch.Size([50, 64, 1])

In [87]:
mean_pooling = torch.mean(model_out, dim=-1)
mean_pooling, mean_pooling.shape

(tensor([    0.0000,     2.6823,     0.0000,     1.2487,     0.8096,     0.1167,
             0.7276,     0.0365,     0.0000,     0.0000,     0.1344,     1.8576,
             0.0076,     1.2481,     2.8068,     0.0295,     0.0000,     0.1667,
             0.0296,     2.3879,     0.0510,     0.0772,     3.2861,     1.5576,
             0.4737,     3.7279,     4.4033,     4.3582,     2.3644,     2.3192,
             0.2317,     0.5021,     0.1446,     0.0000,     0.0000,     0.6143,
             7.1379,     0.6086,     0.0000,     0.3253,     0.0000,     2.1696,
             0.0000,     1.1167,     5.6040,     0.0143,     0.3262,     0.0161,
             6.1156,     0.0008,     0.0000,     0.0000,     0.4237,     1.8557,
             3.6960,     0.8464,     0.5737,     0.0186,     0.0000,     0.0136,
             0.0000,     0.0720,     3.8648,     0.0340],
        grad_fn=<MeanBackward1>),
 torch.Size([64]))

In [88]:
max_pooling = torch.max(model_out, dim=-1).values
max_pooling, max_pooling.shape

(tensor([    0.0000,     4.3618,     0.0000,     1.7843,     1.6946,     1.2563,
             1.9779,     0.8471,     0.0000,     0.0000,     1.3759,     4.5051,
             0.1820,     2.2768,     3.9449,     0.3172,     0.0000,     3.4557,
             0.2560,     9.2989,     0.5654,     1.0495,     8.2116,     2.4553,
             2.4427,    10.3835,     7.7901,    10.0643,     2.8923,     6.2799,
             0.9667,     1.9399,     1.1106,     0.0000,     0.0000,     6.5254,
            12.6467,     1.4461,     0.0000,     0.9304,     0.0000,     7.5213,
             0.0000,     2.1392,     8.5439,     0.3008,     1.1191,     0.2249,
            10.7428,     0.0197,     0.0000,     0.0000,     1.3934,     8.3979,
             9.0301,     4.3973,     2.4469,     0.4472,     0.0000,     0.2360,
             0.0000,     0.9428,     8.2263,     0.8164],
        grad_fn=<MaxBackward0>),
 torch.Size([64]))

In [90]:
num_tags =50
final_layer = nn.Linear(hidden_size, num_tags)

final_layer(max_pooling)

tensor([-0.3051, -2.8393, -0.2600, -1.5165, -4.5268, -2.7030,  0.5587,  2.5728,
        -0.6841, -2.0289,  1.3267, -0.3184, -2.6279, -4.4766,  0.1575,  2.0931,
        -0.5669,  0.9044, -2.9504, -5.9490, -0.4106,  0.5929,  0.2330,  1.7918,
         3.3626,  2.7782, -2.5826,  2.0242, -1.9924,  3.2855,  3.5405, -2.7477,
         1.4999,  0.0748,  3.2991,  1.1368, -0.7087, -3.0218,  6.4047, -0.3177,
         1.0323,  3.1941,  3.6477, -4.1344,  0.0329,  4.6421,  0.3591,  2.6181,
        -0.8550,  1.0683], grad_fn=<AddBackward0>)

In [71]:
torch.set_printoptions(sci_mode=False)

## How CNN works

In [22]:
dummy = torch.randint(-3, 4, (6,7)).float()
dummy = dummy.unsqueeze(0)
dummy, dummy.shape

(tensor([[[ 3., -3., -1., -3.,  3., -1.,  0.],
          [ 0., -2.,  3., -3., -3.,  1., -1.],
          [ 0., -2., -3., -2.,  1., -2.,  1.],
          [-1.,  1.,  1., -1.,  0.,  1.,  1.],
          [ 1.,  2., -2., -1.,  1., -1., -3.],
          [-3.,  2., -3.,  1.,  3., -1.,  2.]]]),
 torch.Size([1, 6, 7]))

In [16]:
conv_layer = nn.Conv2d(1, 1, kernel_size=3, bias=False)

In [20]:
conv_layer.weight.shape
conv_layer.weight.data = torch.randint(-1, 2, (1,1,3,3)).float()
conv_layer.weight

Parameter containing:
tensor([[[[-1.,  1.,  0.],
          [-1.,  1.,  0.],
          [ 1.,  1.,  1.]]]], requires_grad=True)

In [25]:
'''
2D Conv Layer의 입력은 3차원

'''
dummy, conv_layer.weight, conv_layer(dummy)

(tensor([[[ 3., -3., -1., -3.,  3., -1.,  0.],
          [ 0., -2.,  3., -3., -3.,  1., -1.],
          [ 0., -2., -3., -2.,  1., -2.,  1.],
          [-1.,  1.,  1., -1.,  0.,  1.,  1.],
          [ 1.,  2., -2., -1.,  1., -1., -3.],
          [-3.,  2., -3.,  1.,  3., -1.,  2.]]]),
 Parameter containing:
 tensor([[[[-1.,  1.,  0.],
           [-1.,  1.,  0.],
           [ 1.,  1.,  1.]]]], requires_grad=True),
 tensor([[[-13.,   0., -12.,   3.,   0.],
          [ -3.,   5.,  -5.,   3.,   3.],
          [  1.,  -2.,  -3.,   3.,  -5.],
          [ -1.,  -4.,   0.,   6.,   3.]]], grad_fn=<SqueezeBackward1>))

In [26]:
conv_layer(dummy[:, 0:3, 1:4])

tensor([[[0.]]], grad_fn=<SqueezeBackward1>)

In [41]:
num_ch = 4
dummy = torch.randint(-3, 4, (num_ch, 10)).float()
dummy

tensor([[ 2., -3.,  0., -3.,  0.,  2.,  2., -1.,  3.,  1.],
        [-2., -1.,  3., -3.,  2.,  0.,  0.,  2.,  3., -3.],
        [ 1., -2., -3.,  0.,  1.,  3., -3.,  1.,  3., -2.],
        [-1., -2., -2., -3.,  0., -3., -3., -1.,  1.,  3.]])

In [46]:
out_ch = 6
conv1d = nn.Conv1d(num_ch, out_ch, kernel_size=3, bias=False)
conv1d.weight.data = torch.randint(-1, 2, conv1d.weight.shape).float()
conv1d.weight

Parameter containing:
tensor([[[-1.,  1., -1.],
         [ 0.,  0., -1.],
         [ 1.,  0.,  1.],
         [ 0.,  0.,  0.]],

        [[-1.,  1., -1.],
         [-1.,  1., -1.],
         [ 0.,  1.,  0.],
         [ 1., -1.,  1.]],

        [[ 1., -1., -1.],
         [ 1.,  1.,  1.],
         [-1.,  1., -1.],
         [ 0.,  1.,  1.]],

        [[-1.,  1.,  1.],
         [-1., -1., -1.],
         [-1., -1., -1.],
         [ 1.,  1.,  1.]],

        [[ 1.,  1.,  1.],
         [-1.,  0.,  0.],
         [ 0.,  1.,  1.],
         [ 1., -1.,  1.]],

        [[ 0.,  1., -1.],
         [ 0.,  0., -1.],
         [ 0.,  0., -1.],
         [-1., -1.,  1.]]], requires_grad=True)

In [47]:
out = conv1d(dummy)
out, out.shape

(tensor([[-10.,   7.,  -7.,   4.,  -2.,   3.,  -9.,   5.],
         [-10.,   7., -10.,   1.,   1.,  -5.,  -7.,  11.],
         [  1.,  -7.,   4., -11.,  -3.,  -8.,   6.,   5.],
         [ -6.,  -1.,  -8.,  -4.,  -5., -11.,  -9.,   4.],
         [ -5., -11.,  -4.,   0.,   2.,   0.,   7.,   3.],
         [ -2.,   7.,  -1.,  -5.,   3.,   5.,  -5.,  10.]],
        grad_fn=<SqueezeBackward1>),
 torch.Size([6, 8]))

In [52]:
max_pool_layer = nn.MaxPool1d(3)

In [53]:
after_pool = max_pool_layer(out)
after_pool, after_pool.shape

(tensor([[ 7.,  4.],
         [ 7.,  1.],
         [ 4., -3.],
         [-1., -4.],
         [-4.,  2.],
         [ 7.,  5.]], grad_fn=<SqueezeBackward1>),
 torch.Size([6, 2]))