<a href="https://colab.research.google.com/github/jdasam/ant5015/blob/main/notebooks/5th_week_auto_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Autotagging

In [1]:
import torch
import torchaudio
import IPython.display as ipd
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

- Download the small MTAT dataset if you do not already have it locally


In [2]:
# Upgrade gdown before downloading from Google Drive.
!pip install --upgrade gdown
# Download the dataset archive by its Google Drive file id
# (the recorded run below shows it being saved as mtat_8000.zip).
!gdown 15e9E3oZdudErkPKwb0rCAiZXkPxdZkV6
# !wget https://sogang365-my.sharepoint.com/:u:/g/personal/dasaem_jeong_o365_sogang_ac_kr/EdkHWV-qvxBEi-d0Ua73VG4BEp7EZO7HMvrXsWqeJvMJzg?e=GbYylV&download=1

# Extract quietly (-q); presumably this creates the MTAT_SMALL/ folder read by the cells below.
!unzip -q mtat_8000.zip

Collecting gdown
  Downloading gdown-5.1.0-py3-none-any.whl (17 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.7.3
    Uninstalling gdown-4.7.3:
      Successfully uninstalled gdown-4.7.3
Successfully installed gdown-5.1.0
Downloading...
From (original): https://drive.google.com/uc?id=15e9E3oZdudErkPKwb0rCAiZXkPxdZkV6
From (redirected): https://drive.google.com/uc?id=15e9E3oZdudErkPKwb0rCAiZXkPxdZkV6&confirm=t&uuid=16cc8cd6-176e-4394-8d70-c5eaa7ce5c35
To: /content/mtat_8000.zip
100% 921M/921M [00:05<00:00, 164MB/s]


In [3]:
class MTATDataset:
  """Map-style dataset of (mono waveform, multi-hot tag vector) pairs.

  Reads ``meta.csv`` from ``dir_path``. The first column is used as the index;
  of the remaining columns, the first and the last (``mp3_path``) are treated
  as metadata and everything in between is a binary tag column.

  Args:
    dir_path: Directory containing ``meta.csv`` and the audio sub-directories.
    split: One of ``'train'``, ``'valid'`` or ``'test'``. The split is decided
      by the first character of each ``mp3_path`` (its sub-directory id).
    num_max_data: Keep at most this many rows of the selected split.
    sr: Sample rate every audio file is expected to have.

  Raises:
    ValueError: If ``split`` is not a known split name. Previously any
      unrecognised value (e.g. the typo ``'val'``) silently returned the test set.
  """

  # Sub-directory ids (first character of ``mp3_path``) belonging to each split.
  SPLIT_SUB_DIRS = {
    'train': ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c'),
    'valid': ('d',),
    'test': ('e', 'f', 'g'),
  }

  def __init__(self, dir_path, split='train', num_max_data=6000, sr=16000):
    if split not in self.SPLIT_SUB_DIRS:
      raise ValueError(
        f"Unknown split {split!r}; expected one of {sorted(self.SPLIT_SUB_DIRS)}")

    self.dir = Path(dir_path)
    self.sr = sr

    labels = pd.read_csv(self.dir / "meta.csv", index_col=[0])

    # Vectorised membership test on the leading character of every path.
    first_chars = labels['mp3_path'].astype('str').str[0]
    is_in_set = first_chars.isin(self.SPLIT_SUB_DIRS[split]).to_numpy()

    # Positional boolean mask, then truncate to the requested maximum size.
    self.labels = labels[is_in_set][:num_max_data]
    self.vocab = self.labels.columns.values[1:-1]
    self.label_tensor = self.convert_label_to_tensor()

  def convert_label_to_tensor(self):
    """Return the tag columns as a float tensor of shape (num_items, num_tags)."""
    return torch.tensor(self.labels.values[:, 1:-1].astype('bool'), dtype=torch.float)

  def __len__(self):
    """Number of clips in this split (after truncation)."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Load clip ``idx`` and return ``(waveform, label)``.

    The waveform is averaged over channels (dim 0) to a 1-D mono signal;
    ``label`` is the matching row of ``self.label_tensor``.
    """
    info = self.labels.iloc[idx]
    mp3_path = self.dir / info['mp3_path']

    audio, sr = torchaudio.load(mp3_path)
    assert sr == self.sr, f"{mp3_path}: expected sample rate {self.sr}, got {sr}"
    label = self.label_tensor[idx]
    return audio.mean(0), label

# Build the dataset with its defaults (train split, at most 6000 clips) and
# display the first (waveform, label) pair as a quick sanity check.
data_dir = Path('MTAT_SMALL')
dataset= MTATDataset(data_dir)
dataset[0]

(tensor([ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -8.8974e-08,
         -5.8156e-08, -5.6856e-08]),
 tensor([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]))

In [5]:
# Load the raw metadata table directly (first CSV column becomes the index)
# to inspect the tag columns and the trailing mp3_path column.
csv_fn = 'MTAT_SMALL/meta.csv'
meta = pd.read_csv(csv_fn, index_col=[0])
meta

Unnamed: 0,clip_id,singer,harpsichord,sitar,heavy,foreign,no piano,classical,female,jazz,...,rock,dance,cello,techno,flute,beat,soft,choir,baroque,mp3_path
20552,45147,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,2/zephyrus-angelus-11-ave_maria__virgo_serena_...
3899,8539,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,a/tilopa-pictures_of_silence-02-ni-175-204.mp3
8996,19647,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5/arthur_yoria-of_the_lovely-04-several_mistak...
4055,8856,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8/stargarden-music_for_modern_listening-02-per...
6361,13834,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,a/dac_crowell-the_mechanism_of_starlight-03-me...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11433,25167,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,e/jeff_wahl-meditative_guitar-05-the_persisten...
4047,8845,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/satori-healing_sounds_of_tibet-02-passion_of...
2499,5418,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/dac_crowell-redshifted_harmonies-01-tranquil...
2342,5122,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,f/three_holies_church_choristers-the_hymns_of_...


In [16]:
# Listen to one clip and list the names of its active tags.
audio, label = dataset[15]
# Play back at the dataset's own sample rate (the rate every file is checked
# against on load) instead of repeating the literal 16000 here.
ipd.display(ipd.Audio(audio, rate=dataset.sr))
# Indices of the non-zero entries of the multi-hot label -> tag names.
dataset.vocab[torch.where(label)[0]]

array(['guitar', 'male'], dtype=object)

In [17]:
# Tag names, in the same order as the entries of each label vector.
dataset.vocab

array(['singer', 'harpsichord', 'sitar', 'heavy', 'foreign', 'no piano',
       'classical', 'female', 'jazz', 'guitar', 'quiet', 'solo', 'folk',
       'ambient', 'new age', 'synth', 'drum', 'bass', 'loud', 'string',
       'opera', 'fast', 'country', 'violin', 'electro', 'trance', 'chant',
       'strange', 'modern', 'hard', 'harp', 'pop', 'female vocal',
       'piano', 'orchestra', 'eastern', 'slow', 'male', 'vocal',
       'no singer', 'india', 'rock', 'dance', 'cello', 'techno', 'flute',
       'beat', 'soft', 'choir', 'baroque'], dtype=object)

In [18]:
import time

# Measure how long it takes to load the first items one by one.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
num_items = min(500, len(dataset))  # do not index past the end of a small dataset
start_time = time.perf_counter()
for i in range(num_items):
  dataset[i]
end_time = time.perf_counter()
print(end_time - start_time)

8.703536033630371


## Make Dataset

## Make Model

In [None]:
# Wrap the dataset in a shuffling DataLoader with 16 items per batch and pull
# one batch so the model below has example input to run on.
train_loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)
batch = next(iter(train_loader))
# Each batch is an (audio, label) pair, mirroring what the dataset returns per item.
audios, labels = batch

In [30]:
import torch.nn as nn

class MelDbConverter(torch.nn.Module):
  """Convert waveforms to scaled mel spectrograms in decibels.

  Applies ``torchaudio.transforms.MelSpectrogram`` followed by
  ``torchaudio.transforms.AmplitudeToDB`` (with its default settings) and
  divides the result by ``scale``. The default arguments reproduce the
  front end that used to be hard-coded here.

  Args:
    sample_rate: Sample rate of the input audio.
    n_fft: FFT size.
    hop_length: Hop between successive frames, in samples.
    f_min: Lowest mel filter frequency.
    f_max: Highest mel filter frequency.
      NOTE(review): the default 4000 is half the Nyquist frequency at the
      default 16 kHz sample rate - confirm this is intended.
    n_mels: Number of mel bands.
    scale: Divisor applied to the dB values before they are returned.
  """
  def __init__(self, sample_rate=16000, n_fft=1024, hop_length=512,
               f_min=20, f_max=4000, n_mels=80, scale=100):
    super().__init__()
    self.mel_conv = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate,
                                                         n_fft=n_fft,
                                                         hop_length=hop_length,
                                                         f_min=f_min,
                                                         f_max=f_max,
                                                         n_mels=n_mels)
    self.db_conv = torchaudio.transforms.AmplitudeToDB()
    self.scale = scale

  def forward(self, x):
    """Return the dB-scaled mel spectrogram of ``x`` divided by ``self.scale``."""
    return self.db_conv(self.mel_conv(x)) / self.scale



class CNNModel(nn.Module):
  """2-D CNN tagger operating on mel spectrograms computed from raw audio.

  The network is a stack of ``num_layers`` blocks of
  ``Conv2d(3x3, padding=1) -> ReLU -> MaxPool2d(2)`` whose channel count
  starts at ``dim`` and doubles after every block. The convolution output is
  max-pooled over its last axis, flattened, and projected to ``num_out``
  sigmoid outputs.

  Args:
    dim: Number of output channels of the first convolution.
    num_out: Number of output units (one per tag).
    num_layers: Number of conv/pool blocks. Defaults to 5, the depth that was
      previously hard-coded.

  Raises:
    ValueError: If the pooling stages would shrink the mel axis to zero bins.
  """
  def __init__(self, dim=8, num_out=10, num_layers=5):
    super().__init__() # nn.Module init

    self.spec = MelDbConverter()

    blocks = []
    in_channel = 1
    out_channel = dim
    for _ in range(num_layers):
      blocks.append(nn.Conv2d(in_channels=in_channel, out_channels=out_channel, kernel_size=3, padding=1))
      blocks.append(nn.ReLU())
      blocks.append(nn.MaxPool2d(2))
      in_channel = out_channel
      out_channel *= 2
    self.layers = nn.Sequential(*blocks)

    # Each MaxPool2d(2) halves (and floors) the mel axis, so after all blocks
    # n_mels // 2**num_layers frequency bins remain.
    num_freq_bins = self.spec.mel_conv.n_mels // (2 ** num_layers)
    if num_freq_bins < 1:
      raise ValueError(
        f"num_layers={num_layers} pools the {self.spec.mel_conv.n_mels} mel bins down to nothing")

    # The flattened feature size is (channels of the last conv) x (remaining
    # frequency bins). The earlier code passed the once-too-often doubled
    # channel count instead, which only matched by coincidence for n_mels=80
    # and five blocks (128 * 2 == 256).
    self.proj = nn.Linear(in_channel * num_freq_bins, num_out)

  def forward(self, x):
    """Return per-tag probabilities for a batch of waveforms.

    Args:
      x: Batch of mono waveforms; presumably shaped (batch, num_samples), as
        produced by the DataLoader in this notebook.

    Returns:
      Tensor of shape (batch, num_out) with values in (0, 1). The sigmoid is
      already applied, so pair this with a probability-based loss such as
      ``nn.BCELoss`` rather than a logit-based one.
    """
    spec = self.spec(x)
    spec = spec.unsqueeze(1)  # add the channel axis expected by Conv2d
    conv_out = self.layers(spec)
    conv_out = torch.max(conv_out, dim=-1).values  # max over the last axis
    conv_out = conv_out.flatten(1)  # merge every remaining non-batch axis
    logit = self.proj(conv_out)
    # Sigmoid rather than softmax: each output is squashed independently.
    prob = logit.sigmoid()
    return prob

# Smoke test: one output unit per tag in the vocabulary, run on the batch drawn
# from train_loader earlier, then check the output shape.
model = CNNModel(num_out=len(dataset.vocab))

out = model(audios)
out.shape

torch.Size([16, 50])

In [31]:
# Inspect the raw outputs of the untrained model (the recorded values all sit close to 0.5).
out

tensor([[0.5014, 0.4924, 0.4807, 0.5024, 0.4813, 0.4906, 0.5045, 0.5036, 0.4886,
         0.4967, 0.4904, 0.5047, 0.5148, 0.4848, 0.5098, 0.4986, 0.4945, 0.4831,
         0.4993, 0.5073, 0.5049, 0.5118, 0.4893, 0.4961, 0.4921, 0.5019, 0.4835,
         0.4951, 0.4965, 0.4911, 0.5083, 0.5011, 0.5032, 0.5078, 0.4931, 0.4996,
         0.4964, 0.5137, 0.5006, 0.5076, 0.4979, 0.4964, 0.4908, 0.4941, 0.5028,
         0.4906, 0.5118, 0.4978, 0.4885, 0.4908],
        [0.5019, 0.4925, 0.4804, 0.5030, 0.4807, 0.4908, 0.5044, 0.5030, 0.4888,
         0.4967, 0.4903, 0.5050, 0.5152, 0.4852, 0.5090, 0.4985, 0.4950, 0.4832,
         0.4994, 0.5078, 0.5048, 0.5119, 0.4893, 0.4962, 0.4918, 0.5025, 0.4836,
         0.4946, 0.4961, 0.4914, 0.5083, 0.5008, 0.5033, 0.5078, 0.4931, 0.5003,
         0.4968, 0.5143, 0.5015, 0.5076, 0.4980, 0.4966, 0.4914, 0.4941, 0.5026,
         0.4909, 0.5115, 0.4973, 0.4888, 0.4908],
        [0.5020, 0.4930, 0.4791, 0.5040, 0.4801, 0.4906, 0.5041, 0.5035, 0.4888,
         

In [24]:
# Draw a fresh batch from the loader.
# NOTE(review): the RuntimeError recorded under this cell is a matmul shape
# mismatch, which this statement alone does not perform; it looks like stale
# output from an earlier run (execution count 24 precedes the model cell's 30).
# Re-run the notebook top to bottom and clear the output.
batch = next(iter(train_loader))

RuntimeError: mat1 and mat2 shapes cannot be multiplied (16x7168 and 1024x50)

## Train!

## Data Normalization

## How CNN works

## Batch Norm