<a href="https://colab.research.google.com/github/littlejacinthe/torchaudio/blob/main/ExtractingMelSpec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[Tutorial from The Sound of AI YT Channel](https://www.youtube.com/watch?v=lhF_RVa7DLE&list=PL-wATfeyAMNoirN4idjev6aRu8ISZYVWm&index=5)


Extracting Mel Spectrograms

In [5]:
# using transforms module

import torch
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
import os

In [2]:
# Download the UrbanSound8K archive from Zenodo (a multi-gigabyte download),
# extract it into the current working directory, then delete the archive
# to free disk space.
!wget https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz -O urban8k.tgz
!tar -xzf urban8k.tgz
!rm urban8k.tgz

--2022-08-24 09:51:13--  https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz
Resolving zenodo.org (zenodo.org)... 137.138.76.77
Connecting to zenodo.org (zenodo.org)|137.138.76.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6023741708 (5.6G) [application/octet-stream]
Saving to: ‘urban8k.tgz’


2022-08-24 09:56:32 (18.1 MB/s) - ‘urban8k.tgz’ saved [6023741708/6023741708]



In [6]:
class UrbanSoundDataset(Dataset):
  """Dataset that serves (transformed audio, label) pairs from an annotations CSV.

  Each item is loaded from disk, resampled to ``target_sample_rate`` when its
  native rate differs, averaged down to a single channel when it has several,
  and finally passed through ``transformation``.
  """

  def __init__(self, annotations_file, audio_dir, transformation, target_sample_rate):
    """Read the annotations CSV and remember the loading configuration.

    Args:
      annotations_file: path of the CSV describing every audio clip.
      audio_dir: root directory containing the ``fold*`` sub-folders.
      transformation: callable applied to the prepared waveform.
      target_sample_rate: sample rate every waveform is brought to.
    """
    self.annotations = pd.read_csv(annotations_file)
    self.audio_dir = audio_dir
    self.transformation = transformation
    self.target_sample_rate = target_sample_rate

  def __len__(self):
    """Return the number of annotated clips (one per CSV row)."""
    return len(self.annotations)

  def __getitem__(self, index):
    """Return ``(transformed_signal, label)`` for the clip at row ``index``."""
    clip_path = self.get_audio_sample_path(index)
    label = self.get_audio_sample_label(index)
    waveform, native_rate = torchaudio.load(clip_path)
    waveform = self.resample_if_necessary(waveform, native_rate)
    waveform = self._mix_down_if_necessary(waveform)
    return self.transformation(waveform), label

  def resample_if_necessary(self, signal, sr):
    """Return ``signal`` resampled from ``sr`` to the target rate, or untouched if they match."""
    if sr == self.target_sample_rate:
      return signal
    to_target_rate = torchaudio.transforms.Resample(sr, self.target_sample_rate)
    return to_target_rate(signal)

  def _mix_down_if_necessary(self, signal):
    """Average all channels into one, keeping the channel axis; single-channel input is returned as is."""
    if signal.shape[0] <= 1:
      return signal
    # e.g. a (2, num_samples) tensor becomes (1, num_samples)
    return signal.mean(dim=0, keepdim=True)

  def get_audio_sample_path(self, index):
    """Build ``<audio_dir>/fold<N>/<file name>`` from columns 5 and 0 of row ``index``."""
    fold_dir = "fold" + str(self.annotations.iloc[index, 5])
    file_name = self.annotations.iloc[index, 0]
    return os.path.join(self.audio_dir, fold_dir, file_name)

  def get_audio_sample_label(self, index):
    """Return the value stored in column 6 of row ``index`` (presumably the class id - confirm against the CSV)."""
    return self.annotations.iloc[index, 6]

In [8]:
if __name__ == "__main__":

  # Locations of the extracted UrbanSound8K metadata and audio folders.
  ANNOTATIONS_FILE = "/content/UrbanSound8K/metadata/UrbanSound8K.csv"
  AUDIO_DIR = "/content/UrbanSound8K/audio"
  SAMPLE_RATE = 16000

  # Transform applied to every waveform by the dataset; it is built with the
  # same sample rate the dataset resamples to.
  mel_spectrogram = torchaudio.transforms.MelSpectrogram(
      sample_rate=SAMPLE_RATE,
      n_fft=1024,
      hop_length=512,
      n_mels=64
  )

  usd = UrbanSoundDataset(ANNOTATIONS_FILE, AUDIO_DIR, mel_spectrogram, SAMPLE_RATE)
  print(f"There are {len(usd)} samples in the dataset")

  # Load one item to check that the full loading pipeline runs end to end.
  signal, label = usd[0]

There are 8732 samples in the dataset


With the GTZAN Dataset

In [9]:
#upload your .json file with your own Kaggle key
! mkdir /root/.kaggle
! mv /content/kaggle.json /root/.kaggle

In [10]:
# Download the GTZAN music-genre classification dataset archive with the Kaggle CLI
! kaggle datasets download -d andradaolteanu/gtzan-dataset-music-genre-classification

Downloading gtzan-dataset-music-genre-classification.zip to /content
 99% 1.20G/1.21G [00:18<00:00, 102MB/s]
100% 1.21G/1.21G [00:18<00:00, 68.7MB/s]


In [None]:
#unzip the folder
! unzip /content/gtzan-dataset-music-genre-classification.zip

In [32]:
class GTZANDataset(Dataset):
  """Dataset that serves (transformed audio, genre label) pairs for GTZAN.

  Each item is loaded from disk, resampled to ``target_sample_rate`` when its
  native rate differs, averaged down to one channel when it has several, and
  finally passed through ``transformation``.
  """

  # Positional columns of the annotations CSV used by this class.
  FILENAME_COLUMN = 0
  # The same column names the genre sub-folder and provides the label.
  GENRE_COLUMN = 59

  def __init__(self, annotations_file, audio_dir, transformation, target_sample_rate):
    """Read the annotations CSV and remember the loading configuration.

    Args:
      annotations_file: path of the CSV describing every audio clip.
      audio_dir: root directory containing one sub-folder per genre.
      transformation: callable applied to the prepared waveform.
      target_sample_rate: sample rate every waveform is brought to.
    """
    self.annotations = pd.read_csv(annotations_file)
    self.audio_dir = audio_dir
    self.transformation = transformation
    self.target_sample_rate = target_sample_rate

  def __len__(self):
    """Return the number of annotated clips (one per CSV row)."""
    return len(self.annotations)

  def __getitem__(self, index):
    """Return ``(transformed_signal, label)`` for the clip at row ``index``."""
    audio_sample_path = self.get_audio_sample_path(index)
    label = self.get_audio_sample_label(index)
    signal, sr = torchaudio.load(audio_sample_path)
    signal = self.resample_if_necessary(signal, sr)
    signal = self._mix_down_if_necessary(signal)
    signal = self.transformation(signal)

    return signal, label

  def resample_if_necessary(self, signal, sr):
    """Resample ``signal`` from ``sr`` to the target rate; no-op when they already match."""
    if sr != self.target_sample_rate:
      resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
      signal = resampler(signal)
    return signal

  def _mix_down_if_necessary(self, signal):
    """Average multi-channel audio into one channel, keeping the channel axis."""
    if signal.shape[0] > 1:
      # e.g. a (2, num_samples) tensor becomes (1, num_samples)
      signal = torch.mean(signal, dim=0, keepdim=True)
    return signal

  def get_audio_sample_path(self, index):
    """Build ``<audio_dir>/<genre>/<file name>`` for row ``index``."""
    genre = self.annotations.iloc[index, self.GENRE_COLUMN]
    filename = self.annotations.iloc[index, self.FILENAME_COLUMN]
    return os.path.join(self.audio_dir, genre, filename)

  def get_audio_sample_label(self, index):
    """Return the genre label of row ``index``.

    The label is read from the same column that names the genre folder.
    Column 6, used previously, was carried over from the UrbanSound8K CSV
    layout and does not hold the genre here.
    """
    return self.annotations.iloc[index, self.GENRE_COLUMN]

In [33]:
if __name__ == "__main__":

  # Locations of the GTZAN audio folders and the per-track annotations CSV
  # (assumes the archive was extracted under /content).
  AUDIO_DIR = "/content/Data/genres_original"
  ANNOTATIONS_FILE = "/content/Data/features_30_sec.csv"
  SAMPLE_RATE = 16000

  # Transform applied to every waveform by the dataset; it is built with the
  # same sample rate the dataset resamples to.
  mel_spectrogram = torchaudio.transforms.MelSpectrogram(
      sample_rate=SAMPLE_RATE,
      n_fft=1024,
      hop_length=512,
      n_mels=64
  )

  audio = GTZANDataset(ANNOTATIONS_FILE, AUDIO_DIR, mel_spectrogram, SAMPLE_RATE)
  print(f"There are {len(audio)} samples in the dataset")

  # Load one item to check that the full loading pipeline runs end to end.
  signal, label = audio[0]

There are 1000 samples in the dataset
