<a href="https://colab.research.google.com/github/hanmilLee/for_ds9/blob/main/Day2_torch_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Day2
## Speech Command Recognition baseline with Torch


패키지 설치

In [1]:
!pip install torchaudio torchinfo
!pip install einops
# !pip install pyroomacoustics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchinfo
  Downloading torchinfo-1.7.0-py3-none-any.whl (22 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.7.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting einops
  Downloading einops-0.4.1-py3-none-any.whl (28 kB)
Installing collected packages: einops
Successfully installed einops-0.4.1


In [2]:
import torch
import torch.fft
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from torchinfo import summary

from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from torchaudio.datasets import SPEECHCOMMANDS

import numpy as np
import scipy
from scipy import signal

import soundfile as sf

import librosa
import librosa.display

import matplotlib
import matplotlib.pyplot as plt
import IPython.display as ipd

# import pyroomacoustics as pra

from einops import rearrange

import io
import os
import math
import glob
from tqdm import tqdm
from collections import OrderedDict


In [3]:
# Mount Google Drive at /content/drive; later cells read the public test set
# from drive/MyDrive/public_set.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


GPU 동작 확인

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


Public dataset 경로 \
https://drive.google.com/drive/folders/1CJuVPfiYsI4v1BK-SakT7frf8Nv-p-bn

In [5]:
# List the top level of the public test set on the mounted Drive.
!ls drive/MyDrive/public_set

npy  wav


In [6]:
# List the sub-folders that hold the .npy version of the public test set.
!ls drive/MyDrive/public_set/npy

0  1  2


Train / Valid set Download

In [7]:
# Download (if needed) and open the training and validation splits of Speech Commands.
train_set, valid_set = (
    SPEECHCOMMANDS(root = '', download = True, subset = split_name)
    for split_name in ("training", "validation")
)

# Every item is a 5-tuple; unpack the first training example and print it.
waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]
print(train_set[0])

  0%|          | 0.00/2.26G [00:00<?, ?B/s]

(tensor([[-0.0658, -0.0709, -0.0753,  ..., -0.0700, -0.0731, -0.0704]]), 16000, 'backward', '0165e0e8', 0)


In [8]:
# Sorted list of the distinct label words (element 2 of each item) in the validation split.
labels = sorted({datapoint[2] for datapoint in valid_set})

def label_to_index(word):
    """Return the position of `word` in `labels`, wrapped in a 0-dim tensor."""
    position = labels.index(word)
    return torch.tensor(position)

# Print every class index next to its word.
print(' '.join('{} {}'.format(label_to_index(word), word) for word in labels))

0 backward 1 bed 2 bird 3 cat 4 dog 5 down 6 eight 7 five 8 follow 9 forward 10 four 11 go 12 happy 13 house 14 learn 15 left 16 marvin 17 nine 18 no 19 off 20 on 21 one 22 right 23 seven 24 sheila 25 six 26 stop 27 three 28 tree 29 two 30 up 31 visual 32 wow 33 yes 34 zero


Torch Dataset

In [9]:
class Custom_CommandDataset(Dataset):
    """Fixed-length view over a speech-command dataset.

    Items of the wrapped dataset are indexed as tuples: element 0 is the
    waveform (indexed as [channel, time]) and element 2 is the label word.

    Args:
        dataset: The wrapped dataset.
        target_len: Number of samples in every returned waveform. Shorter
            clips are zero-padded at the end, longer clips are cut off.
    """
    def __init__(self, dataset, target_len = 16000):
        super().__init__()
        self.command_dataset = dataset
        self.target_len = target_len

    def __getitem__(self, idx):
        """Return `(waveform, label_index)` for item `idx`.

        `waveform` is a NumPy array of shape (1, target_len); `label_index` is
        the value `label_to_index` returns for the item's label word.
        """
        # Index the wrapped dataset once and reuse the item for both fields.
        sample = self.command_dataset[idx]

        # Keep the first channel only. Clip before padding so the pad width
        # can never be negative (np.pad rejects negative widths).
        raw_wav = np.asarray(sample[0][0])[:self.target_len]
        raw_wav_pad = np.pad(raw_wav, (0, self.target_len - raw_wav.shape[-1]))

        label_index = label_to_index(sample[2])
        # Per-sample data processing goes here (it runs on the CPU; keeping the
        # files in memory and processing them here would likely be efficient).

        return np.expand_dims(raw_wav_pad, 0), label_index

    def __len__(self):
        """Number of items in the wrapped dataset."""
        return len(self.command_dataset)

# Wrap both splits in Custom_CommandDataset (training first, then validation).
train_set_custom, valid_set_custom = map(Custom_CommandDataset, (train_set, valid_set))

In [10]:
# 확인용
data = train_set_custom[7203]
print(data[1])
ipd.Audio(data[0], rate=16000)

tensor(4)


Data Loader

In [11]:
batch_size = 32

# Settings shared by both loaders. Every worker process calls np.random.seed()
# without an argument when it starts.
loader_kwargs = dict(batch_size = batch_size,
                     pin_memory = True,
                     num_workers = 2,
                     worker_init_fn = lambda _: np.random.seed())

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(train_set_custom, shuffle = True, **loader_kwargs)
valid_loader = DataLoader(valid_set_custom, shuffle = False, **loader_kwargs)

Train / Valid 함수

In [12]:
def number_of_correct(pred, target):
    """Count the positions where `pred` (after squeezing) equals `target`; returns a Python number."""
    flat_pred = pred.squeeze()
    hits = flat_pred.eq(target)
    return hits.sum().item()

def get_likely_index(tensor):
    """Return the index of the largest entry along the last dimension."""
    return torch.argmax(tensor, dim=-1)
    
def train(model, ep, opt, lossmod, loader = None):
    """Run one optimisation pass over the training batches and print the mean loss.

    Args:
        model: Network to train; called as `model(wav)`.
        ep: Epoch number, used only in the printed summary.
        opt: Optimizer whose `zero_grad()` / `step()` are called per batch.
        lossmod: Callable `lossmod(output, label)` returning a scalar loss tensor.
        loader: Iterable of `(wav, label)` batches. Defaults to the
            notebook-level `train_loader`.
    """
    if loader is None:
        loader = train_loader

    model.train()
    losses = []
    for wav, label in tqdm(loader, position = 0):
        wav, label = wav.to(device), label.to(device)

        # Run the model that was passed in, not the notebook-global `net`,
        # so the function trains exactly what the caller asked for.
        z = model(wav)
        loss = lossmod(z, label)
        opt.zero_grad()
        loss.backward()
        opt.step()

        losses.append(loss.item())
    print('Epoch: {} Loss: {:.4f}'.format(ep, np.average(losses)))

def validation(model, ep, loader = None):
    """Measure classification accuracy of `model` and print it.

    Args:
        model: Network to evaluate; called as `model(data)`.
        ep: Epoch number, used only in the printed summary.
        loader: Iterable of `(data, target)` batches. Defaults to the
            notebook-level `valid_loader`.

    Returns:
        The accuracy as a float in [0, 1] (0.0 when the loader yields nothing).
    """
    if loader is None:
        loader = valid_loader

    model.eval()
    correct = 0
    total = 0
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for data, target in tqdm(loader, position = 0):
            data = data.to(device)
            target = target.to(device)

            # Evaluate the model that was passed in, not the notebook-global `net`.
            output = model(data)

            pred = get_likely_index(output)
            correct += number_of_correct(pred, target)
            # Count the samples actually seen instead of relying on a global dataset.
            total += target.numel()

    acc = correct / total if total else 0.0
    print("Valid epoch: {}, acc: {:.4f}".format(ep, acc))
    return acc

Model : 2D CNN

In [13]:
class ConvBlock2D(nn.Module):
    """Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d, applied in that order.

    Args:
        c_i: Input channels of the convolution.
        c_o: Output channels of the convolution (and BatchNorm features).
        k: Convolution kernel size.
        s: Convolution stride.
        pool_k: Max-pooling kernel size.
    """
    def __init__(self, c_i, c_o, k = 3, s = 1, pool_k = 2):
        super().__init__()
        layers = [
            nn.Conv2d(in_channels = c_i, out_channels = c_o, kernel_size = k, stride = s),
            nn.BatchNorm2d(num_features = c_o),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = pool_k),
        ]
        # Positional Sequential keeps the sub-module names "0".."3".
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        out = self.block(x)
        return out

class CNN2D(nn.Module):
    """Log-mel spectrogram front end, four ConvBlock2D stages, global average pooling, linear classifier.

    Args:
        c_in: Input channels of the first conv stage.
        c: Channel width of the first two stages; the last two use 2 * c.
        num_classes: Number of output logits.
    """
    def __init__(self, c_in, c, num_classes):
        super().__init__()
        self.spec = torchaudio.transforms.MelSpectrogram(sample_rate = 16000,
                                                         n_fft = 1024,
                                                         hop_length = 256,
                                                         n_mels = 128)
        stages = [
            ConvBlock2D(c_in, c),
            ConvBlock2D(c, c),
            ConvBlock2D(c, 2 * c),
            ConvBlock2D(2 * c, 2 * c),
        ]
        self.convnet = nn.Sequential(*stages)
        self.linear = nn.Linear(2 * c, num_classes)

    def forward(self, x):
        # Log-mel spectrogram: 1D audio -> 2D image-like representation.
        mel = self.spec(x)
        log_mel = torch.log10(mel + 1e-5)
        # 2D CNN over the spectrogram -> feature map C x F x T.
        feat_map = self.convnet(log_mel)
        # Average over the whole F x T plane -> C x 1 x 1, then drop the unit axes.
        pooled = F.avg_pool2d(feat_map, feat_map.shape[-2:])
        feat = rearrange(pooled, 'b c 1 1 -> b c')
        # Feature C -> logits N.
        return self.linear(feat)

In [14]:
# Build the classifier with one output per label, then move it to the selected device.
net = CNN2D(c_in = 1, c = 128, num_classes = len(labels))
net = net.to(device)
# Layer-by-layer summary for an input of shape (1, 1, 16000).
summary(net, input_size = (1, 1, 16000))

Layer (type:depth-idx)                   Output Shape              Param #
CNN2D                                    [1, 35]                   --
├─MelSpectrogram: 1-1                    [1, 1, 128, 63]           --
│    └─Spectrogram: 2-1                  [1, 1, 513, 63]           --
│    └─MelScale: 2-2                     [1, 1, 128, 63]           --
├─Sequential: 1-2                        [1, 256, 6, 2]            --
│    └─ConvBlock2D: 2-3                  [1, 128, 63, 30]          --
│    │    └─Sequential: 3-1              [1, 128, 63, 30]          1,536
│    └─ConvBlock2D: 2-4                  [1, 128, 30, 14]          --
│    │    └─Sequential: 3-2              [1, 128, 30, 14]          147,840
│    └─ConvBlock2D: 2-5                  [1, 256, 14, 6]           --
│    │    └─Sequential: 3-3              [1, 256, 14, 6]           295,680
│    └─ConvBlock2D: 2-6                  [1, 256, 6, 2]            --
│    │    └─Sequential: 3-4              [1, 256, 6, 2]            590,5

Training : 2D CNN

In [15]:
# Train `net` with Adam and cross-entropy for n_epoch epochs, running the
# validation pass after every training epoch. Epochs are numbered from 1.
n_epoch = 10
opt = torch.optim.Adam(net.parameters(), lr = 1e-3)
ce = nn.CrossEntropyLoss()
for epoch in range(1, n_epoch + 1):
    train(net, epoch, opt, ce)
    validation(net, epoch)

100%|██████████| 2652/2652 [01:15<00:00, 35.28it/s]


Epoch: 1 Loss: 1.1543


100%|██████████| 312/312 [00:10<00:00, 30.35it/s]


Valid epoch: 1, acc: 0.8303


100%|██████████| 2652/2652 [00:56<00:00, 47.12it/s]


Epoch: 2 Loss: 0.4815


100%|██████████| 312/312 [00:05<00:00, 53.26it/s]


Valid epoch: 2, acc: 0.8643


100%|██████████| 2652/2652 [00:56<00:00, 46.90it/s]


Epoch: 3 Loss: 0.3633


100%|██████████| 312/312 [00:05<00:00, 54.21it/s]


Valid epoch: 3, acc: 0.8963


100%|██████████| 2652/2652 [00:56<00:00, 47.15it/s]


Epoch: 4 Loss: 0.2939


100%|██████████| 312/312 [00:05<00:00, 53.06it/s]


Valid epoch: 4, acc: 0.8987


100%|██████████| 2652/2652 [00:57<00:00, 45.93it/s]


Epoch: 5 Loss: 0.2400


100%|██████████| 312/312 [00:05<00:00, 54.21it/s]


Valid epoch: 5, acc: 0.9260


100%|██████████| 2652/2652 [00:56<00:00, 46.90it/s]


Epoch: 6 Loss: 0.2018


100%|██████████| 312/312 [00:05<00:00, 54.28it/s]


Valid epoch: 6, acc: 0.9188


100%|██████████| 2652/2652 [00:55<00:00, 47.46it/s]


Epoch: 7 Loss: 0.1718


100%|██████████| 312/312 [00:05<00:00, 54.35it/s]


Valid epoch: 7, acc: 0.9149


100%|██████████| 2652/2652 [00:56<00:00, 47.31it/s]


Epoch: 8 Loss: 0.1480


100%|██████████| 312/312 [00:05<00:00, 53.94it/s]


Valid epoch: 8, acc: 0.9243


100%|██████████| 2652/2652 [00:57<00:00, 46.26it/s]


Epoch: 9 Loss: 0.1282


100%|██████████| 312/312 [00:05<00:00, 53.16it/s]


Valid epoch: 9, acc: 0.9243


100%|██████████| 2652/2652 [01:01<00:00, 42.96it/s]


Epoch: 10 Loss: 0.1133


100%|██████████| 312/312 [00:05<00:00, 54.69it/s]

Valid epoch: 10, acc: 0.9141





For Evaluation

In [16]:
# List the available evaluation folders of the public test set.
!ls drive/MyDrive/public_set/npy

ㅁㄴㅇ

0  1  2


In [17]:
# Root of the .npy public test set (relative to the working directory) and the
# sub-folder to evaluate.
public_testset_dir = 'drive/MyDrive/public_set/npy'
folder = '1'

In [18]:
# Collect the waveform files of the chosen folder in sorted order. The label
# file `ans.npy` lives in the same folder and also matches '*.npy', so it is
# excluded by name rather than by assuming it is the last entry after sorting.
eval_dir = os.path.join(public_testset_dir, folder)
ans_path = os.path.join(eval_dir, 'ans.npy')
wav_npy_list = sorted(path for path in glob.glob(os.path.join(eval_dir, '*.npy'))
                      if os.path.basename(path) != 'ans.npy')

# Labels for the folder; the evaluation loop indexes this array by file position.
ans_npy = np.load(ans_path)

In [19]:
# Evaluate `net` on every file of the public test folder, one clip at a time,
# collecting the predicted class index of each clip in `preds`.
with torch.no_grad():
    correct = 0
    net.eval()
    preds = []
    # The answer tensor does not change between files, so build it once
    # instead of re-creating and re-uploading it on every iteration.
    ans = torch.tensor(ans_npy).to(device)
    for i, wav_npy_dir in enumerate(tqdm(wav_npy_list, position = 0, leave = True)):
        wav_npy = np.load(wav_npy_dir)
        # Reshape the loaded array to (1, 1, length) before feeding the network.
        wav_npy = torch.tensor(np.reshape(wav_npy, (1, 1, wav_npy.shape[0]))).to(device)

        output = net(wav_npy)

        pred = get_likely_index(output)
        # ans[i] is the label at the same position as this file in the sorted list.
        correct += number_of_correct(pred, ans[i])

        preds.append(pred.detach().cpu().numpy()[0])
    # max(..., 1) avoids a ZeroDivisionError when the folder holds no waveform files.
    print("Acc: {:.4f}".format(correct / max(len(wav_npy_list), 1)))

100%|██████████| 51/51 [00:19<00:00,  2.64it/s]

Acc: 0.4706





In [20]:
# Show the first element of `pred` (left over from the evaluation loop) as a NumPy scalar.
pred.detach().cpu().numpy()[0]

26

Save predict.npy \
data shape을 꼭 확인하세요!

In [21]:
# np.save('predict.npy', np.asarray(preds))