In [2]:
#!/usr/bin/env python
"""Test a pretrained CNN for Google speech commands."""

__author__ = 'Yuan Xu, Erdene-Ochir Tuguldur'

import argparse
import time
import csv
import os

from tqdm import *

import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader

from torchvision.transforms import *
import torchnet

from datasets import *
from transforms import *

In [3]:
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo

In [74]:
def conv3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 3x3 ``nn.Conv2d`` that uses a padding of 1.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        stride: convolution stride (defaults to 1).

    Returns:
        The freshly constructed ``nn.Conv2d`` layer.
    """
    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_options)


class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions.

    Main branch: conv3x3 -> BatchNorm -> ReLU -> conv3x3 -> BatchNorm.
    The shortcut (the input itself, or ``downsample(input)`` when a
    ``downsample`` module is supplied) is added to the main branch and the
    sum goes through a final ReLU.

    Args:
        inplanes: channels of the incoming tensor.
        planes: channels produced by both convolutions.
        stride: stride of the first convolution (the second always uses 1).
        downsample: optional module applied to the input to build the shortcut.
    """

    # Output channels are ``planes * expansion``.
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # Sub-module names and registration order are left untouched.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        # Main branch.
        features = self.relu(self.bn1(self.conv1(x)))
        features = self.bn2(self.conv2(features))

        # Shortcut branch: identity unless a downsample module was given.
        shortcut = x if self.downsample is None else self.downsample(x)

        features += shortcut
        return self.relu(features)


class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1 -> 3x3 -> 1x1 convolutions.

    Each convolution is followed by BatchNorm; the first two are also
    followed by ReLU. The last convolution widens the tensor to
    ``planes * 4`` channels. The shortcut (the input itself, or
    ``downsample(input)`` when provided) is added before the final ReLU.

    Args:
        inplanes: channels of the incoming tensor.
        planes: channels of the two inner convolutions.
        stride: stride of the 3x3 convolution.
        downsample: optional module applied to the input to build the shortcut.
    """

    # Output channels are ``planes * expansion``.
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        widened = planes * 4
        # Sub-module names and registration order are left untouched.
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, widened, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(widened)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        # Main branch.
        features = self.relu(self.bn1(self.conv1(x)))
        features = self.relu(self.bn2(self.conv2(features)))
        features = self.bn3(self.conv3(features))

        # Shortcut branch: identity unless a downsample module was given.
        shortcut = x if self.downsample is None else self.downsample(x)

        features += shortcut
        return self.relu(features)


class ResNet(nn.Module):
    """Compact three-stage residual network with 45-channel stages.

    Pipeline: 3x3 stem convolution (padding 3) -> BatchNorm -> ReLU ->
    max-pool with a (4, 3) window and stride 1 -> three residual stages, each
    entered with stride 2 -> adaptive average pool to a (2, 1) grid -> linear
    classifier.

    Args:
        block: residual block class. It must expose an ``expansion`` class
            attribute and accept ``(inplanes, planes, stride, downsample)``.
        layers: sequence whose first three entries give the number of blocks
            in stage 1, 2 and 3.
        num_classes: size of the classifier output.
        in_channels: channels of the input tensor.
    """

    def __init__(self, block, layers, num_classes=1000, in_channels=2):
        super(ResNet, self).__init__()
        # Channel count feeding the next stage; updated by _make_layer.
        self.inplanes = 45
        self.conv1 = nn.Conv2d(in_channels, 45, kernel_size=3, stride=1, bias=False, padding=3)
        self.bn1 = nn.BatchNorm2d(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=(4, 3), stride=1)
        self.layer1 = self._make_layer(block, 45, layers[0], stride=2)
        self.layer2 = self._make_layer(block, 45, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 45, layers[2], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((2, 1))
        # The pooled map has 2 * 1 positions per channel and the last stage
        # emits 45 * block.expansion channels. This equals the previous
        # hard-coded 90 when expansion == 1, and also fits wider blocks.
        self.fc = nn.Linear(45 * block.expansion * 2 * 1, num_classes)

        # He-style initialisation for convolutions, identity for BatchNorm.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks.

        Only the first block receives ``stride`` and the projection shortcut;
        the remaining blocks use the block's defaults. ``self.inplanes`` is
        updated to the stage's output channel count.
        """
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # Projection shortcut: 1x1 conv -> BatchNorm -> ReLU. The trailing
            # ReLU is kept unchanged to preserve the network's behaviour.
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
                nn.ReLU(inplace=True)
            )

        layers = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the classifier output of shape ``(batch, num_classes)``."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.avgpool(x)
        # Flatten (batch, channels, 2, 1) into (batch, channels * 2).
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

In [91]:
class Arg():
    """Notebook replacement for the command-line arguments of the test script.

    Every setting can be overridden through a keyword argument; calling
    ``Arg()`` with no arguments yields the notebook's original configuration.

    Attributes:
        dataset_dir: directory handed to ``SpeechCommandsDataset``.
            NOTE(review): the default is an absolute, machine-specific path
            ending in ``train`` -- confirm this is the intended evaluation set.
        batch_size: batch size given to the ``DataLoader``.
        dataload_workers_nums: ``num_workers`` given to the ``DataLoader``.
        input: feature selector; ``"mel40"`` selects 40 mel bands, any other
            value falls back to 32.
        multi_crop: whether ``test()`` expands each batch with ``multi_crop``.
        model: path of the checkpoint passed to ``torch.load``.
    """

    def __init__(self,
                 dataset_dir="/home/cilab/LabMembers/DJ/sr_dataset/speech_command/train",
                 batch_size=128,
                 dataload_workers_nums=3,
                 input="mel40",
                 multi_crop=True,
                 model="./best-acc-resnet18.pth"):
        self.dataset_dir = dataset_dir
        self.batch_size = batch_size
        self.dataload_workers_nums = dataload_workers_nums
        self.input = input
        self.multi_crop = multi_crop
        self.model = model

args = Arg()

In [92]:
print("loading model...")
# NOTE(security): torch.load unpickles arbitrary Python objects -- only load
# checkpoints that come from a trusted source.
# The checkpoint holds a whole pickled module rather than a state_dict, so the
# model classes defined in this notebook must be available under the same names.
# map_location keeps the load working on hosts without CUDA.
model = torch.load(args.model,
                   map_location=None if torch.cuda.is_available() else 'cpu')
if isinstance(model, torch.nn.DataParallel) and not torch.cuda.is_available():
    # A DataParallel wrapper restored from a GPU checkpoint still refers to
    # CUDA devices; on a CPU-only host run the wrapped module directly.
    model = model.module
model.float()
print(model)

loading model...
DataParallel(
  (module): ResNet(
    (conv1): Conv2d(1, 45, kernel_size=(3, 3), stride=(1, 1), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=(4, 3), stride=1, padding=0, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(45, 45, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(45, 45, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (downsample): Sequential(
          (0): Conv2d(45, 45, kernel_size=(1, 1), stride=(2, 2), bias=False)
          (1): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=True

In [93]:
def count_parameters(model):
    """Return the total element count of ``model``'s trainable parameters.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    trainable_elements = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_elements += param.numel()
    return trainable_elements

# Display the number of trainable parameters of the loaded model.
count_parameters(model)

227712

In [94]:
#qmodel = torch.quantization.quantize_dynamic(model, dtype=torch.qint8)

In [95]:
#import sys
#sys.getsizeof(qmodel)

In [96]:
# Use the GPU whenever PyTorch reports one as available.
use_gpu = torch.cuda.is_available()
print('use_gpu', use_gpu)
if use_gpu:
    torch.backends.cudnn.benchmark = True
    model.cuda()

# 'mel40' selects 40 mel bands; any other input setting falls back to 32.
n_mels = 40 if args.input == 'mel40' else 32

dataset_dir = args.dataset_dir

# Feature extraction chain: mel spectrogram -> MFCC -> tensor stored under 'input'.
feature_transform = Compose([
    ToMelSpectrogram(n_mels=n_mels),
    ToMfccFromSpectrogram(n_mels=n_mels),
    ToTensor('mfcc', 'input'),
])
# Full per-sample pipeline: load audio, fix its length, then extract features.
transform = Compose([LoadAudio(), FixAudioLength(), feature_transform])

test_dataset = SpeechCommandsDataset(dataset_dir, transform, silence_percentage=0)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=args.batch_size,
    sampler=None,
    pin_memory=use_gpu,
    num_workers=args.dataload_workers_nums,
)

# Loss function; no active code in the visible cells calls it.
criterion = torch.nn.CrossEntropyLoss()

use_gpu True


In [97]:
def multi_crop(inputs):
    """Append three shifted, replicate-padded crops of a 4-D batch to itself.

    Three windows that are two columns narrower than the input are cut along
    the last dimension at offsets 0, 1 and 2. Each window is padded back to
    the original width by replicating its edge columns, and all windows are
    concatenated after the untouched inputs along dimension 0, so the result
    holds four times as many samples as ``inputs``.
    """
    border = 1
    crop_width = inputs.size(3) - 2 * border

    # One window per horizontal offset.
    windows = []
    for shift in range(3):
        start = shift * border
        windows.append(inputs[:, :, :, start:start + crop_width])

    # Fold the per-offset axis into the batch axis.
    crops = torch.stack(windows).view(-1, inputs.size(1), inputs.size(2), crop_width)
    # Restore the original width by repeating the left/right edge columns.
    crops = torch.nn.functional.pad(crops, (border, border, 0, 0), mode='replicate')
    return torch.cat((inputs, crops.data))

def test():
    """Evaluate the global ``model`` on ``test_dataloader`` and print a report.

    Reads the module-level ``model``, ``test_dataloader``, ``args``,
    ``use_gpu`` and ``CLASSES``. When ``args.multi_crop`` is set, every batch
    is expanded with ``multi_crop`` and the class probabilities of all
    variants of a sample are averaged.

    Prints the accuracy (in percent) and the confusion matrix.

    Returns:
        ``(probabilities, predictions)``: two dicts keyed by the entries of
        ``batch['path']``. ``probabilities`` maps to a list of per-class
        probabilities, ``predictions`` to the predicted class index (a 0-dim
        tensor).
    """
    model.eval()  # Set model to evaluate mode

    correct = 0
    total = 0
    confusion_matrix = torchnet.meter.ConfusionMeter(len(CLASSES))
    predictions = {}
    probabilities = {}

    pbar = tqdm(test_dataloader, unit="audios", unit_scale=test_dataloader.batch_size)
    # torch.no_grad() replaces Variable(..., volatile=True), which no longer
    # has any effect, so that no autograd graph is built during inference.
    with torch.no_grad():
        for batch in pbar:
            # Add the channel dimension expected by the model's first conv.
            inputs = torch.unsqueeze(batch['input'], 1)
            targets = batch['target']

            n = inputs.size(0)
            if args.multi_crop:
                inputs = multi_crop(inputs)

            if use_gpu:
                inputs = inputs.cuda()
                # `async` is a reserved word since Python 3.7; `non_blocking`
                # is the current name of the same option.
                targets = targets.cuda(non_blocking=True)

            # forward
            outputs = model(inputs)
            outputs = torch.nn.functional.softmax(outputs, dim=1)

            if args.multi_crop:
                # Rows are ordered variant-major: (variants, n, classes).
                outputs = outputs.view(-1, n, outputs.size(1))
                # The mean of probability vectors is already a probability
                # vector. The former second softmax only flattened the
                # reported probabilities (the argmax is unaffected), so it
                # is no longer applied.
                outputs = torch.mean(outputs, dim=0)

            # statistics
            pred = outputs.max(1)[1]
            correct += (pred == targets).sum().item()
            total += targets.size(0)
            confusion_matrix.add(pred, targets)
            filenames = batch['path']
            for j in range(len(pred)):
                fn = filenames[j]
                predictions[fn] = pred[j]
                probabilities[fn] = outputs[j].tolist()

    # Guard against an empty loader instead of raising ZeroDivisionError.
    accuracy = correct / total if total else 0.0
    print("accuracy: %f%%" % (100*accuracy))
    print("confusion matrix:")
    print(confusion_matrix.value())

    return probabilities, predictions

In [98]:
# Run the evaluation and keep the per-file class probabilities and predictions.
print("testing...")
probabilities, predictions = test()

  0%|          | 0/51200 [00:00<?, ?audios/s]

testing...


100%|██████████| 51200/51200 [00:33<00:00, 1531.47audios/s]

accuracy: 96.948403%
confusion matrix:
[[31406    73    54    55   109    78    88   212    98    42    94   241]
 [    0     0     0     0     0     0     0     0     0     0     0     0]
 [    2     6  1841     1     6     0     2     0     1     0     1     0]
 [   11     4     1  1810     4     6     2     2     1     3     3     6]
 [    8    14     1     0  1795     0     2     2     3    11     4     3]
 [    7     4     0     6     3  1809     2     2     4     0     3     2]
 [   13     5     3     1     6     0  1802     2     2     0     3     2]
 [    8     9     1     1     2     1     5  1814     3     2     1     5]
 [   26    10     1     2     6     1     2     3  1805     5     2     1]
 [   14     7     3     0    22     2     3     2     8  1771     3     4]
 [    5     4     2     3     4     4     2     0     3     0  1857     1]
 [   13     7     2     4     5     2     2     1     5     0     1  1819]]



