In [31]:
from torch.autograd import Variable
import torch.nn.functional as F
import torch.nn as nn
import torch
import torchvision
from torchvision import datasets, transforms
import torch.utils.data as data
import torchvision.models as models
import matplotlib.image as pli
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from PIL import Image
from PIL import ImageOps
from PIL import ImageEnhance
import random
import math
import pickle
import glob
import librosa
import os
import time
import scipy.signal as ss

# Expose only physical GPU 1 to this process; this has to be set before CUDA is
# first initialised, hence its position ahead of the availability check.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
print(torch.cuda.is_available())

# Dataset layout: <path>/<label>/<recording id>/<file>.pkl
path = './dataset/train'
# Only directories are class folders. os.listdir also returns plain files
# (archives, notes, OS metadata); such an entry would become a class without
# any recordings and break sampling as well as the size of the classifier head.
# NOTE(review): os.listdir order is arbitrary, so the label -> index mapping can
# differ between machines; it is left unsorted here so that indices keep
# matching the existing checkpoint. Consider persisting the label order.
labels = [entry for entry in os.listdir(path)
          if os.path.isdir(os.path.join(path, entry))]
audio_files = {label: glob.glob(f'{path}/{label}/*/*.pkl') for label in labels}
print(labels)

# Set to True to print shapes and plot intermediate signals for every sample.
is_plot = False

# Size of one network input: frequency bins x STFT frames.
freq_length = 57
time_length = 221
# Number of random draws that make up one pass over the dataset.
trainingset_size = 10000
batch_size = 64 if torch.cuda.is_available() else 8

True
['toothpaste_box', 'whiteboard_spray', 'toy_elephant', 'green_basketball', '061_foam_brick', 'shiny_toy_gun', 'salt_cylinder', 'strawberry', 'stanley_screwdriver', 'yellow_block']


In [32]:

class ImageSet(data.Dataset):
    """Dataset of fixed-size, four-channel STFT magnitude maps cut from recordings.

    Items are not tied to particular files: ``index`` only selects the class
    (``index % len(labels)``) and a recording of that class is then drawn at
    random, so the same index can yield different samples on different calls.

    Relies on the notebook-level globals ``labels``, ``audio_files``,
    ``freq_length``, ``time_length``, ``trainingset_size`` and ``is_plot``.
    """

    # Number of STFT frames kept ahead of the temporal midpoint when cropping.
    _FRAMES_BEFORE_MID = 100

    def __init__(self):
        # Samples are drawn at random, so the dataset has no natural size; one
        # pass is defined as the configured number of draws.
        self.length = trainingset_size

    @staticmethod
    def _normalise(spectrogram):
        """Scale ``spectrogram`` so its peak is 1; an all-zero input is returned as is."""
        peak = np.max(spectrogram)
        if peak > 0:
            return spectrogram / peak
        # Silent recording: dividing by the zero peak would fill the map with NaNs.
        return spectrogram

    @staticmethod
    def _crop_time(spectrogram, start, width):
        """Cut ``width`` frames starting at ``start`` along the last axis.

        Frames that fall outside the available range are filled with zeros, so
        the result always has exactly ``width`` frames. When the window lies
        completely inside the input, the plain slice is returned.
        """
        total = spectrogram.shape[-1]
        lo = min(max(start, 0), total)
        hi = min(max(start + width, lo), total)
        window = spectrogram[..., lo:hi]
        if window.shape[-1] == width:
            return window
        # Offset of the first real frame inside the padded output.
        pad_before = min(max(-start, 0), width)
        padded = np.zeros(spectrogram.shape[:-1] + (width,), dtype=spectrogram.dtype)
        padded[..., pad_before:pad_before + window.shape[-1]] = window
        return padded

    def __getitem__(self, index):
        """Return ``(audio_map, label)`` for a randomly drawn recording.

        ``label`` is ``index % len(labels)``. ``audio_map`` holds the
        peak-normalised STFT magnitudes of the four audio channels, restricted
        to the lowest ``freq_length`` frequency bins and to ``time_length``
        frames beginning 100 frames before the temporal midpoint; recordings
        too short to fill that window are zero-padded.
        """
        label = index % len(labels)
        audio_file = random.choice(audio_files[labels[label]])

        # NOTE(review): with allow_pickle=True np.load unpickles the file, which
        # can execute arbitrary code -- only use recordings from a trusted source.
        record = np.load(audio_file, allow_pickle=True)
        audio = record['audio']

        # Every channel is resampled to a quarter of its length before the STFT.
        target_len = audio.shape[0] // 4
        channel_maps = []
        for channel in range(4):
            audio_resample = ss.resample(audio[:, channel], target_len)
            stft_re = ss.stft(audio_resample, nperseg=512, noverlap=384)[2]
            channel_maps.append(np.abs(stft_re))
        # Normalise jointly over all channels of the whole recording (done
        # before cropping, so the peak may lie outside the returned window).
        stft_result = self._normalise(np.array(channel_maps))

        time_mid = int(stft_result.shape[2] / 2)
        time_left = time_mid - self._FRAMES_BEFORE_MID
        time_right = time_left + time_length
        audio_map = self._crop_time(stft_result[:, 0:freq_length], time_left, time_length)

        if is_plot:
            print(audio_file)
            print(f'audio shape = {audio.shape}')
            plt.plot(audio[:, 3])
            plt.show()
            # audio_resample still holds the last (fourth) channel here.
            print(f'audio_resample.shape = {audio_resample.shape}')
            plt.plot(audio_resample)
            plt.show()
            print(f'stft_result.shape = {stft_result.shape}')
            print(time_left)
            print(time_right)
            print(f'audio_map.shape = {audio_map.shape}')

            for channel_map in audio_map[:4]:
                plt.imshow(channel_map, cmap='gray')
                plt.show()

        return audio_map, label

    def __len__(self):
        """Number of random draws that make up one pass over the dataset."""
        return self.length

# Batch the randomly sampled spectrogram maps; indices are visited in shuffled order.
train_loader = data.DataLoader(
    dataset=ImageSet(),
    shuffle=True,
    batch_size=batch_size,
)

In [36]:
class AudioCNN(nn.Module):
    """Five-stage convolutional classifier for 4-channel spectrogram maps.

    Each stage is Conv2d -> BatchNorm2d -> ReLU. The stages are followed by
    global average pooling and one linear layer producing the class scores
    (raw logits, no softmax).

    Args:
        num_classes: Number of output classes. When omitted, the length of the
            notebook-level ``labels`` list is used, as before.
    """

    def __init__(self, num_classes=None):
        super(AudioCNN, self).__init__()
        if num_classes is None:
            # Backward-compatible default: one output per dataset class folder.
            num_classes = len(labels)

        # The trailing comments give the feature-map size (height x width)
        # after each stage for a 57 x 221 input.
        self.layer1 = self._conv_block(4, 64, kernel_size=(3, 11))  # 55 x 211
        self.layer2 = self._conv_block(self.layer1[0].out_channels, 64,
                                       kernel_size=(3, 10), stride=(2, 3))  # 27 x 68
        self.layer3 = self._conv_block(self.layer2[0].out_channels, 128,
                                       kernel_size=(3, 5))  # 25 x 64
        self.layer4 = self._conv_block(self.layer3[0].out_channels, 128,
                                       kernel_size=(3, 7), stride=(2, 3))  # 12 x 20
        self.layer5 = self._conv_block(self.layer4[0].out_channels, 256,
                                       kernel_size=(3, 5))  # 10 x 16
        # Pooling down to 1 x 1 makes the head independent of the spatial size.
        self.avg_pool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        self.fc = nn.Linear(self.layer5[0].out_channels, num_classes)

    @staticmethod
    def _conv_block(in_channels, out_channels, kernel_size, stride=1):
        """Build one Conv2d -> BatchNorm2d -> ReLU stage.

        The sub-module order is fixed so that state_dict keys keep the form
        ``layerN.0.*`` (convolution) and ``layerN.1.*`` (batch norm).
        """
        return nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                      kernel_size=kernel_size, stride=stride),
            nn.BatchNorm2d(out_channels),
            nn.ReLU()
        )

    def forward(self, input):
        """Map a ``(batch, 4, height, width)`` tensor to ``(batch, num_classes)`` logits."""
        out = input
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4, self.layer5):
            out = stage(out)
        out = self.avg_pool(out)
        # Drop the 1 x 1 spatial dimensions left by the pooling.
        out = out.reshape(out.size(0), -1)
        return self.fc(out)

In [37]:
# Build the network; its output layer is sized from the global `labels` list.
convNet = AudioCNN()

In [38]:
# Resume from the previously saved weights. map_location='cpu' lets a checkpoint
# that was written from GPU tensors load on a machine without CUDA as well; the
# values are copied into the model's existing parameters either way.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
state_dict = torch.load('./ConvNet.model', map_location='cpu')
convNet.load_state_dict(state_dict)

<All keys matched successfully>

In [39]:
# Put the model on its device before the optimizer is created, so the optimizer
# is built over the parameters in the location where they are trained.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')
convNet = convNet.to(device)
convNet.train()

loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(convNet.parameters(), lr=0.01)

# One pass over the loader; every batch is a fresh random draw of recordings.
for i, (imgs, lbs) in enumerate(train_loader):
    # The dataset yields NumPy arrays, which are collated in their own dtype;
    # the network weights are float32.
    imgs = imgs.float().to(device)
    lbs = lbs.to(device)
    outputs = convNet(imgs)
    loss = loss_func(outputs, lbs)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Softmax is monotonic, so the arg-max of the raw logits is the prediction.
    # These are the outputs from before this step's weight update.
    predict = torch.argmax(outputs, dim=1)
    if i % 2 == 0:
        accuracy = (predict == lbs).sum().item() / lbs.size(0)
        print(f"i = {i},  loss = {loss.item()},  accuracy = {accuracy}")

i = 0,  loss = 1.1396876573562622,  accuracy = 0.734375
i = 2,  loss = 1.2295805215835571,  accuracy = 0.5
i = 4,  loss = 1.2568227052688599,  accuracy = 0.609375
i = 6,  loss = 1.0423927307128906,  accuracy = 0.640625
i = 8,  loss = 1.0872704982757568,  accuracy = 0.671875
i = 10,  loss = 0.9369926452636719,  accuracy = 0.640625
i = 12,  loss = 0.9394078254699707,  accuracy = 0.609375
i = 14,  loss = 1.0458985567092896,  accuracy = 0.59375
i = 16,  loss = 1.040963888168335,  accuracy = 0.59375
i = 18,  loss = 0.9407866597175598,  accuracy = 0.6875
i = 20,  loss = 0.8923607468605042,  accuracy = 0.703125
i = 22,  loss = 0.8709472417831421,  accuracy = 0.65625
i = 24,  loss = 0.7210776805877686,  accuracy = 0.75
i = 26,  loss = 0.783420741558075,  accuracy = 0.734375
i = 28,  loss = 0.8580514192581177,  accuracy = 0.59375
i = 30,  loss = 0.8207448720932007,  accuracy = 0.734375
i = 32,  loss = 0.7845686078071594,  accuracy = 0.671875
i = 34,  loss = 0.6073460578918457,  accuracy = 0.781

In [40]:
# Save the model. Use with care: this overwrites the model already stored in the file.
torch.save(convNet.state_dict(), './ConvNet.model')