In [7]:
from torch.autograd import Variable
import torch.nn.functional as F
import torch.nn as nn
import torch
import torchvision
from torchvision import datasets, transforms
import torch.utils.data as data
import torchvision.models as models
import matplotlib.image as pli
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from PIL import Image
from PIL import ImageOps
from PIL import ImageEnhance
import random
import math
import pickle
import glob
import librosa
import os
import time

# Restrict this process to one physical GPU. The variable only takes effect
# if it is set before CUDA is initialised.
# NOTE(review): the index "1" is machine-specific -- confirm on other hosts.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
print(torch.cuda.is_available())

# Root of the training data: one sub-directory per class.
path = './dataset/train'

# Class names, in the order os.listdir reports them. Only real, non-hidden
# directories are kept so that stray entries (e.g. .DS_Store or
# .ipynb_checkpoints) do not become bogus classes with empty file lists.
# NOTE(review): os.listdir order is arbitrary, and the class index used by the
# model is the position in this list. Sorting would make the mapping
# deterministic but would change the indices baked into an existing checkpoint.
labels = [
    name for name in os.listdir(path)
    if os.path.isdir(os.path.join(path, name)) and not name.startswith('.')
]

# Per-class file lists. Sub-directories whose names start with two digits
# feed training; sub-directories named with a single digit feed validation.
audio_train_files = {
    label: glob.glob(f'{path}/{label}/[0-9][0-9]*/*.pkl') for label in labels
}
audio_val_files = {
    label: glob.glob(f'{path}/{label}/[0-9]/*.pkl') for label in labels
}
print(labels)

# Passed straight through to STFT(); presumably toggles plotting -- TODO confirm.
is_plot = False

# Size arguments handed to STFT(); the shape comments in AudioCNN assume a
# 57 x 221 input.
freq_length = 57
time_length = 221

# Nominal number of samples per pass reported by the train / val datasets.
trainingset_size = 10000
val_set_size = 100

# Smaller batches when falling back to CPU.
batch_size = 64 if torch.cuda.is_available() else 8

True
['toothpaste_box', 'whiteboard_spray', 'toy_elephant', 'green_basketball', '061_foam_brick', 'shiny_toy_gun', 'salt_cylinder', 'strawberry', 'stanley_screwdriver', 'yellow_block']


In [8]:
from STFT import STFT
class ImageSet(data.Dataset):
    """Dataset yielding ``(STFT map, label index)`` pairs for training or validation.

    The dataset has a fixed nominal length (``trainingset_size`` or
    ``val_set_size``). The class of item ``index`` is always
    ``index % len(labels)``.

    Args:
        behav: ``'train'`` or ``'val'``.

    Raises:
        ValueError: if ``behav`` is neither ``'train'`` nor ``'val'``.
    """

    def __init__(self, behav):
        if behav == 'train':
            self.length = trainingset_size
        elif behav == 'val':
            self.length = val_set_size
        else:
            # ValueError is still an Exception, so broad handlers keep working.
            raise ValueError(f"behav must be 'train' or 'val', got {behav!r}")
        self.behav = behav

    def __getitem__(self, index):
        """Return ``(audio_map, label)`` for ``index``.

        In ``'train'`` mode a file of the selected class is drawn at random,
        so the index only determines the label. In ``'val'`` mode the file is
        chosen deterministically: consecutive indices step through the
        classes, and every ``len(labels)`` indices advance to the next file
        of each class, wrapping around.
        """
        label = index % len(labels)
        class_name = labels[label]
        if self.behav == 'train':
            audio_file = random.choice(audio_train_files[class_name])
        else:
            # 'val' is the only other mode __init__ accepts.
            val_files = audio_val_files[class_name]
            audio_file = val_files[(index // len(labels)) % len(val_files)]
        # STFT is a project helper; presumably it loads the pickle and returns
        # a spectrogram-like array sized by freq_length/time_length -- TODO confirm.
        audio_map = STFT(audio_file, is_plot, freq_length, time_length)
        return audio_map, label

    def __len__(self):
        """Return the nominal number of samples for the selected mode."""
        return self.length

# Batched, shuffled iterator over the training dataset. In 'train' mode the
# dataset picks a random file per item, so the index order only affects labels.
train_loader = data.DataLoader(
    dataset=ImageSet('train'),
    batch_size=batch_size,
    shuffle=True,
)

In [9]:
class AudioCNN(nn.Module):
    """Five-stage convolutional classifier over multi-channel 2-D maps.

    Every stage is ``Conv2d -> BatchNorm2d -> ReLU``. The final feature map is
    reduced with global average pooling and fed to a single linear layer.
    Submodule names and positions (``layer1`` .. ``layer5``, ``fc``) are part
    of the checkpoint format and must not change.

    Args:
        num_classes: number of output logits. Defaults to ``len(labels)``
            (the module-level class list), which is the original behaviour.
        in_channels: number of channels of the input map. Defaults to 4.
    """

    def __init__(self, num_classes=None, in_channels=4):
        super().__init__()
        if num_classes is None:
            num_classes = len(labels)

        # Trailing comments give (height, width) after each convolution for a
        # 57 x 221 input.
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=64,
                      kernel_size=(3, 11)),  # 55 x 211
            nn.BatchNorm2d(64),
            nn.ReLU()
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=64,
                      kernel_size=(3, 10), stride=(2, 3)),  # 27 x 68
            nn.BatchNorm2d(64),
            nn.ReLU()
        )
        self.layer3 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=128,
                      kernel_size=(3, 5)),  # 25 x 64
            nn.BatchNorm2d(128),
            nn.ReLU()
        )
        self.layer4 = nn.Sequential(
            nn.Conv2d(in_channels=128, out_channels=128,
                      kernel_size=(3, 7), stride=(2, 3)),  # 12 x 20
            nn.BatchNorm2d(128),
            nn.ReLU()
        )
        self.layer5 = nn.Sequential(
            nn.Conv2d(in_channels=128, out_channels=256,
                      kernel_size=(3, 5)),  # 10 x 16
            nn.BatchNorm2d(256),
            nn.ReLU()
        )
        # Collapse the remaining spatial extent to one value per channel.
        self.avg_pool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        self.fc = nn.Linear(256, num_classes)

    def forward(self, input):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            input: tensor of shape ``(batch, in_channels, height, width)``.
        """
        out = self.layer1(input)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        out = self.avg_pool(out)
        # (batch, 256, 1, 1) -> (batch, 256)
        out = out.reshape(out.size(0), -1)
        return self.fc(out)

In [10]:
# Build the network with freshly initialised weights; its output layer has one
# unit per entry in `labels`.
convNet = AudioCNN()

In [11]:
# Restore previously saved weights. map_location='cpu' lets a checkpoint that
# was written from a GPU run load on a CPU-only machine as well; the model is
# still on the CPU here and is moved to the active device in a later cell.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
state_dict = torch.load('./ConvNet.model', map_location='cpu')
convNet.load_state_dict(state_dict)

<All keys matched successfully>

In [12]:
# One pass over train_loader, printing loss and batch accuracy every 2 steps.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')
# Move the model to its device before the optimizer is built, so the optimizer
# is created over the parameters that will actually be trained.
convNet = convNet.to(device)
convNet.train()

loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(convNet.parameters(), lr=0.01)

for i, (imgs, lbs) in enumerate(train_loader):
    imgs = imgs.float().to(device)
    lbs = lbs.to(device)
    outputs = convNet(imgs)
    loss = loss_func(outputs, lbs)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Softmax is monotonic, so the argmax of the raw logits is the same class.
    predict = torch.argmax(outputs, dim=1)
    if i % 2 == 0:
        # Count matches with a single tensor reduction; Python's builtin sum
        # would iterate over the tensor one element at a time.
        correct = (lbs == predict).sum().item()
        print(f"i = {i},  loss = {loss.item()},  accuracy = {correct / lbs.size(0)}")

i = 0,  loss = 0.426588773727417,  accuracy = 0.828125
i = 2,  loss = 0.36382123827934265,  accuracy = 0.84375
i = 4,  loss = 0.47178855538368225,  accuracy = 0.875
i = 6,  loss = 0.19413381814956665,  accuracy = 0.953125
i = 8,  loss = 0.31345587968826294,  accuracy = 0.921875
i = 10,  loss = 0.44079986214637756,  accuracy = 0.8125
i = 12,  loss = 0.20072267949581146,  accuracy = 0.90625
i = 14,  loss = 0.2631858289241791,  accuracy = 0.890625
i = 16,  loss = 0.40339550375938416,  accuracy = 0.859375
i = 18,  loss = 0.3499709963798523,  accuracy = 0.84375
i = 20,  loss = 0.2183709442615509,  accuracy = 0.953125
i = 22,  loss = 0.2717897891998291,  accuracy = 0.9375
i = 24,  loss = 0.1825321763753891,  accuracy = 0.9375
i = 26,  loss = 0.18134421110153198,  accuracy = 0.953125
i = 28,  loss = 0.33332014083862305,  accuracy = 0.890625
i = 30,  loss = 0.3157309591770172,  accuracy = 0.875
i = 32,  loss = 0.2941773533821106,  accuracy = 0.921875
i = 34,  loss = 0.298532634973526,  accurac

KeyboardInterrupt: 

In [14]:
# Save the model. Use with care: this overwrites the model stored in the file.
torch.save(convNet.state_dict(), './ConvNet.model')

In [13]:
# Validation pass: fixed order, 50 samples per batch, per-batch accuracy.
val_loader = data.DataLoader(ImageSet('val'), batch_size=50, shuffle=False)

convNet.eval()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')
convNet = convNet.to(device)

# No gradients are needed for evaluation; skipping autograd bookkeeping saves
# memory and time.
with torch.no_grad():
    for i, (imgs, lbs) in enumerate(val_loader):
        imgs = imgs.float().to(device)
        lbs = lbs.to(device)
        outputs = convNet(imgs)
        # Softmax is monotonic, so the argmax of the logits is the same class.
        predict = torch.argmax(outputs, dim=1)
        correct = (lbs == predict).sum().item()
        print(f"i = {i}, \n lables = {lbs}, \n predict = {predict}  \n accuracy = {correct / lbs.size(0)}")

i = 0, 
 lables = tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3,
        4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7,
        8, 9], device='cuda:0'), 
 predict = tensor([0, 1, 2, 3, 4, 5, 6, 7, 5, 9, 0, 1, 2, 3, 4, 5, 6, 6, 5, 9, 0, 1, 2, 3,
        4, 5, 6, 7, 5, 9, 0, 1, 2, 0, 4, 5, 9, 7, 8, 9, 0, 6, 8, 3, 4, 5, 0, 7,
        5, 9], device='cuda:0')  
 accuracy = 0.8
i = 1, 
 lables = tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3,
        4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7,
        8, 9], device='cuda:0'), 
 predict = tensor([0, 0, 2, 3, 4, 4, 5, 7, 8, 9, 0, 1, 5, 3, 4, 5, 6, 7, 6, 9, 0, 1, 2, 3,
        4, 5, 9, 7, 8, 9, 0, 6, 4, 3, 4, 5, 6, 7, 9, 9, 0, 1, 5, 3, 4, 5, 6, 7,
        5, 9], device='cuda:0')  
 accuracy = 0.78


In [15]:
from STFT import STFT
# All test recordings, one directory level below the test root. glob returns
# matches in arbitrary order, so sort them to keep the order of the recorded
# file names and predictions reproducible between runs.
audio_test_files = sorted(glob.glob('./dataset/task2/test/*/*.pkl'))
class TestSet(data.Dataset):
    """Dataset yielding ``(STFT map, file path)`` pairs for unlabeled test files.

    Args:
        files: optional sequence of ``.pkl`` paths to serve. Defaults to the
            module-level ``audio_test_files`` list, which is the original
            behaviour.
    """

    def __init__(self, files=None):
        # `is None` (not truthiness) so an explicitly empty list is honoured.
        self.files = audio_test_files if files is None else list(files)
        self.length = len(self.files)

    def __getitem__(self, index):
        """Return ``(audio_map, audio_file)`` for the file at ``index``."""
        audio_file = self.files[index]
        # STFT is a project helper; presumably it loads the pickle and returns
        # a spectrogram-like array sized by freq_length/time_length -- TODO confirm.
        audio_map = STFT(audio_file, is_plot, freq_length, time_length)
        return audio_map, audio_file

    def __len__(self):
        """Return the number of files served by this dataset."""
        return self.length

# Batched iterator over the test files, in list order (no shuffling).
test_loader = data.DataLoader(
    dataset=TestSet(),
    batch_size=batch_size,
    shuffle=False,
)

In [16]:
# Predict a class index for every test file, collecting paths and predictions
# in matching order.
convNet.eval()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')
convNet = convNet.to(device)
predict_all = []
files_all = []

# No gradients are needed for inference; skipping autograd bookkeeping saves
# memory and time.
with torch.no_grad():
    for i, (imgs, files) in enumerate(test_loader):
        imgs = imgs.float().to(device)
        outputs = convNet(imgs)
        # Softmax is monotonic, so the argmax of the logits is the same class.
        predict = torch.argmax(outputs, dim=1)
        print(f"i = {i},  \n    predict = {predict}")
        predict_all += predict.tolist()
        files_all += list(files)
print(files_all)
print(predict_all)

i = 0,  
    predict = tensor([6, 7, 1, 5, 0, 1, 3, 5, 9, 7, 9, 9, 1, 3, 2, 3, 1, 0, 9, 5, 9, 0, 2, 3,
        8, 8, 6, 8, 6, 2, 1, 4, 1, 7, 6, 3, 0, 8, 2, 9, 4, 3, 0, 0, 3, 9, 5, 8,
        1, 2, 7, 3, 9, 9, 1, 3, 9, 5, 6, 7, 1, 0, 0, 6], device='cuda:0')
i = 1,  
    predict = tensor([4, 4, 1, 3, 2, 8, 8, 6, 4, 4, 9, 5, 9, 9, 4, 2, 5, 7, 8, 6, 3, 9, 0, 1,
        9, 7, 9, 3, 3, 2, 2, 5, 2, 6, 0, 0, 7, 9, 4, 1, 5, 5, 3, 9, 7, 5, 1, 5,
        0, 6, 7, 1, 8, 9, 8, 6, 7, 2, 5, 0, 7, 3, 9, 8], device='cuda:0')
i = 2,  
    predict = tensor([4, 5, 9, 4, 1, 0, 9, 5, 3, 6, 6, 9, 0, 9, 6, 8, 0, 2, 3, 6, 9, 5, 3, 9,
        9, 8, 8, 6, 9, 0, 3, 7, 7, 7, 6, 2, 7, 2, 0, 0, 0, 8, 1, 6, 2, 0, 0, 9,
        3, 1, 6, 3, 5, 3, 4, 4, 5, 7, 7, 3, 3, 3, 1, 5], device='cuda:0')
i = 3,  
    predict = tensor([2, 0, 9, 7, 9, 3, 2, 0, 4, 8, 7, 0, 1, 3, 1, 2, 9, 3, 2, 1, 6, 1, 0, 2,
        4, 2, 8, 7, 5, 7, 5, 8, 5, 2, 3, 6, 6, 6, 7, 6, 9, 7, 2, 5, 1, 3, 9, 7,
        5, 3, 7, 7, 2, 5, 9, 9, 1, 9, 0, 4, 7,