In [1]:
import os
import numpy as np
import soundfile
import librosa
import h5py
import pandas as pd
from scipy import signal
import matplotlib.pyplot as plt

In [8]:
from PIL import Image
from sklearn.model_selection import train_test_split

In [6]:
import torch
from torch.utils.data import Dataset,DataLoader,TensorDataset
from torch.autograd import Variable

In [4]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim

In [5]:
def init_layer(layer):
    """Initialize a Linear or Convolutional layer. 
    Ref: He, Kaiming, et al. "Delving deep into rectifiers: Surpassing 
    human-level performance on imagenet classification." Proceedings of the 
    IEEE international conference on computer vision. 2015.
    """
    
    if layer.weight.ndimension() == 4:
        (n_out, n_in, height, width) = layer.weight.size()
        n = n_in * height * width
        
    elif layer.weight.ndimension() == 2:
        (n_out, n) = layer.weight.size()

    std = math.sqrt(2. / n)
    scale = std * math.sqrt(3.)
    layer.weight.data.uniform_(-scale, scale)

    if layer.bias is not None:
        layer.bias.data.fill_(0.)


def init_bn(bn):
    """Initialize a Batchnorm layer. """
    
    bn.bias.data.fill_(0.)
    bn.weight.data.fill_(1.)    

class VggishConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        
        super(VggishConvBlock, self).__init__()
        
        self.conv1 = nn.Conv2d(in_channels=in_channels, 
                              out_channels=out_channels,
                              kernel_size=(3, 3), stride=(1, 1),
                              padding=(1, 1), bias=False)
                              
        self.conv2 = nn.Conv2d(in_channels=out_channels, 
                              out_channels=out_channels,
                              kernel_size=(3, 3), stride=(1, 1),
                              padding=(1, 1), bias=False)
                              
        self.bn1 = nn.BatchNorm2d(out_channels)
        
        self.bn2 = nn.BatchNorm2d(out_channels)
        
        self.init_weights()
        
    def init_weights(self):
        
        init_layer(self.conv1)
        init_layer(self.conv2)
        init_bn(self.bn1)
        init_bn(self.bn2)
        
    def forward(self, input):
        
        x = input
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        
        return x
    
    
class Vggish(nn.Module):
    def __init__(self, classes_num):
        
        super(Vggish, self).__init__()

        self.conv_block1 = VggishConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = VggishConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = VggishConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = VggishConvBlock(in_channels=256, out_channels=512)

        self.fc_final = nn.Linear(512, classes_num, bias=True)

        self.init_weights()

    def init_weights(self):

        init_layer(self.fc_final)

    def forward(self, input, return_bottleneck=False):
        (_, seq_len, mel_bins) = input.shape

        x = input.view(-1, 1, seq_len, mel_bins)
        '''(samples_num, feature_maps, time_steps, freq_num)'''

        x = self.conv_block1(x)
        x = F.max_pool2d(x, kernel_size=(2, 2), stride=(2, 2))
        x = self.conv_block2(x)
        x = F.max_pool2d(x, kernel_size=(2, 2), stride=(2, 2))
        x = self.conv_block3(x)
        x = F.max_pool2d(x, kernel_size=(2, 2), stride=(2, 2))
        x = self.conv_block4(x)
        x = F.max_pool2d(x, kernel_size=(2, 2), stride=(2, 2))

        x = F.max_pool2d(x, kernel_size=x.shape[2:])
        x = x.view(x.shape[0:2])

        x = F.log_softmax(self.fc_final(x), dim=-1)

        return x

In [22]:
from tensorboardX import SummaryWriter
def AudiotTrain(name):
    writer = SummaryWriter()
    print("********************************************************")
    print("待训练乐器:"+name)
    df = pd.DataFrame(pd.read_excel(name+'.xlsx'))
    audioname = []
    audiofeature = []
    num = 0
    audioset = {}
    n = df.shape[0]
    for i in range(n):
        if i is not 0:
            if df.iloc[i,2] is not df.iloc[i-1,2]:
                num += 1
        audioname.append(num)
        audioset[str(df.iloc[i,2])] = num

    audiofeature = np.load(name+'.npy',allow_pickle=True)
    
    features  = []
    for fea in audiofeature:
        new_img = Image.fromarray(fea.astype(np.float32))
        new_img = new_img.resize((128,64))
        fea = np.array(new_img.getdata(),dtype=np.float32).reshape(128,64)
        features.append(fea)
    
#     print(audioname[-1])
    X = np.array(features)
    y = np.array(audioname)
    x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=9)
    
    x_data = torch.from_numpy(x_train)
    y_data = torch.from_numpy(y_train)
    print("train------",x_data.size(),y_data.size())

    train_dataset = TensorDataset(x_data,y_data)
    train_loader = DataLoader(dataset=train_dataset,batch_size=32,shuffle=True,num_workers=2)

    x_ = torch.from_numpy(x_test)
    y_ = torch.from_numpy(y_test)
    print("test------",x_.size(),y_.size())

    test_dataset = TensorDataset(x_,y_)
    test_loader = DataLoader(dataset=test_dataset,batch_size=32,shuffle=True,num_workers=2)


    model = Vggish(audioname[-1]+1)
    model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), 
                               eps=1e-08, weight_decay=0.)

    for epoch in range(100):
        correct = torch.zeros(1).squeeze().cuda()
        total = torch.zeros(1).squeeze().cuda()
        for i,data in enumerate(train_loader):
            inputs,labels = data
            inputs = Variable(inputs).cuda()
            labels = Variable(labels.long()).cuda()

            model.train()
            output = model(inputs)
            loss = F.nll_loss(output,labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            prediction = torch.argmax(output, 1)
            correct += (prediction == labels).sum().float()
            total += len(labels)

        print("epoch:",epoch,"loss:",loss.data.item()) 
        print('Accuracy: %f'%((correct/total).cpu().detach().data.numpy()))
        writer.add_scalar('scalar/Train',(correct/total).cpu().detach().data.numpy(),epoch)
        writer.add_scalar('scalar/Loss',loss.data.item(),epoch)

        if epoch%2==0:
            test_correct = torch.zeros(1).squeeze().cuda()
            test_total = torch.zeros(1).squeeze().cuda()
            for i,data in enumerate(test_loader):
                inputs,labels = data
                inputs = inputs.cuda()
                labels = labels.long().cuda()
                model.eval()
                output = model(inputs)
                test_prediction = torch.argmax(output, 1)
                test_correct += (test_prediction == labels).sum().float()
                test_total += len(labels)
            print('Test Accuracy: %f'%((test_correct/test_total).cpu().detach().data.numpy()))
            writer.add_scalar('scalar/Test',(test_correct/test_total).cpu().detach().data.numpy(),epoch)
            print('---------------------------------------')

In [23]:
AudiotTrain('吹奏类')
AudiotTrain('弹拨类')
AudiotTrain('拉弦类')
AudiotTrain('敲击类')

********************************************************
待训练乐器:吹奏类
train------ torch.Size([1673, 128, 64]) torch.Size([1673])
test------ torch.Size([419, 128, 64]) torch.Size([419])
epoch: 0 loss: 1.954062819480896
Accuracy: 0.472803
Test Accuracy: 0.589499
---------------------------------------
epoch: 1 loss: 1.2599797248840332
Accuracy: 0.698745
epoch: 2 loss: 0.5806149244308472
Accuracy: 0.792588
Test Accuracy: 0.770883
---------------------------------------
epoch: 3 loss: 0.4160209894180298
Accuracy: 0.873282
epoch: 4 loss: 0.25341060757637024
Accuracy: 0.931859
Test Accuracy: 0.830549
---------------------------------------
epoch: 5 loss: 0.16231171786785126
Accuracy: 0.939032
epoch: 6 loss: 0.1214364618062973
Accuracy: 0.958159
Test Accuracy: 0.887828
---------------------------------------
epoch: 7 loss: 0.02586285211145878
Accuracy: 0.974895
epoch: 8 loss: 0.02784273400902748
Accuracy: 0.987448
Test Accuracy: 0.902148
---------------------------------------
epoch: 9 loss: 0.0

epoch: 91 loss: 0.0051502650603652
Accuracy: 1.000000
epoch: 92 loss: 0.0010260476265102625
Accuracy: 1.000000
Test Accuracy: 0.945107
---------------------------------------
epoch: 93 loss: 0.0010974672622978687
Accuracy: 1.000000
epoch: 94 loss: 0.00013955433678347617
Accuracy: 1.000000
Test Accuracy: 0.945107
---------------------------------------
epoch: 95 loss: 0.0003258387150708586
Accuracy: 1.000000
epoch: 96 loss: 0.0006194644374772906
Accuracy: 1.000000
Test Accuracy: 0.945107
---------------------------------------
epoch: 97 loss: 0.0025151569861918688
Accuracy: 1.000000
epoch: 98 loss: 0.00010734134411904961
Accuracy: 1.000000
Test Accuracy: 0.945107
---------------------------------------
epoch: 99 loss: 0.00015841590357013047
Accuracy: 1.000000
********************************************************
待训练乐器:弹拨类
train------ torch.Size([992, 128, 64]) torch.Size([992])
test------ torch.Size([248, 128, 64]) torch.Size([248])
epoch: 0 loss: 1.789534568786621
Accuracy: 0.369960

Test Accuracy: 0.822581
---------------------------------------
epoch: 83 loss: 7.355213165283203e-05
Accuracy: 1.000000
epoch: 84 loss: 0.00011113286018371582
Accuracy: 1.000000
Test Accuracy: 0.822581
---------------------------------------
epoch: 85 loss: 8.603930473327637e-05
Accuracy: 1.000000
epoch: 86 loss: 8.693337440490723e-05
Accuracy: 1.000000
Test Accuracy: 0.818548
---------------------------------------
epoch: 87 loss: 6.219744682312012e-05
Accuracy: 1.000000
epoch: 88 loss: 0.00010603666305541992
Accuracy: 1.000000
Test Accuracy: 0.822581
---------------------------------------
epoch: 89 loss: 6.645917892456055e-05
Accuracy: 1.000000
epoch: 90 loss: 0.0001278519630432129
Accuracy: 1.000000
Test Accuracy: 0.822581
---------------------------------------
epoch: 91 loss: 6.976723670959473e-05
Accuracy: 1.000000
epoch: 92 loss: 6.607174873352051e-05
Accuracy: 1.000000
Test Accuracy: 0.822581
---------------------------------------
epoch: 93 loss: 9.000301361083984e-05
Accura

epoch: 73 loss: 0.00018262863159179688
Accuracy: 1.000000
epoch: 74 loss: 0.00012822584540117532
Accuracy: 1.000000
Test Accuracy: 0.878641
---------------------------------------
epoch: 75 loss: 0.00010932575241895393
Accuracy: 1.000000
epoch: 76 loss: 0.000125885009765625
Accuracy: 1.000000
Test Accuracy: 0.883495
---------------------------------------
epoch: 77 loss: 6.476315320469439e-05
Accuracy: 1.000000
epoch: 78 loss: 6.996501906542107e-05
Accuracy: 1.000000
Test Accuracy: 0.883495
---------------------------------------
epoch: 79 loss: 0.0003718896477948874
Accuracy: 1.000000
epoch: 80 loss: 7.399645983241498e-05
Accuracy: 1.000000
Test Accuracy: 0.883495
---------------------------------------
epoch: 81 loss: 7.607720181113109e-05
Accuracy: 1.000000
epoch: 82 loss: 0.0001012628708849661
Accuracy: 1.000000
Test Accuracy: 0.883495
---------------------------------------
epoch: 83 loss: 0.00014894659398123622
Accuracy: 1.000000
epoch: 84 loss: 7.208911119960248e-05
Accuracy: 1.

Test Accuracy: 0.935484
---------------------------------------
epoch: 65 loss: 0.0022593140602111816
Accuracy: 0.997984
epoch: 66 loss: 0.00328981876373291
Accuracy: 0.997984
Test Accuracy: 0.919355
---------------------------------------
epoch: 67 loss: 0.0006906390190124512
Accuracy: 0.995968
epoch: 68 loss: 0.0010520219802856445
Accuracy: 1.000000
Test Accuracy: 0.959677
---------------------------------------
epoch: 69 loss: 0.002057373523712158
Accuracy: 1.000000
epoch: 70 loss: 0.00016576051712036133
Accuracy: 1.000000
Test Accuracy: 0.959677
---------------------------------------
epoch: 71 loss: 0.003967165946960449
Accuracy: 1.000000
epoch: 72 loss: 0.0004730224609375
Accuracy: 1.000000
Test Accuracy: 0.959677
---------------------------------------
epoch: 73 loss: 0.0009033083915710449
Accuracy: 1.000000
epoch: 74 loss: 0.0001952052116394043
Accuracy: 1.000000
Test Accuracy: 0.959677
---------------------------------------
epoch: 75 loss: 0.000291287899017334
Accuracy: 1.000

In [19]:
AudiotTrain('吹奏类')
AudiotTrain('弹拨类')
AudiotTrain('拉弦类')
AudiotTrain('敲击类')

********************************************************
待训练乐器:吹奏类
train------ torch.Size([1673, 128, 64]) torch.Size([1673])
test------ torch.Size([419, 128, 64]) torch.Size([419])
epoch: 0 loss: 2.184091329574585
Accuracy: 0.446503
Test Accuracy: 0.591885
---------------------------------------
epoch: 1 loss: 1.3778656721115112
Accuracy: 0.711297
epoch: 2 loss: 1.7568081617355347
Accuracy: 0.806336
Test Accuracy: 0.727924
---------------------------------------
epoch: 3 loss: 0.5414739847183228
Accuracy: 0.867304
epoch: 4 loss: 0.2738460302352905
Accuracy: 0.896593
Test Accuracy: 0.844869
---------------------------------------
epoch: 5 loss: 0.08741193264722824
Accuracy: 0.936043
epoch: 6 loss: 0.3705540895462036
Accuracy: 0.966527
Test Accuracy: 0.871122
---------------------------------------
epoch: 7 loss: 0.13366205990314484
Accuracy: 0.965929
epoch: 8 loss: 0.05641788989305496
Accuracy: 0.969516
Test Accuracy: 0.887828
---------------------------------------
epoch: 9 loss: 0.01

epoch: 40 loss: 0.00022861361503601074
Accuracy: 1.000000
Test Accuracy: 0.826613
---------------------------------------
epoch: 41 loss: 0.0016625523567199707
Accuracy: 1.000000
epoch: 42 loss: 0.00042057037353515625
Accuracy: 1.000000
Test Accuracy: 0.830645
---------------------------------------
epoch: 43 loss: 0.002305924892425537
Accuracy: 1.000000
epoch: 44 loss: 0.00033739209175109863
Accuracy: 1.000000
Test Accuracy: 0.826613
---------------------------------------
epoch: 45 loss: 0.00022903084754943848
Accuracy: 1.000000
epoch: 46 loss: 0.00043192505836486816
Accuracy: 1.000000
Test Accuracy: 0.834677
---------------------------------------
epoch: 47 loss: 0.00039499998092651367
Accuracy: 1.000000
epoch: 48 loss: 0.0002480447292327881
Accuracy: 1.000000
Test Accuracy: 0.838710
---------------------------------------
epoch: 49 loss: 0.00026488304138183594
Accuracy: 1.000000
********************************************************
待训练乐器:拉弦类
train------ torch.Size([822, 128, 64]

epoch: 30 loss: 0.00022554397583007812
Accuracy: 1.000000
Test Accuracy: 0.959677
---------------------------------------
epoch: 31 loss: 0.0005393028259277344
Accuracy: 1.000000
epoch: 32 loss: 0.00011974573135375977
Accuracy: 1.000000
Test Accuracy: 0.959677
---------------------------------------
epoch: 33 loss: 0.003724813461303711
Accuracy: 1.000000
epoch: 34 loss: 0.0004897117614746094
Accuracy: 1.000000
Test Accuracy: 0.967742
---------------------------------------
epoch: 35 loss: 0.0021930336952209473
Accuracy: 1.000000
epoch: 36 loss: 0.0010750293731689453
Accuracy: 1.000000
Test Accuracy: 0.959677
---------------------------------------
epoch: 37 loss: 0.0006386041641235352
Accuracy: 1.000000
epoch: 38 loss: 0.00044852495193481445
Accuracy: 1.000000
Test Accuracy: 0.967742
---------------------------------------
epoch: 39 loss: 0.00046133995056152344
Accuracy: 1.000000
epoch: 40 loss: 0.0005074143409729004
Accuracy: 1.000000
Test Accuracy: 0.967742
--------------------------