In [1]:
import os
import sys
sys.path.append(r'utils')
import numpy as np
import soundfile
import librosa
from scipy import signal

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim

from utilities import (read_audio, repeat_seq, scale)
import config

from models_pytorch import move_data_to_gpu, Vggish
from data_generator import DataGenerator

In [2]:
# Network class used for inference: the Vggish class imported from models_pytorch.
Model = Vggish

# batch_size is not referenced by any other cell visible in this notebook.
batch_size = 64
# Target sequence length handed to repeat_seq() before the model is called.
time_steps = 128

# Feature-extraction settings, all taken from the project's config module.
sample_rate = config.sample_rate
window_size = config.window_size
overlap = config.overlap
mel_bins = config.mel_bins

## 1.提取音频特征

### 参数：音频文件路径，截取位置（格式为 "起始秒,结束秒" 的字符串，例如 "74.1,76.1"）

In [3]:
def calculate_logmel(audio_path, sample_rate, extractor, audio_cuts):
    """Load an audio file, optionally crop it, peak-normalise it and extract features.

    Args:
        audio_path: Path of the audio file, passed to ``read_audio``.
        sample_rate: Target sampling rate passed to ``read_audio`` as
            ``target_fs``; also used to convert cut times to sample indices.
        extractor: Object exposing ``transform(audio)``.
        audio_cuts: ``"start,end"`` string with both times in seconds, or
            ``None`` to keep the whole recording.

    Returns:
        Whatever ``extractor.transform`` returns for the prepared waveform.

    Raises:
        ValueError: If the (cropped) waveform contains no samples, or if
            ``audio_cuts`` is not of the form ``"start,end"``.
    """
    (audio, _) = read_audio(audio_path, target_fs=sample_rate)

    # Accept None so this cell behaves like the later copy of this function.
    if audio_cuts is not None:
        cut1, cut2 = audio_cuts.split(',')
        start = int(float(cut1) * sample_rate)
        stop = int(float(cut2) * sample_rate)
        audio = audio[start:stop]

    # An empty slice would otherwise fail inside np.max with an opaque message.
    if np.size(audio) == 0:
        raise ValueError(
            "No audio samples left for %r with cuts %r" % (audio_path, audio_cuts))

    # Peak-normalise; skip the division for an all-zero clip, which would
    # otherwise turn every sample (and every feature) into NaN.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak

    return extractor.transform(audio)

class LogMelExtractor():
    """Compute log-mel features from a waveform.

    The magnitude spectrogram is computed with a Hamming window, projected
    onto a mel filter bank and log-compressed.
    """

    def __init__(self, sample_rate, window_size, overlap, mel_bins):
        """Pre-compute the analysis window and the mel filter bank.

        Args:
            sample_rate: Sampling rate of the audio that will be transformed.
            window_size: STFT segment length in samples (also used as n_fft).
            overlap: Number of samples shared by consecutive segments.
            mel_bins: Number of mel bands.
        """
        self.window_size = window_size
        self.overlap = overlap
        self.ham_win = np.hamming(window_size)

        # Transposed so that (frames x freq_bins) @ melW gives (frames x mel_bins).
        self.melW = librosa.filters.mel(sr=sample_rate,
                                        n_fft=window_size,
                                        n_mels=mel_bins,
                                        fmin=50.,
                                        fmax=sample_rate // 2).T

    def transform(self, audio):
        """Return the log-mel spectrogram of ``audio`` as a float32 array.

        Args:
            audio: Waveform samples accepted by ``scipy.signal.spectrogram``.

        Returns:
            float32 array with one row per STFT frame and one column per
            mel band.
        """
        # Use the public scipy.signal.spectrogram entry point; the
        # scipy.signal.spectral sub-namespace is private and deprecated.
        _, _, magnitude = signal.spectrogram(
            audio,
            window=self.ham_win,
            nperseg=self.window_size,
            noverlap=self.overlap,
            detrend=False,
            return_onesided=True,
            mode='magnitude')

        # spectrogram() puts frequency on the first axis; make time the first axis.
        mel_energy = np.dot(magnitude.T, self.melW)

        # The small offset keeps log() finite for silent frames.
        return np.log(mel_energy + 1e-8).astype(np.float32)
            
# Build the extractor from the config-derived settings defined in the cell above.
extractor = LogMelExtractor(sample_rate=sample_rate,
                            window_size=window_size,
                            overlap=overlap,
                            mel_bins=mel_bins)

# Demo: extract features for the 74.1 s - 76.1 s excerpt of one recording.
# NOTE(review): hard-coded absolute path on the author's machine; point this at a
# configurable data directory before running elsewhere.
feature = calculate_logmel("E:/毕设/乐器音频/三弦T0289/T0289A2-1c2-1.wav", sample_rate, extractor, "74.1,76.1")

# Expected to print a (frames, mel_bins) shape.
print(feature.shape)

(85, 64)


## 2.特征识别

In [4]:
# Normalisation statistics stored next to the notebook.
mean_ = np.load('mean_.npy')
std_  = np.load('std_.npy')

# Prepare the model input: scale() and repeat_seq() are project helpers
# (presumably standardisation and padding/tiling to `time_steps` frames; confirm
# in utilities), then a leading batch axis of size 1 is added.
audio_input = repeat_seq(scale(feature, mean_, std_), time_steps)
audio_input = audio_input[np.newaxis]

# One output per instrument label.
num_classes = len(config.labels)
model = Model(num_classes)

# Deserialise onto the CPU first so the checkpoint loads regardless of which
# device it was saved from; the weights are moved to the GPU by model.cuda().
checkpoint = torch.load("md_3000_iters.tar", map_location="cpu")
model.load_state_dict(checkpoint['state_dict'])

# NOTE(review): a CUDA-capable environment is required from here on.
model.cuda()
model.eval()

x_ = move_data_to_gpu(audio_input, 1)
# Inference only: do not record an autograd graph.
with torch.no_grad():
    y_ = model(x_)
y_ = y_.detach().cpu().numpy()

# Ascending sort of class scores; the best classes are at the end of each row.
res = np.argsort(y_)
# Top-1 prediction
print(config.labels[np.argmax(y_)])
# Top-3 predictions, best first (fewer if there are fewer than three classes)
print(*[config.labels[idx] for idx in res[0][::-1][:3]])

三弦T0289
三弦T0289 澜沧小三弦T0300 二胡L0266


# 3.总函数

### （1）直接调用get_audio_result函数，参数为音频文件路径和音频切割点（"起始秒,结束秒" 字符串；不传则使用整段音频）
### （2）最好放在一个.py文件中，直接新建拷贝即可
### （3）在服务器上运行时，要注意识别时间和运行环境
### （4）最主要的环境是torch==1.1.0  librosa==0.6.3

In [6]:
import os
import sys
sys.path.append(r'utils')
import numpy as np
import soundfile
import librosa
from scipy import signal

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim

from utilities import (read_audio, repeat_seq, scale)
import config

from models_pytorch import move_data_to_gpu, Vggish
from data_generator import DataGenerator

# Network class used for inference: the Vggish class imported from models_pytorch.
Model = Vggish

# batch_size is not referenced by any other code visible in this cell.
batch_size = 64
# Target sequence length handed to repeat_seq() before the model is called.
time_steps = 128

# Feature-extraction settings, all taken from the project's config module.
sample_rate = config.sample_rate
window_size = config.window_size
overlap = config.overlap
mel_bins = config.mel_bins

# Number of instrument classes: one model output per label.
num_classes = len(config.labels)
model = Model(num_classes)

# Load the trained weights. Deserialise onto the CPU first so the checkpoint
# loads regardless of which device it was saved from; model.cuda() below
# moves the weights to the GPU.
checkpoint = torch.load("md_3000_iters.tar", map_location="cpu")
model.load_state_dict(checkpoint['state_dict'])

# Mind the runtime environment: a CUDA-capable GPU is required here.
model.cuda()
model.eval()

def calculate_logmel(audio_path, sample_rate, extractor, audio_cuts):
    """Load an audio file, optionally crop it, peak-normalise it and extract features.

    Args:
        audio_path: Path of the audio file, passed to ``read_audio``.
        sample_rate: Target sampling rate passed to ``read_audio`` as
            ``target_fs``; also used to convert cut times to sample indices.
        extractor: Object exposing ``transform(audio)``.
        audio_cuts: ``"start,end"`` string with both times in seconds, or
            ``None`` to keep the whole recording.

    Returns:
        Whatever ``extractor.transform`` returns for the prepared waveform.

    Raises:
        ValueError: If the (cropped) waveform contains no samples, or if
            ``audio_cuts`` is not of the form ``"start,end"``.
    """
    (audio, _) = read_audio(audio_path, target_fs=sample_rate)

    if audio_cuts is not None:
        cut1, cut2 = audio_cuts.split(',')
        start = int(float(cut1) * sample_rate)
        stop = int(float(cut2) * sample_rate)
        audio = audio[start:stop]

    # An empty slice would otherwise fail inside np.max with an opaque message.
    if np.size(audio) == 0:
        raise ValueError(
            "No audio samples left for %r with cuts %r" % (audio_path, audio_cuts))

    # Peak-normalise; skip the division for an all-zero clip, which would
    # otherwise turn every sample (and every feature) into NaN.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak

    return extractor.transform(audio)

class LogMelExtractor():
    """Compute log-mel features from a waveform.

    The magnitude spectrogram is computed with a Hamming window, projected
    onto a mel filter bank and log-compressed.
    """

    def __init__(self, sample_rate, window_size, overlap, mel_bins):
        """Pre-compute the analysis window and the mel filter bank.

        Args:
            sample_rate: Sampling rate of the audio that will be transformed.
            window_size: STFT segment length in samples (also used as n_fft).
            overlap: Number of samples shared by consecutive segments.
            mel_bins: Number of mel bands.
        """
        self.window_size = window_size
        self.overlap = overlap
        self.ham_win = np.hamming(window_size)

        # Transposed so that (frames x freq_bins) @ melW gives (frames x mel_bins).
        self.melW = librosa.filters.mel(sr=sample_rate,
                                        n_fft=window_size,
                                        n_mels=mel_bins,
                                        fmin=50.,
                                        fmax=sample_rate // 2).T

    def transform(self, audio):
        """Return the log-mel spectrogram of ``audio`` as a float32 array.

        Args:
            audio: Waveform samples accepted by ``scipy.signal.spectrogram``.

        Returns:
            float32 array with one row per STFT frame and one column per
            mel band.
        """
        # Use the public scipy.signal.spectrogram entry point; the
        # scipy.signal.spectral sub-namespace is private and deprecated.
        _, _, magnitude = signal.spectrogram(
            audio,
            window=self.ham_win,
            nperseg=self.window_size,
            noverlap=self.overlap,
            detrend=False,
            return_onesided=True,
            mode='magnitude')

        # spectrogram() puts frequency on the first axis; make time the first axis.
        mel_energy = np.dot(magnitude.T, self.melW)

        # The small offset keeps log() finite for silent frames.
        return np.log(mel_energy + 1e-8).astype(np.float32)
            
# Module-level extractor shared by every get_audio_result() call, built from the
# config-derived settings defined above.
extractor = LogMelExtractor(sample_rate=sample_rate,
                            window_size=window_size,
                            overlap=overlap,
                            mel_bins=mel_bins)


def get_audio_result(name, cut = None):
    """Classify the instrument heard in an audio file and return its label.

    Prints the top-1 label and then the top-3 labels (best first) before
    returning.

    Args:
        name: Path of the audio file to classify.
        cut: Optional ``"start,end"`` string in seconds selecting the excerpt
            to classify; ``None`` uses the whole recording.

    Returns:
        The entry of ``config.labels`` with the highest model score. Adjust
        the return value to your needs.
    """
    # Feature extraction
    feature = calculate_logmel(name, sample_rate, extractor, cut)

    # Normalisation statistics stored next to the notebook
    mean_ = np.load('mean_.npy')
    std_  = np.load('std_.npy')

    # Prepare the model input: scale() and repeat_seq() are project helpers
    # (presumably standardisation and padding/tiling to `time_steps` frames;
    # confirm in utilities), then a leading batch axis of size 1 is added.
    audio_input = repeat_seq(scale(feature, mean_, std_), time_steps)
    audio_input = audio_input[np.newaxis]

    x_ = move_data_to_gpu(audio_input, 1)
    # Inference only: do not record an autograd graph.
    with torch.no_grad():
        y_ = model(x_)
    y_ = y_.detach().cpu().numpy()

    # Class indices of the single batch item, ordered best first.
    ranked = np.argsort(y_)[0][::-1]
    best_label = config.labels[np.argmax(y_)]

    # Top-1 prediction
    print(best_label)
    # Top-3 predictions (fewer if there are fewer than three classes)
    print(*[config.labels[idx] for idx in ranked[:3]])

    return best_label

# Smoke-test the pipeline on a few recordings: (wav path, "start,end" in seconds).
demo_clips = [
    ("E:/毕设/乐器音频/A调曲笛C0280/C0280A2-1c3-2.wav", "33.81,35.53"),
    ("E:/毕设/乐器音频/三弦T0289/T0289A2-1c2-1.wav", "74.1,76.1"),
    ("E:/毕设/乐器音频/G调梆笛C0237/C0237A2-1c1-1.wav", "6.69,7.38"),
    ("E:/毕设/乐器音频/中音加键唢呐C0259/C0259A2-1c2-2.wav", "6.28,7.58"),
    ("E:/毕设/乐器音频/中音笙C0263/C0263A2-1c2-1.wav", "42.91,44.91"),
]

for clip_no, (wav_path, cut_span) in enumerate(demo_clips):
    if clip_no:
        # Separator between consecutive results; none after the last one.
        print("\n")
    print(get_audio_result(wav_path, cut_span))

A调曲笛C0280
A调曲笛C0280 雷琴L0256 箫C0282
A调曲笛C0280


三弦T0289
三弦T0289 澜沧小三弦T0300 二胡L0266
三弦T0289


G调梆笛C0237
G调梆笛C0237 椰胡L0288 雷琴L0256
G调梆笛C0237


中音加键唢呐C0259
中音加键唢呐C0259 低音加键唢呐C0257 洞巴C0304
中音加键唢呐C0259


中音笙C0263
中音笙C0263 低音笙C0264 高音键笙C0243
中音笙C0263


# 4.所有乐器种类

In [6]:
# Display every instrument label; the model has one output per entry in this list.
config.labels

['A调曲笛C0280',
 'G调新笛C0281',
 'G调梆笛C0237',
 '三弦T0289',
 '中国大鼓D0248',
 '中胡L0297',
 '中虎D0273',
 '中阮T0260',
 '中音加键唢呐C0259',
 '中音板胡L0240',
 '中音笙C0263',
 '二弦L0285',
 '二胡L0266',
 '云锣D0279',
 '五音排鼓D0252',
 '传统笙C0244',
 '低音加键唢呐C0257',
 '低音笙C0264',
 '侗笛C0316',
 '俄比C0311',
 '傈僳族奇奔T0302',
 '六角高胡L0292',
 '北梆子D0246',
 '南梆子D0245',
 '南音三弦T0295',
 '南音二弦L0293',
 '南音琵琶T0294',
 '古筝T0255',
 '吐良C0308',
 '唢呐C0296',
 '埙C0283',
 '大镲D0290',
 '小堂鼓D0250',
 '小锣D0275',
 '小镲D0271',
 '小闷笛C0303',
 '巴乌C0310',
 '德C0305',
 '扁八角高胡L0291',
 '扁鼓D0251',
 '扬琴T0267',
 '抄锣D0272',
 '拉祜族葫芦笙C0306',
 '斗锣D0284',
 '曲锣D0286',
 '木鱼D0277',
 '板鼓D0278',
 '柳琴T0261',
 '椰胡L0288',
 '武锣D0274',
 '洞巴C0304',
 '深波D0287',
 '澜沧小三弦T0300',
 '牛角琴L0312',
 '牛铃D0299',
 '独弦琴T0317',
 '独弦胡L0313',
 '玎T0301',
 '琵琶T0262',
 '碰铃D0247',
 '竹排琴D0315',
 '箜篌T0254',
 '管子C0265',
 '箫C0282',
 '编磬D0242',
 '编钟D0241',
 '编铓D0298',
 '芦笙C0307',
 '花盆鼓D0249',
 '草帽镲D0268',
 '葫芦丝C0309',
 '葫芦琴L0314',
 '铙D0269',
 '铙钹D0270',
 '雷琴L0256',
 '马锣D0276',
 '高音板胡L0239',
 '高音键笙C0243']