In [2]:
import h5py
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import librosa.display as display
import librosa
import IPython.display as ipd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from torch.utils.data import random_split, DataLoader, Dataset
import torchaudio.transforms as T
import torchvision
from tqdm.notebook import tqdm
import random

In [4]:
# Open one BirdVox-70k recording unit read-only. The handle `f` is never
# closed in this cell; `waveforms` is indexed again by later cells, so the file
# has to stay open for them.
# NOTE(review): hard-coded absolute local path — point it at your own copy.
f = h5py.File('D:/liuchaochao/桌面/大三下/bird/BirdVox-70k_unit01.hdf5', 'r')
waveforms = f['waveforms']
# Print every clip key stored in the `waveforms` group.
for i in waveforms.keys():
    print(i)

unit01_000023161_00000_0
unit01_000226742_00000_0
unit01_000442514_00000_0
unit01_000466895_00000_0
unit01_000548571_00000_0
unit01_000816761_00000_0
unit01_001483580_00000_0
unit01_001499428_00000_0
unit01_002972038_00000_0
unit01_003190247_00000_0
unit01_003463314_00000_0
unit01_006846171_00000_0
unit01_007205790_00000_0
unit01_007292342_00000_0
unit01_007403276_00000_0
unit01_007445942_00000_0
unit01_007488609_00000_0
unit01_007508114_00000_0
unit01_007622704_00000_0
unit01_007710476_00000_0
unit01_007867733_00000_0
unit01_007928685_00000_0
unit01_009459809_00000_0
unit01_009965714_00000_0
unit01_010053485_00000_0
unit01_010075428_00000_0
unit01_010443580_00000_0
unit01_010556952_00000_0
unit01_010654476_00000_0
unit01_010691927_04779_1
unit01_010758144_04727_1
unit01_010827580_00000_0
unit01_011240838_00000_0
unit01_011287161_00000_0
unit01_011662628_00000_0
unit01_011978361_00000_0
unit01_012008838_00000_0
unit01_012572038_00000_0
unit01_013080380_00000_0
unit01_013276647_00000_0


In [5]:
# Pick one clip by key and show how many samples it holds.
# The dataset class below treats the key's last character as the label,
# so this `_1` key is a positive example.
sample = waveforms['unit01_929181552_08000_1']
sample.shape

(12000,)

In [None]:
# Copy the HDF5 dataset into an in-memory NumPy array and draw its waveform
# (time axis scaled with a 24000 Hz sample rate).
sample_array = np.array(sample)
display.waveshow(sample_array, sr=24000, )

In [None]:
# Side-by-side view: the whole clip (left) and its first 200 samples (right).
plt.figure(figsize=(15,3))
plt.subplot(1,2,1)
plt.plot(sample_array)
plt.title('Full Waveform')
plt.subplot(1,2,2)
plt.title('Waveform Snippet')
plt.plot(sample_array[:200])
# Uncomment to write the figure to disk:
# plt.savefig('waveform.jpg')

In [None]:
# Embed an audio player for the clip (playback rate 24000 Hz).
ipd.Audio(sample_array, rate=24000)

In [None]:
# Mel spectrogram of the sample clip, drawn in dB relative to its peak value.
# The audio is passed as `y=`: librosa >= 0.10 made this argument keyword-only,
# so the positional form raises a TypeError there (and warns on 0.9).
sample_spec = librosa.feature.melspectrogram(y=sample_array, sr=24000)
display.specshow(librosa.core.power_to_db(sample_spec, ref=np.max), sr=24000,
                 x_axis='ms', y_axis='mel')
plt.show()

In [None]:
# Inspect the spectrogram's array dimensions.
sample_spec.shape

# 准备工作
## 频谱图可视化和音频聆听样本

In [None]:
def show_samples(positive_labels=True):
    """Plot mel spectrograms of the first 8 clips carrying the requested label.

    Clips are read from the notebook-level `waveforms` HDF5 group in key order
    and drawn on a 2 x 4 grid. The figure is saved to 'spec.png' (overwriting
    any previous file of that name) and then shown.

    Args:
        positive_labels: if True, use clips whose key ends in '1';
            otherwise use clips whose key ends in '0'.
    """
    wanted = '1' if positive_labels else '0'
    sr = 24000
    shown = 0
    plt.figure(figsize=(28, 8), dpi=120)
    for key in waveforms.keys():
        if shown >= 8:
            break
        if key[-1] != wanted:
            continue
        clip = np.array(waveforms[key])
        # Keyword arguments are required: librosa >= 0.10 rejects the
        # positional `melspectrogram(clip, sr)` form with a TypeError.
        spec = librosa.feature.melspectrogram(y=clip, sr=sr)
        plt.subplot(2, 4, shown + 1)
        display.specshow(librosa.core.power_to_db(spec, ref=np.max),
                         sr=sr, x_axis='time', y_axis='mel')
        shown += 1
    plt.tight_layout()
    plt.savefig('spec.png')
    plt.show()

In [None]:
# Spectrograms of clips whose key ends in '1' (the default).
show_samples()

In [None]:
# Spectrograms of clips whose key ends in '0', for comparison.
show_samples(positive_labels=False)

In [None]:
def hear_samples(positive_labels=True):
    """Print the key of, and embed a player for, the first 3 matching clips.

    Clips come from the notebook-level `waveforms` HDF5 group, in key order.

    Args:
        positive_labels: if True, use clips whose key ends in '1';
            otherwise use clips whose key ends in '0'.
    """
    wanted = '1' if positive_labels else '0'
    played = 0
    for key in waveforms.keys():
        if played >= 3:
            break
        if key[-1] != wanted:
            continue
        clip = np.array(waveforms[key])
        print(key)
        ipd.display(ipd.Audio(clip, rate=24000))
        played += 1

In [None]:
# Listen to clips whose key ends in '1' (the default).
hear_samples()

In [None]:
# Listen to clips whose key ends in '0'.
hear_samples(False)

## 数据加载

In [None]:
# Compute the mel spectrogram with torchaudio instead of librosa, using the
# same MelSpectrogram settings the dataset class applies (24000 Hz, n_fft=2048,
# hop_length=512), and draw it in dB relative to its peak.
sample_tensor = torch.Tensor(sample_array)
sample_tensor_spec = T.MelSpectrogram(sample_rate=24000,
                                      n_fft=2048,
                                      hop_length=512)(sample_tensor)
display.specshow(librosa.core.power_to_db(sample_tensor_spec.numpy(),
                                          ref=np.max),
                 sr=24000, x_axis='ms', y_axis='mel')
plt.show()

In [None]:
# Redraw the librosa spectrogram (`sample_spec`) for visual comparison with
# the torchaudio one.
display.specshow(librosa.core.power_to_db(sample_spec, ref=np.max), sr=24000,
                 x_axis='ms', y_axis='mel')
plt.show()

In [None]:
# NOTE(review): this cell draws the same `sample_spec` plot again with
# identical arguments — candidate for removal.
display.specshow(librosa.core.power_to_db(sample_spec, ref=np.max), sr=24000,
                 x_axis='ms', y_axis='mel')
plt.show()

In [None]:
# Directory holding the HDF5 files, and the three recording units used for
# training (unit01 is loaded separately as the validation set).
# NOTE(review): hard-coded absolute path — adjust for your environment.
root_dir = '/notebooks/storage/'
fnames = ['BirdVox-70k_unit03.hdf5',
          'BirdVox-70k_unit07.hdf5',
          'BirdVox-70k_unit10.hdf5']

In [None]:
class BirdVox70kDS(Dataset):
    """Map-style dataset over the clips stored in BirdVox-70k HDF5 files.

    Indexing yields `(melspec, label)`: the clip's mel spectrogram with a
    leading channel axis, and the integer parsed from the last character of
    the clip's key.
    """

    def __init__(self, root_dir, fnames, transforms=None):
        """Index every clip found in the given HDF5 files.

        Only clip locations and labels are collected here; audio is read from
        disk on demand in `__getitem__`.

        Args:
            root_dir: directory containing the HDF5 files.
            fnames: iterable of HDF5 file names inside `root_dir`.
            transforms: optional zero-argument callable. It is invoked once
                per item and must return a function mapping a waveform array
                to a waveform array.
        """
        self.transforms = transforms

        locations = []
        label_chars = []
        for fname in fnames:
            path = os.path.join(root_dir, fname)
            with h5py.File(path, 'r') as h5:
                for key in h5['waveforms'].keys():
                    # remember (file, key) so the clip can be fetched later
                    locations.append([path, key])
                    # the key's final character is the label
                    label_chars.append(key[-1])

        # stored as NumPy string arrays
        self.wave_loc = np.array(locations)
        self.labels = np.array(label_chars)

        # counterpart of `librosa.feature.melspectrogram()`
        self.melspec = T.MelSpectrogram(sample_rate=24000,
                                        n_fft=2048,
                                        hop_length=512)

    def __len__(self):
        """Number of indexed clips."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Load clip `idx` and return `(melspec, label)`.

        The mel spectrogram gets an extra leading axis (needed by `Conv2d`);
        the label is returned as an `int`.
        """
        path, key = self.wave_loc[idx]
        with h5py.File(path, 'r') as h5:
            # copy into a NumPy array before the file is closed
            # (see https://github.com/pytorch/pytorch/issues/28761)
            samples = np.array(h5['waveforms'][key])
        if self.transforms:
            # `transforms` is a factory: call it to get this item's transform
            augment = self.transforms()
            samples = augment(samples)
        spec = self.melspec(torch.Tensor(samples)).unsqueeze(0)
        # labels are stored as one-character strings
        return spec, int(self.labels[idx])

In [None]:
# Training set: the units listed in `fnames`; validation set: unit01 only.
train_ds = BirdVox70kDS(root_dir, fnames)
val_ds = BirdVox70kDS(root_dir, ['BirdVox-70k_unit01.hdf5'])
# Fetch one arbitrary item as a smoke test of `__getitem__`.
x, y = train_ds[13945]
print(x.shape, y)  # check to see if it works

In [None]:
batch_size = 128
# Batch the two datasets. The cells below iterate `train_dl` and wrap both
# loaders for the GPU, but no cell defined them, so a fresh kernel failed with
# a NameError. Settings mirror the Mk2 loaders further down: only the training
# loader is shuffled.
train_dl = DataLoader(train_ds,
                      batch_size=batch_size,
                      shuffle=True,
                      pin_memory=True)
val_dl = DataLoader(val_ds,
                    batch_size=batch_size,
                    pin_memory=True)

In [None]:
%%time
# Pull a single batch to check its shape and labels; the cell magic above
# reports how long that takes.
for i, j in train_dl:
    print(i.shape, j)
    break

## 更多数据探索

In [None]:
# Percentage of training clips with a positive label.
# The denominator is the training set itself (`ds` was never defined).
pct_positive = sum(int(i) for i in train_ds.labels) / len(train_ds)
pct_positive * 100

### ^^ 50% 的音频样本中可以听到鸟鸣声

In [None]:
# Gather the clip keys of both datasets. This uses its own name on purpose:
# rebinding `fnames` here would clobber the list of HDF5 file names that later
# cells pass to `BirdVox70kDS`.
clip_names = [loc[1] for loc in train_ds.wave_loc]
clip_names.extend(loc[1] for loc in val_ds.wave_loc)

# For positive clips (key ends in '1'), parse the 5-digit field that precedes
# the label; the notebook treats it as a frequency.
freq = [int(name[-7:-2]) for name in clip_names if name[-1] == '1']

sns.distplot(freq, kde=False)
plt.xlabel('Frequency')
plt.ylabel('Count')
plt.title('# of Bird Vocal Occurances at Various Frequencies')
plt.show()

### 与我的假设相反，许多鸟类发声的频率低于我的预期（2000-4000 Hz，而不是 6000-8000 Hz）

# 模型
## 模型创建

In [None]:
def accuracy(outs, labels):
    """Fraction of rows in `outs` whose top-scoring column equals the label.

    Args:
        outs: 2-D tensor of per-class scores, one row per example.
        labels: tensor of class indices, one per row of `outs`.

    Returns:
        A 0-dim tensor holding `correct / number of rows`.
    """
    preds = torch.max(outs, dim=1)[1]
    n_correct = (preds == labels).sum().item()
    return torch.tensor(n_correct / len(preds))

In [None]:
class ModelBase(nn.Module):
    """Per-batch and per-epoch helpers shared by the notebook's classifiers.

    Subclasses supply `forward`; batches are `(inputs, labels)` pairs.
    """

    def train_step(self, batch):
        """Return the cross-entropy loss for one training batch."""
        inputs, targets = batch
        return F.cross_entropy(self(inputs), targets)

    def val_step(self, batch):
        """Return detached loss and accuracy for one validation batch."""
        inputs, targets = batch
        logits = self(inputs)
        return {
            'loss': F.cross_entropy(logits, targets).detach(),
            'acc': accuracy(logits, targets).detach(),
        }

    def val_epoch_end(self, outputs):
        """Average the per-batch dicts produced by `val_step`."""
        losses = torch.stack([step['loss'] for step in outputs])
        accs = torch.stack([step['acc'] for step in outputs])
        return {'avg_loss': losses.mean(), 'avg_acc': accs.mean()}

    def epoch_end(self, epoch, avgs, test=False):
        """Print one summary line for the (0-based) `epoch`.

        Metrics are labelled 'test_' when `test` is true, otherwise 'val_'.
        """
        s = 'test' if test else 'val'
        print(f'Epoch #{epoch + 1}, {s}_loss:{avgs["avg_loss"]}, {s}_acc:{avgs["avg_acc"]}')

In [None]:
@torch.no_grad()
def evaluate(model, val_dl):
    """Run one validation pass and return the model's aggregated metrics.

    Puts `model` in eval mode, calls `model.val_step` on every batch of
    `val_dl`, and hands the collected results to `model.val_epoch_end`.
    Gradient tracking is disabled for the whole call.
    """
    model.eval()
    step_results = []
    for batch in val_dl:
        step_results.append(model.val_step(batch))
    return model.val_epoch_end(step_results)


def fit(epochs, lr, model, train_dl, val_dl, opt_func=torch.optim.Adam):
    """Train `model` for `epochs` epochs, validating after each one.

    Args:
        epochs: number of passes over `train_dl`.
        lr: learning rate handed to the optimizer.
        model: object providing `train_step` and `epoch_end` (plus what
            `evaluate` needs).
        train_dl: iterable of training batches.
        val_dl: iterable of validation batches.
        opt_func: optimizer constructor called as `opt_func(params, lr)`.

    Returns:
        A list with one `evaluate` result per epoch.
    """
    torch.cuda.empty_cache()
    optimizer = opt_func(model.parameters(), lr)
    history = []
    for epoch in range(epochs):
        model.train()
        # training pass, with a progress bar over the batches
        for batch in tqdm(train_dl):
            model.train_step(batch).backward()
            optimizer.step()
            # clear gradients so the next batch starts from zero
            optimizer.zero_grad()
        # validation pass, then report and record the epoch's metrics
        epoch_result = evaluate(model, val_dl)
        model.epoch_end(epoch, epoch_result, test=False)
        history.append(epoch_result)
    return history

In [None]:
class Classifier(ModelBase):
    """Small CNN: three 3x3 convolutions, two 2x2 max-pools, two linear layers.

    The trailing shape notes assume a 1 x 128 x 24 input, the size that
    `fc1`'s `8*32*6` input width is built for.
    """

    def __init__(self):
        super().__init__()                                      # in:  1 x 128 x 24
        self.conv1 = nn.Conv2d(1, 4, kernel_size=3, padding=1)  # ->   4 x 128 x 24
        self.conv2 = nn.Conv2d(4, 8, kernel_size=3, padding=1)  # ->   8 x 128 x 24
        self.bm1 = nn.MaxPool2d(2)                              # ->   8 x 64 x 12
        self.conv3 = nn.Conv2d(8, 8, kernel_size=3, padding=1)  # ->   8 x 64 x 12
        self.bm2 = nn.MaxPool2d(2)                              # ->   8 x 32 x 6
        self.fc1 = nn.Linear(8*32*6, 64)
        self.fc2 = nn.Linear(64, 2)

    def forward(self, xb):
        """Return the two-class scores for a batch `xb`."""
        # convolutional stage 1: conv -> relu -> conv -> relu -> pool
        feats = self.bm1(F.relu(self.conv2(F.relu(self.conv1(xb)))))
        # convolutional stage 2: conv -> relu -> pool
        feats = self.bm2(F.relu(self.conv3(feats)))
        # flatten everything but the batch axis, then the linear head
        hidden = F.relu(self.fc1(torch.flatten(feats, 1)))
        return self.fc2(hidden)

In [None]:
# Instantiate the CNN (still on the CPU at this point).
model = Classifier()

In [None]:
# Show the module's layer summary.
model

## 利用 GPU 和训练模型

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

In [None]:
# Enable cuDNN's benchmark mode (auto-tunes convolution algorithms).
torch.backends.cudnn.benchmark = True

In [None]:
def to_device(data, device):
    """Move a tensor, or every tensor in a nested list/tuple, to `device`.

    Lists and tuples are both returned as lists; anything else must provide
    `.to(device, non_blocking=True)`.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved


class DeviceDataLoader():
    """Thin wrapper that moves each batch of a dataloader to a device."""

    def __init__(self, dl, device):
        """Remember the wrapped loader `dl` and the target `device`."""
        self.dl = dl
        self.device = device

    def __iter__(self):
        """Iterate the wrapped loader, yielding each batch after the move."""
        for batch in self.dl:
            moved = to_device(batch, self.device)
            yield moved

    def __len__(self):
        """Length of the wrapped loader (its number of batches)."""
        return len(self.dl)

In [None]:
# Build the device-aware loaders directly from the datasets. The previous
# `train_dl = DeviceDataLoader(train_dl, device)` depended on loaders from
# another cell and stacked one more wrapper on every re-run; building them
# here makes the cell idempotent. Settings mirror the Mk2 loaders below.
train_dl = DeviceDataLoader(DataLoader(train_ds,
                                       batch_size=batch_size,
                                       shuffle=True,
                                       pin_memory=True), device)
val_dl = DeviceDataLoader(DataLoader(val_ds,
                                     batch_size=batch_size,
                                     pin_memory=True), device)
# Fresh model on the chosen device, plus the training hyper-parameters.
model = to_device(Classifier(), device)
lr = 1e-5
epochs = 8
# Baseline: validation metrics of the untrained model.
history = [evaluate(model, val_dl)]
history


In [None]:
# First training stage: `epochs` epochs at learning rate `lr`.
history += fit(epochs, lr, model, train_dl, val_dl)

In [None]:
# Second stage: 3 more epochs at a 10x smaller learning rate.
history += fit(3, 1e-6, model, train_dl, val_dl)

In [None]:
# Validation loss per recorded evaluation (index 0 is the untrained model).
# The history holds 0-dim tensors living on `device`; matplotlib cannot
# convert CUDA tensors, so each one is turned into a Python float first.
plt.plot([float(x['avg_loss']) for x in history])
plt.title('Losses over epochs')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.show()

In [None]:
# Validation accuracy per recorded evaluation (index 0 is the untrained model).
# Converted to Python floats so the plot does not depend on which device the
# metric tensors live on.
plt.plot([float(x['avg_acc']) for x in history])
plt.title('Accuracy over epochs')
plt.xlabel('epochs')
plt.ylabel('acc')
plt.show()

In [None]:
# Persist the trained weights (state dict only) next to the notebook.
torch.save(model.state_dict(), 'Classifier.pth')

In [None]:
# Reload the saved weights. `map_location` remaps the stored tensors onto the
# current device, so a checkpoint written on a GPU also loads on a CPU-only
# machine.
model.load_state_dict(torch.load('Classifier.pth', map_location=device))

In [None]:
# Validation metrics of the reloaded model.
evaluate(model, val_dl)

# 尝试使用随机变换进行训练优化
## 不同的时移

In [None]:
# Reference: the unmodified clip.
ipd.Audio(sample_array, rate=24000)

In [None]:
# Same clip circularly shifted by 4800 samples (0.2 s at 24000 Hz).
ipd.Audio(np.roll(sample_array, -4800), rate=24000)

## 引入背景噪音

In [None]:
# Reference again: the unmodified clip, for comparison with the noisy one.
ipd.Audio(sample_array, rate=24000)

In [None]:
# Clip with added Gaussian noise (standard normal scaled by 0.002).
ipd.Audio(sample_array + 0.002*np.random.randn(12000), rate=24000)

In [None]:
# Spectrogram of the clean clip, in dB.
display.specshow(librosa.power_to_db(sample_spec), sr=24000)

In [None]:
# Spectrogram of the noisy clip, in dB. The audio is passed as `y=`
# (keyword-only in librosa >= 0.10) and `sr=24000` is given explicitly so the
# mel filter bank matches the clean `sample_spec` instead of falling back to
# librosa's 22050 Hz default.
display.specshow(librosa.power_to_db(librosa.feature.melspectrogram(
    y=sample_array + 0.002*np.random.randn(12000), sr=24000)),
                 sr=24000)

## 创建转换函数

In [None]:
def tfms():
    """Pick one waveform augmentation at random and return it.

    The candidates, chosen with equal probability, are:
      * a circular shift by a random offset in [-3000, 3000] samples,
      * additive Gaussian noise scaled by 0.005,
      * the identity (waveform returned unchanged).

    Returns:
        A function mapping a 1-D waveform array to a waveform array.
    """
    def time_shift(wave):
        return np.roll(wave, random.randint(-3000, 3000))

    def add_noise(wave):
        return wave + 0.005*np.random.randn(len(wave))

    def identity(wave):
        return wave

    return random.choice((time_shift, add_noise, identity))

In [None]:
# Mk2 datasets: training clips get a random augmentation from `tfms` on every
# access; validation clips are left untouched.
# `fnames` must still be the list of HDF5 file names at this point.
train_dsmk2 = BirdVox70kDS(root_dir, fnames, transforms=tfms)
val_dsmk2 = BirdVox70kDS(root_dir, ['BirdVox-70k_unit01.hdf5'])

In [None]:
# Device-aware loaders for Mk2; only the training loader is shuffled.
train_dlmk2 = DeviceDataLoader(DataLoader(train_dsmk2,
                                          batch_size=batch_size,
                                          shuffle=True,
                                          pin_memory=True), device)
val_dlmk2 = DeviceDataLoader(DataLoader(val_dsmk2,
                                        batch_size=batch_size,
                                        pin_memory=True), device)

## 训练 Mk2

In [None]:
# Fresh classifier for the augmented run, plus its untrained baseline metrics.
modelmk2 = to_device(Classifier(), device)
historymk2 = [evaluate(modelmk2, val_dlmk2)]
historymk2

In [None]:
# First Mk2 stage: 5 epochs at learning rate 1e-5.
lr = 1e-5
epochs = 5
historymk2 += fit(epochs, lr, modelmk2, train_dlmk2, val_dlmk2)

In [None]:
# Second Mk2 stage: another `epochs` epochs at learning rate 1e-6.
historymk2 += fit(epochs, 1e-6, modelmk2, train_dlmk2, val_dlmk2)

In [None]:
# Persist the Mk2 weights (state dict only).
torch.save(modelmk2.state_dict(), 'model_mk2.pth')

In [1]:
# Metrics of the last Mk2 evaluation. Needs the Mk2 training cells to have run
# in the current kernel; on a fresh kernel this raises NameError.
historymk2[-1]

NameError: name 'historymk2' is not defined

# 迁移学习

In [None]:
class Rnet50(ModelBase):
    """torchvision ResNet-50 with a 1-channel stem and a 2-class output layer."""

    def __init__(self):
        super().__init__()
        # NOTE(review): `resnet50()` is called without a weights argument, so
        # torchvision's default applies. Confirm whether pretrained weights
        # were intended, given this section is about transfer learning.
        backbone = torchvision.models.resnet50()
        # first convolution rebuilt to accept a single input channel
        backbone.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2,
                                   padding=3, bias=False)
        # final fully connected layer rebuilt to emit 2 scores
        backbone.fc = nn.Linear(2048, 2, bias=True)
        self.network = backbone

    def forward(self, xb):
        """Return the wrapped network's output for batch `xb`."""
        return self.network(xb)

In [None]:
# Instantiate the ResNet-50 variant on the chosen device.
modelmk3 = to_device(Rnet50(), device)

In [None]:
# Baseline validation metrics before any training (un-augmented loaders).
historymk3 = [evaluate(modelmk3, val_dl)]
historymk3

In [None]:
# Train the ResNet-50 variant: 4 epochs at learning rate 1e-6.
lr = 1e-6
epochs = 4
historymk3 += fit(epochs, lr, modelmk3, train_dl, val_dl)

In [None]:
import jovian

In [None]:
# Commit this notebook to the Jovian project 'bird-audio-detection'.
jovian.commit(filename='bird audio detection', project='bird-audio-detection',
              environment=None)