## 音频数据转图像

In [1]:
import os
import warnings

warnings.filterwarnings(action='ignore')

import pandas as pd
import librosa
import numpy as np

from sklearn.utils import shuffle
from PIL import Image
from tqdm import tqdm



In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

path = "/content/gdrive/My Drive/Colab Notebooks/AudioRecognition"
os.chdir(path)
print(os.getcwd())

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/Colab Notebooks/AudioRecognition


In [3]:
# Load metadata file
Labels = pd.read_excel('./dataset/Labels.xlsx')
print(Labels)


      中文名                        英文名  Label_ID
0    丝光椋鸟        Red-billed Starling         1
1     中白鹭         Intermediate Egret         2
2    白腹蓝鹟  Blue-and-white Flycatcher         3
3    针尾沙锥           Pin-tailed Snipe         4
4     白腹鹞      Eastern Marsh-Harrier         5
5    远东苇莺    Manchurian Reed Warbler         6
6      池鹭         Chinese Pond-Heron         7
7    灰眉岩鹀               Rock Bunting         8
8    荒漠伯劳          Isabelline Shrike         9
9    北领角鸮         Japanese Scops-Owl        10
10   红背伯劳          Red-backed Shrike        11
11  古铜色卷尾             Bronzed Drongo        12
12   冠纹柳莺     Claudia's Leaf Warbler        13
13    家八哥                Common Myna        14
14     勺鸡           Koklass Pheasant        15
15   横斑林莺             Barred Warbler        16
16    松雀鹰                      Besra        17
17    草原鹞             Pallid Harrier        18
18   黄腹柳莺     Tickell's Leaf Warbler        19
19   灰脸鹟莺       Gray-cheeked Warbler        20


In [4]:
def get_train_spectrograms(filepath, output_dir):
    SAMPLE_RATE = 32000
    SIGNAL_LENGTH = 5 # seconds
    SPEC_SHAPE = (224, 224) # height x width
    FMIN = 20
    FMAX = 16000
    for i in range(20):
        i_dir = filepath + str(i + 1) + "/"
        for i_filename in os.listdir(i_dir):
            i_filename = i_dir + i_filename
            i_filename = '.' + i_filename.split('.')[:-1][1] + '.ogg'

            save_dir = output_dir + str(i + 1)
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)

            save_path = save_dir + "/" + i_filename.split('/')[-1].split('.')[0] + '.png'
            if os.path.exists(save_path):
              continue

            signal, sr = librosa.load(i_filename, sr=SAMPLE_RATE, offset=None, duration=5)
            hop_length = int(SIGNAL_LENGTH * SAMPLE_RATE / (SPEC_SHAPE[1] - 1))
            mel_spec = librosa.feature.melspectrogram(y=signal,
                                                    sr=SAMPLE_RATE,
                                                    n_fft=2048,
                                                    hop_length=hop_length,
                                                    n_mels=SPEC_SHAPE[0],
                                                    fmin=FMIN,
                                                    fmax=FMAX)

            mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

            # Normalize
            mel_spec -= mel_spec.min()
            mel_spec /= mel_spec.max()

            im = Image.fromarray(mel_spec * 255.0).convert("L")
            im.save(save_path)


get_train_spectrograms('./dataset/train/', './dataset/train/images/')

## 数据读取

In [5]:
import os

from sklearn.model_selection import train_test_split
import shutil

X_train = []
y_train = []
X_val = []
y_val = []

def Load_Train_Dataset(filepath):
    for i in range(20):
        i_dir = filepath + str(i + 1) + "/"
        i_data = []
        for i_filename in os.listdir(i_dir):
            i_filename = i_dir + i_filename
            i_data.append(i_filename)
        train, val = train_test_split(i_data, test_size=0.2, random_state=42)
        X_train.extend(train)
        y_train.extend([i + 1 for _ in range(len(train))])
        X_val.extend(val)
        y_val.extend([i + 1 for _ in range(len(val))])

Load_Train_Dataset("./dataset/train/images/")
dset_sizes = len(X_train)
dset_sizes_val = len(X_val)


In [6]:
pip install --upgrade efficientnet-pytorch



## 训练

In [6]:
import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.parallel
from torch.autograd import Variable
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from efficientnet_pytorch import EfficientNet
import os
import time

In [14]:
momentum = 0.9
BATCH_SIZE = 32
class_num = 20
EPOCHS = 50
lr = 0.001
use_gpu = False
if torch.cuda.is_available():
    use_gpu = True
    print('using gpu')
net_name = 'efficientnet-b3'
DEVICE = torch.device('cuda' if use_gpu else 'cpu')

# 数据预处理
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
])

class CustomImageDataset(Dataset):
    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx])
        label = self.labels[idx]
        if self.transform:
            image = self.transform(image)
        return image, label


# 创建数据集
dataset_train = CustomImageDataset(X_train, y_train, transform=transform)
dataset_val = CustomImageDataset(X_val, y_val, transform=transform)

# 创建数据加载器
train_loader = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset_val, batch_size=BATCH_SIZE, shuffle=False)

In [18]:
def exp_lr_scheduler(optimizer, epoch, init_lr=0.001, lr_decay_epoch=10):
    lr = init_lr * (0.8 ** (epoch // lr_decay_epoch))
    print('LR is set to {}'.format(lr))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer

In [22]:
def train_model(model_ft, criterion, optimizer, lr_scheduler, num_epochs=50):
    train_loss = []
    since = time.time()
    best_model_wts = model_ft.state_dict()
    best_acc = 0.0
    model_ft.train(True)
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        optimizer = lr_scheduler(optimizer, epoch)
        running_loss = 0.0
        running_corrects = 0
        count = 0
        for data in train_loader:
            inputs, labels = data
            labels = torch.squeeze(labels.type(torch.LongTensor))
            labels -= 1
            if use_gpu:
                inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda())
            else:
                inputs, labels = Variable(inputs), Variable(labels)
            outputs = model_ft(inputs)
            loss = criterion(outputs, labels)
            _, preds = torch.max(outputs.data, 1)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            count += 1
            if count % 30 == 0 or outputs.size()[0] < BATCH_SIZE:
                print('Epoch:{}: loss:{:.3f}'.format(epoch, loss.item()))
                train_loss.append(loss.item())
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)
        epoch_loss = running_loss / dset_sizes
        epoch_acc = running_corrects.double() / dset_sizes
        print('Loss: {:.4f} Acc: {:.4f}'.format(
            epoch_loss, epoch_acc))
        if epoch_acc > best_acc:
            best_acc = epoch_acc
            best_model_wts = model_ft.state_dict()
    # save best model
    save_dir = './model'
    os.makedirs(save_dir, exist_ok=True)
    model_ft.load_state_dict(best_model_wts)
    model_out_path = save_dir + "/" + net_name + '.pth'
    torch.save(model_ft, model_out_path)
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    return train_loss, best_model_wts

In [23]:
model_ft = EfficientNet.from_pretrained('efficientnet-b3')
num_ftrs = model_ft._fc.in_features
model_ft._fc = nn.Linear(num_ftrs, class_num)
criterion = nn.CrossEntropyLoss()
if use_gpu:
    model_ft = model_ft.cuda()
    criterion = criterion.cuda()
optimizer = optim.Adam((model_ft.parameters()), lr=lr)
train_loss, best_model_wts = train_model(model_ft, criterion, optimizer, exp_lr_scheduler, num_epochs=EPOCHS)

Loaded pretrained weights for efficientnet-b3
Epoch 0/49
----------
LR is set to 0.001
Epoch:0: loss:2.054
Epoch:0: loss:1.806
Loss: 2.1640 Acc: 0.3763
Epoch 1/49
----------
LR is set to 0.001
Epoch:1: loss:0.889
Epoch:1: loss:1.568
Loss: 1.0137 Acc: 0.7114
Epoch 2/49
----------
LR is set to 0.001
Epoch:2: loss:0.440
Epoch:2: loss:1.100
Loss: 0.5717 Acc: 0.8347
Epoch 3/49
----------
LR is set to 0.001
Epoch:3: loss:0.405
Epoch:3: loss:0.199
Loss: 0.3850 Acc: 0.8913
Epoch 4/49
----------
LR is set to 0.001
Epoch:4: loss:0.172
Epoch:4: loss:0.445
Loss: 0.2450 Acc: 0.9297
Epoch 5/49
----------
LR is set to 0.001
Epoch:5: loss:0.170
Epoch:5: loss:0.465
Loss: 0.2580 Acc: 0.9269
Epoch 6/49
----------
LR is set to 0.001
Epoch:6: loss:0.054
Epoch:6: loss:0.436
Loss: 0.2331 Acc: 0.9269
Epoch 7/49
----------
LR is set to 0.001
Epoch:7: loss:0.032
Epoch:7: loss:0.027
Loss: 0.2226 Acc: 0.9251
Epoch 8/49
----------
LR is set to 0.001
Epoch:8: loss:0.133
Epoch:8: loss:0.961
Loss: 0.1615 Acc: 0.9516


## 验证集

In [57]:
correct = 0
total = 0


with torch.no_grad():
    for data in val_loader:
        inputs, labels = data
        labels = torch.squeeze(labels.type(torch.LongTensor))
        labels -= 1
        if use_gpu:
            inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda())
        else:
            inputs, labels = Variable(inputs), Variable(labels)
        outputs = model_ft(inputs)
        _, preds = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (preds == labels).sum().item()

print('Accuracy of the network: %.5f %%' % (100 * correct / total))

Accuracy of the network: 48.20144 %


## 测试集

In [56]:
import os
import pandas as pd
import torch
import librosa
import numpy as np

model = torch.load('./model/efficientnet-b3.pth')
model.eval()

import torchvision.transforms as transforms
from PIL import Image

transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
])

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
data = {'FileName': [], 'Predicted_ID': []}


def predict(filepath, output_dir):
    SAMPLE_RATE = 32000
    SIGNAL_LENGTH = 5 # seconds
    SPEC_SHAPE = (224, 224) # height x width
    FMIN = 20
    FMAX = 16000
    for filename in os.listdir(filepath):
        if filename.find('images') >= 0:
          continue
        i_filename = filepath + filename
        i_filename = '.' + i_filename.split('.')[:-1][1] + '.ogg'


        save_dir = output_dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        save_path = save_dir + i_filename.split('/')[-1].split('.')[0] + '.png'

        signal, sr = librosa.load(i_filename, sr=SAMPLE_RATE, offset=None, duration=5)
        hop_length = int(SIGNAL_LENGTH * SAMPLE_RATE / (SPEC_SHAPE[1] - 1))
        mel_spec = librosa.feature.melspectrogram(y=signal,
                                                sr=SAMPLE_RATE,
                                                n_fft=2048,
                                                hop_length=hop_length,
                                                n_mels=SPEC_SHAPE[0],
                                                fmin=FMIN,
                                                fmax=FMAX)

        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Normalize
        mel_spec -= mel_spec.min()
        mel_spec /= mel_spec.max()

        im = Image.fromarray(mel_spec * 255.0).convert("L")
        im.save(save_path)

        im = transform(im)
        im.unsqueeze_(0)
        im = im.to(device)
        p = model(im)[0].cpu()
        print(p)
        idx = p.argmax()
        score = p[idx]
        print(filename + ' score: ', score.item())
        data['FileName'].append(filename)
        if score > 0.75:
          data['Predicted_ID'].append(idx.item() + 1)
        else:
          data['Predicted_ID'].append(0)


predict('./dataset/test1_easy/', './dataset/test1_easy/images/')

results = pd.DataFrame(data, columns=['FileName', 'Predicted_ID'])
results = results.sort_values('FileName')
results.head()
results.to_csv('./dataset/submission_easy.csv', index=False)

torch.Size([1, 20])
tensor([-3.3581, -4.6178,  0.0639, -5.2362, -4.2734, -4.1184, -1.6567, -0.2691,
        -4.0477, -5.3160, -3.8203, -1.5596,  0.2587, -3.0499, 12.0996,  2.0105,
        -0.6018,  1.5999,  0.4135, -3.1113], grad_fn=<ToCopyBackward0>)
TEST181.ogg score:  12.099565505981445
torch.Size([1, 20])
tensor([ 5.5533, -1.9240, -4.9149,  4.2735, -5.5876,  2.0896, -1.4496, -2.3201,
        -4.2411, -1.7491, -1.0870,  1.8278, -2.2162,  0.7563, -1.9376, -4.1568,
        -3.7145, -4.3586, -0.9388, -2.3824], grad_fn=<ToCopyBackward0>)
TEST187.ogg score:  5.553257465362549
torch.Size([1, 20])
tensor([-1.6047, -5.0604,  1.9999, -3.3537, -2.4874, -1.5719, -4.4532, -0.9304,
        -2.3947, -3.6556, -4.3040,  1.8282, -1.4893, -2.7664, -0.1469,  0.6109,
        14.3780, -1.1050, -0.6655,  0.5274], grad_fn=<ToCopyBackward0>)
TEST189.ogg score:  14.377985000610352
torch.Size([1, 20])
tensor([ -7.7990, -10.8523,  -4.3281,  -4.9188,  -7.2311,  -1.5582,  -4.1805,
         -4.7832,  -1.1270,  -

In [59]:
import os
import pandas as pd
import torch
import librosa
import numpy as np

model = torch.load('./model/efficientnet-b3.pth')
model.eval()

import torchvision.transforms as transforms
from PIL import Image

transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
])

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
data = {'FileName': [], 'Predicted_ID': []}


def predict(filepath, output_dir):
    SAMPLE_RATE = 32000
    SIGNAL_LENGTH = 5 # seconds
    SPEC_SHAPE = (224, 224) # height x width
    FMIN = 20
    FMAX = 16000
    for filename in os.listdir(filepath):
        if filename.find('images') >= 0:
          continue
        i_filename = filepath + filename
        i_filename = '.' + i_filename.split('.')[:-1][1] + '.ogg'


        save_dir = output_dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        save_path = save_dir + i_filename.split('/')[-1].split('.')[0] + '.png'

        signal, sr = librosa.load(i_filename, sr=SAMPLE_RATE, offset=None, duration=5)
        hop_length = int(SIGNAL_LENGTH * SAMPLE_RATE / (SPEC_SHAPE[1] - 1))
        mel_spec = librosa.feature.melspectrogram(y=signal,
                                                sr=SAMPLE_RATE,
                                                n_fft=2048,
                                                hop_length=hop_length,
                                                n_mels=SPEC_SHAPE[0],
                                                fmin=FMIN,
                                                fmax=FMAX)

        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Normalize
        mel_spec -= mel_spec.min()
        mel_spec /= mel_spec.max()

        im = Image.fromarray(mel_spec * 255.0).convert("L")
        im.save(save_path)

        im = transform(im)
        im.unsqueeze_(0)
        im = im.to(device)
        p = model(im)[0].cpu()
        print(p)
        idx = p.argmax()
        score = p[idx]
        print(filename + ' score: ', score.item())
        data['FileName'].append(filename)
        if score > 0.75:
          data['Predicted_ID'].append(idx.item() + 1)
        else:
          data['Predicted_ID'].append(0)


predict('./dataset/test2_hard/', './dataset/test2_hard/images/')

results = pd.DataFrame(data, columns=['FileName', 'Predicted_ID'])
results = results.sort_values('FileName')
results.head()
results.to_csv('./dataset/submission_hard.csv', index=False)

tensor([-2.6718, -6.8242, -2.9994, -6.7388, -3.3221, -3.3498, -7.7549, -1.5313,
        -5.3243, -7.8576, -3.8581, -2.3552,  3.3851, -0.5436, -3.1432, -4.8147,
        -1.0051, -7.2115,  1.4699, 14.1061], grad_fn=<ToCopyBackward0>)
TEST1182.ogg score:  14.10611629486084
tensor([-3.2488,  0.2506, -1.6406,  0.5870, -4.7545, -1.9584, 13.8303, -2.9609,
        -2.4672, -3.2987, -4.6312, -1.5451, -2.5298,  1.4690, -3.1115, -4.0271,
        -7.3989,  0.6812, -3.7002,  2.1912], grad_fn=<ToCopyBackward0>)
TEST1186.ogg score:  13.830338478088379
tensor([ 4.2172, -5.7935, -3.2955,  0.5892, -5.3863,  5.9951, -7.8882, -2.8830,
        -4.3333, -4.9935,  2.3928, -3.4681,  2.9395,  0.6459, -5.8655, -4.5806,
        -6.3887, -7.5794, -0.6856, -4.1632], grad_fn=<ToCopyBackward0>)
TEST1170.ogg score:  5.995135307312012
tensor([-0.0787, -0.5412, -1.4374, -0.1150, -1.6518, -2.2089, -0.1810, -0.1444,
         0.1661, -1.9214, -3.8357, -1.2956,  2.6176, -1.5840,  4.9589, -0.5164,
         2.0962,  1.3726, 