In [1]:
# Root directory of the competition data; the paths below are relative to it.
DATA_DIR = '../input/hse-acoustic-event-detection-2022'

# Training split: metadata CSV and the folder holding its audio files.
train_meta_fname = 'train.csv'
train_data_dir = 'audio_train/train'

# Test split: the sample submission CSV is used as the test file listing.
test_meta_fname = 'sample_submission.csv'
test_data_dir = 'audio_test/test'

In [2]:
! git clone -q https://github.com/YuanGongND/ast
! pip install -q timm==0.4.5
! pip install -q wget

import sys
import os, csv, argparse, wget
import torch, torchaudio, timm
import numpy as np
from torch.cuda.amp import autocast
import IPython

from torch.utils.data import Dataset, DataLoader

# Make the cloned AST repository importable.
sys.path.append('./ast')

# Batch size used by the audio data loaders.
BATCH_SIZE = 16
# Prefer the GPU when CUDA is available, otherwise stay on the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

[0m

In [3]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch.nn as nn

from sklearn.metrics import f1_score

# Training objective: cross-entropy loss.
train_loss_fn = nn.CrossEntropyLoss()


def val_loss_fn(predictions, target):
    """Return the macro-averaged F1 score of `predictions` against `target`.

    :param predictions: predicted class labels, one per sample
    :param target: ground-truth class labels, in the same order
    :return: the value of `f1_score(target, predictions, average='macro')`
    """
    # scikit-learn metrics take the ground truth first: f1_score(y_true, y_pred).
    # The arguments used to be passed the other way round.
    return f1_score(target, predictions, average='macro')

In [4]:
# Feature extractor
def make_features(waveform, sr, mel_bins=128, target_length=1024,
                  norm_mean=-4.2677393, norm_std=4.5689974):
    """Turn a waveform into a fixed-length, normalized filterbank feature matrix.

    :param waveform: audio tensor passed straight to
        `torchaudio.compliance.kaldi.fbank` (assumed to be (channels, samples)
        as returned by `torchaudio.load` -- TODO confirm)
    :param sr: sampling rate in Hz; must be 16000
    :param mel_bins: number of mel filterbank bins
    :param target_length: number of frames in the result; shorter inputs are
        zero-padded at the end, longer ones are cut to the first frames
    :param norm_mean: value subtracted from the filterbank features
    :param norm_std: the centered features are divided by twice this value
    :return: tensor with `target_length` rows (frames)
    :raises AssertionError: if `sr` is not 16000

    The defaults of `norm_mean` / `norm_std` are the constants that used to be
    hard-coded here (presumably dataset statistics of the pretrained model --
    verify before changing them).
    """
    assert sr == 16000, 'input audio sampling rate must be 16kHz'

    fbank = torchaudio.compliance.kaldi.fbank(
        waveform, htk_compat=True, sample_frequency=sr, use_energy=False,
        window_type='hanning', num_mel_bins=mel_bins, dither=0.0, frame_shift=10)

    # Positive: frames are missing (pad); negative: surplus frames (truncate).
    missing = target_length - fbank.shape[0]
    if missing > 0:
        fbank = torch.nn.ZeroPad2d((0, 0, 0, missing))(fbank)
    elif missing < 0:
        fbank = fbank[:target_length, :]

    # Padding happens before normalization, so padded rows are shifted too.
    return (fbank - norm_mean) / (norm_std * 2)

In [5]:
from src.models import ASTModel

# Cut the mlp_head off the model (so that at least something is left to train)
class MyASTModel(ASTModel):
    """ASTModel variant whose forward pass stops before the mlp_head."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # forward() below never uses the head.
        self.mlp_head = None

    @autocast()
    def forward(self, x):
        """
        Embed a batch of spectrograms.

        :param x: the input spectrogram, expected shape: (batch_size, time_frame_num, frequency_bins), e.g., (12, 1024, 128)
        :return: per sample, the mean of the first two output tokens (cls and dist)
        """
        # (batch, time, freq) -> (batch, 1, freq, time)
        spec = x.unsqueeze(1).transpose(2, 3)
        batch_size = spec.shape[0]

        tokens = self.v.patch_embed(spec)
        cls_tokens = self.v.cls_token.expand(batch_size, -1, -1)
        dist_token = self.v.dist_token.expand(batch_size, -1, -1)
        tokens = torch.cat((cls_tokens, dist_token, tokens), dim=1)

        tokens = self.v.pos_drop(tokens + self.v.pos_embed)
        for block in self.v.blocks:
            tokens = block(tokens)
        tokens = self.v.norm(tokens)

        return (tokens[:, 0] + tokens[:, 1]) / 2

In [6]:
# Download the pretrained model weights unless they are already on disk
audioset_mdl_url = 'https://www.dropbox.com/s/cv4knew8mvbrnvq/audioset_0.4593.pth?dl=1'
checkpoint_path = 'ast/pretrained_models/audio_mdl.pth'

if not os.path.exists(checkpoint_path):
    print('Downloading weights...')
    wget.download(audioset_mdl_url, out=checkpoint_path)
    print('Complete!')

Downloading weights...
Complete!


In [7]:
# Input and output dimensions of the pretrained model
output_dim = 527
input_tdim = 1024

# Architecture matching the downloaded weights
ast_mdl = MyASTModel(
    label_dim=output_dim,
    input_tdim=input_tdim,
    imagenet_pretrain=False,
    audioset_pretrain=False,
)
checkpoint = torch.load(checkpoint_path, map_location=DEVICE)

# Wrap the model in DataParallel, as in the original work (needed to load the weights)
audio_model = torch.nn.DataParallel(ast_mdl, device_ids=[0])
audio_model.load_state_dict(checkpoint)
# Move to the target device and switch to inference mode
audio_model = audio_model.to(DEVICE).eval()

---------------AST Model Summary---------------
ImageNet pretraining: False, AudioSet pretraining: False
frequncey stride=10, time stride=10
number of patches=1212


In [8]:
class AST_train_dataset(Dataset):
    """Labelled audio dataset: yields model input features with the encoded label."""

    def __init__(self, data_dir: str, meta: pd.DataFrame, transforms=None):
        self.data_dir = data_dir
        self.transforms = transforms

        # File names in the dataset
        self.filenames = meta['fname'].tolist()

        # File name -> encoded label lookup
        self.labels_dict = dict(zip(meta['fname'], meta['label_encoded']))

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, index):
        name = self.filenames[index]

        # Load the wav file
        waveform, sr = torchaudio.load(Path(DATA_DIR, self.data_dir, name))

        # Apply the optional transforms to the raw waveform
        if self.transforms is not None:
            waveform = self.transforms(waveform)

        # Compute the model input features and look up the label
        return {'frets': make_features(waveform, sr),
                'label': self.labels_dict[name]}

In [9]:
class AST_test_dataset(Dataset):
    """Unlabelled audio dataset: yields only the model input features."""

    def __init__(self, data_dir: str, meta: pd.DataFrame):
        self.data_dir = data_dir
        # File names in the dataset
        self.filenames = meta['fname'].tolist()

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, index):
        # Load the wav file and compute the model input features
        wav_path = Path(DATA_DIR, self.data_dir, self.filenames[index])
        waveform, sr = torchaudio.load(wav_path)
        return {'frets': make_features(waveform, sr)}

In [10]:
@torch.no_grad()
def extract_features(audio_model, dataloader, return_labels=True):
    """Run `audio_model` over `dataloader` and stack the outputs into one array.

    :param audio_model: module called on each batch of input features
    :param dataloader: iterable of dict batches holding a 'frets' tensor and,
        when `return_labels` is true, a 'label' tensor
    :param return_labels: whether to collect batch['label'] as well
    :return: (features, labels) as numpy arrays; `labels` is an empty array
        when `return_labels` is false
    """
    model_outputs = []
    labels = []

    for batch in tqdm(iterable=dataloader):
        frets = batch['frets'].to(DEVICE)

        # Gradients are already disabled by the decorator, so no nested
        # no_grad() is needed. Call the module itself rather than .forward()
        # so that any registered hooks run.
        with autocast():
            output = audio_model(frets)

        model_outputs.append(output.cpu().numpy())

        if return_labels:
            labels.extend(batch['label'].tolist())

    return np.vstack(model_outputs), np.array(labels)

In [17]:
if __name__ == '__main__':
    
    # Загружаем csv
    df_train_val = pd.read_csv(Path(DATA_DIR, train_meta_fname))
    df_test = pd.read_csv(Path(DATA_DIR, test_meta_fname))
    
    # Задаем метки классов
    n_classes = df_train_val.label.nunique()
    classes_dict = {cl:i for i,cl in enumerate(df_train_val.label.unique())}
    df_train_val['label_encoded'] = df_train_val.label.map(classes_dict)
    
    # Делим на train и val (по label_encoded)
    df_train, df_val = train_test_split(df_train_val, test_size=0.2, stratify=df_train_val['label_encoded'])
    print(f'df_train length: {len(df_train)}')
    print(f'df_val length:   {len(df_val)}')
    print(f'df_test length:  {len(df_test)}')
    
    # Создаем датасеты и даталодеры
    train_dataset = AST_train_dataset(train_data_dir, df_train)
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
    
    val_dataset = AST_train_dataset(train_data_dir, df_val)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    
    test_dataset = AST_test_dataset(test_data_dir, df_test)
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    
    # Cоздаем директорию для сохранения извлеченных фичей
    !mkdir features

    # Извлекаем фичи и сохраням
    train_features, train_labels = extract_features(audio_model, train_dataloader)
    np.save('features/train_features', train_features)
    np.save('features/train_labels', train_labels)

    val_features, val_labels = extract_features(audio_model, val_dataloader)
    np.save('features/val_features', val_features)
    np.save('features/val_labels', val_labels)
    
    test_features, _ =  extract_features(audio_model, test_dataloader, return_labels=False)
    np.save('features/test_features', test_features)
    
    # Архивируем для скачивния
    !zip -r features.zip features

df_train length: 4546
df_val length:   1137
df_test length:  3790


100%|██████████| 237/237 [03:45<00:00,  1.05it/s]


updating: features/ (stored 0%)
updating: features/test_features.npy (deflated 8%)
updating: features/val_labels.npy (deflated 83%)
updating: features/val_features.npy (deflated 8%)
updating: features/train_features.npy (deflated 8%)
updating: features/train_labels.npy (deflated 85%)


### MLP Classifier (отдельный ноутбук)

In [18]:
class MLP_dataset(Dataset):
    """Pairs precomputed feature vectors with their labels."""

    def __init__(self, features, labels):
        self.features, self.labels = features, labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        feature, label = self.features[index], self.labels[index]
        return {'features': feature, 'labels': label}

In [19]:
def train_epoch(model, optimizer,
                dataloader, loss_fn, epoch):
    """Train `model` for one pass over `dataloader`.

    :param model: module called on each feature batch
    :param optimizer: optimizer stepping the model's parameters
    :param dataloader: iterable of dict batches with 'features' and 'labels' tensors
    :param loss_fn: callable(output, labels) returning a scalar loss tensor
        (assumed to be a per-batch mean -- TODO confirm for custom losses)
    :param epoch: epoch index, shown in the progress bar description
    :return: loss averaged over all processed samples (nan if `dataloader`
        yields no batches)
    """
    # evaluate_epoch() / predict() leave the model in eval mode; switch back.
    model.train()

    total_loss = 0.0
    processed = 0

    pbar = tqdm(iterable=dataloader,
                desc=f'epoch {epoch}')

    for batch in pbar:
        features = batch['features'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        output = model(features)
        loss = loss_fn(output, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # `batch` is a dict, so len(batch) is its number of keys, not the
        # number of samples; weight the batch loss by the real batch size.
        batch_size = labels.size(0)
        total_loss += loss.item() * batch_size
        processed += batch_size

        # Running mean over the samples seen so far in this epoch
        pbar.set_postfix({'batch loss': total_loss / processed})

    return total_loss / processed if processed else float('nan')

In [20]:
@torch.no_grad()
def evaluate_epoch(model, dataloader, epoch):
    """Score `model` on `dataloader` with the module-level `val_loss_fn`.

    Puts the model into eval mode, collects the argmax prediction of every
    sample, prints the resulting score and returns it.

    :param model: module called on each feature batch
    :param dataloader: iterable of dict batches with 'features' and 'labels' tensors
    :param epoch: epoch index, shown in the progress bar description
    :return: the value of `val_loss_fn(predictions, targets)`
    """
    model.eval()

    pbar = tqdm(iterable=dataloader,
                desc=f'epoch {epoch}')

    predictions = []
    targets = []

    for batch in pbar:
        features = batch['features'].to(DEVICE)
        output = model(features)

        predictions.extend(torch.argmax(output, dim=1).cpu().tolist())
        targets.extend(batch['labels'].tolist())

    # Print as before, but also hand the score back so callers can use it
    # (e.g. to track the best epoch) instead of it being discarded.
    score = val_loss_fn(predictions, targets)
    print(score)
    return score

In [21]:
@torch.no_grad()
def predict(model, dataloader, epoch):
    """Return the argmax class index for every sample in `dataloader`.

    :param model: module called on each feature batch; put into eval mode here
    :param dataloader: iterable of dict batches with a 'features' tensor
    :param epoch: only used in the progress bar description
    :return: flat list of predicted class indices, in dataloader order
    """
    model.eval()

    predicted = []
    for batch in tqdm(iterable=dataloader, desc=f'epoch {epoch}'):
        logits = model(batch['features'].to(DEVICE))
        predicted.extend(torch.argmax(logits, dim=1).cpu().tolist())

    return predicted

In [22]:
def train(model, optimizer, 
          train_dataloader, val_dataloader,
          train_loss_fn, val_loss_fn,
          n_epochs: int = 51, eval_every: int = 5):
    """Train `model` for `n_epochs`, evaluating periodically.

    :param model: model to train
    :param optimizer: optimizer passed to `train_epoch`
    :param train_dataloader: batches for `train_epoch`
    :param val_dataloader: batches for `evaluate_epoch`
    :param train_loss_fn: loss passed to `train_epoch`
    :param val_loss_fn: accepted for interface compatibility but not used
        here; `evaluate_epoch` reads the module-level `val_loss_fn` itself
    :param n_epochs: number of training epochs
    :param eval_every: evaluate on epochs divisible by this value; `None` or
        `0` disables evaluation
    """
    for epoch in range(n_epochs):

        train_epoch(model, optimizer, train_dataloader, train_loss_fn, epoch)

        # Truthiness covers both None and 0; `epoch % 0` would otherwise
        # raise ZeroDivisionError.
        if eval_every and epoch % eval_every == 0:
            evaluate_epoch(model, val_dataloader, epoch)

In [23]:
# Size the classifier from the data instead of hard-coding the embedding
# width (768) and the number of classes (41).
n_features = train_features.shape[1]

classifier = nn.Sequential(
    nn.Linear(n_features, 256),
    nn.ReLU(),
    nn.Linear(256, n_classes)
).to(DEVICE)

optimizer = torch.optim.Adam(classifier.parameters())

MLP_train_dataset = MLP_dataset(train_features, train_labels)
# Full-batch training: a single batch holding the whole training set, however
# many rows the split produced (previously the literal 4546).
MLP_train_dataloader = DataLoader(MLP_train_dataset, batch_size=len(MLP_train_dataset))

MLP_val_dataset = MLP_dataset(val_features, val_labels)
MLP_val_dataloader = DataLoader(MLP_val_dataset, batch_size=256)

train(classifier, optimizer,
      MLP_train_dataloader, MLP_val_dataloader,
      train_loss_fn, val_loss_fn)

epoch 0: 100%|██████████| 1/1 [00:00<00:00, 11.51it/s, batch loss=3.8]
epoch 0: 100%|██████████| 5/5 [00:00<00:00, 355.67it/s]


0.14144103708118091


epoch 1: 100%|██████████| 1/1 [00:00<00:00, 20.57it/s, batch loss=3.41]
epoch 2: 100%|██████████| 1/1 [00:00<00:00, 26.56it/s, batch loss=3.08]
epoch 3: 100%|██████████| 1/1 [00:00<00:00, 28.50it/s, batch loss=2.74]
epoch 4: 100%|██████████| 1/1 [00:00<00:00, 29.06it/s, batch loss=2.41]
epoch 5: 100%|██████████| 1/1 [00:00<00:00, 29.52it/s, batch loss=2.09]
epoch 5: 100%|██████████| 5/5 [00:00<00:00, 452.64it/s]


0.4999729935277006


epoch 6: 100%|██████████| 1/1 [00:00<00:00, 29.47it/s, batch loss=1.81]
epoch 7: 100%|██████████| 1/1 [00:00<00:00, 27.78it/s, batch loss=1.57]
epoch 8: 100%|██████████| 1/1 [00:00<00:00, 29.45it/s, batch loss=1.37]
epoch 9: 100%|██████████| 1/1 [00:00<00:00, 29.79it/s, batch loss=1.2]
epoch 10: 100%|██████████| 1/1 [00:00<00:00, 29.31it/s, batch loss=1.06]
epoch 10: 100%|██████████| 5/5 [00:00<00:00, 490.49it/s]


0.7374544405497462


epoch 11: 100%|██████████| 1/1 [00:00<00:00, 29.46it/s, batch loss=0.96]
epoch 12: 100%|██████████| 1/1 [00:00<00:00, 29.62it/s, batch loss=0.881]
epoch 13: 100%|██████████| 1/1 [00:00<00:00, 29.88it/s, batch loss=0.818]
epoch 14: 100%|██████████| 1/1 [00:00<00:00, 29.74it/s, batch loss=0.763]
epoch 15: 100%|██████████| 1/1 [00:00<00:00, 29.78it/s, batch loss=0.718]
epoch 15: 100%|██████████| 5/5 [00:00<00:00, 484.20it/s]


0.7756435359611215


epoch 16: 100%|██████████| 1/1 [00:00<00:00, 42.73it/s, batch loss=0.681]
epoch 17: 100%|██████████| 1/1 [00:00<00:00, 43.48it/s, batch loss=0.651]
epoch 18: 100%|██████████| 1/1 [00:00<00:00, 42.33it/s, batch loss=0.625]
epoch 19: 100%|██████████| 1/1 [00:00<00:00, 41.53it/s, batch loss=0.6]
epoch 20: 100%|██████████| 1/1 [00:00<00:00, 42.17it/s, batch loss=0.576]
epoch 20: 100%|██████████| 5/5 [00:00<00:00, 516.83it/s]


0.7928821778830569


epoch 21: 100%|██████████| 1/1 [00:00<00:00, 45.97it/s, batch loss=0.554]
epoch 22: 100%|██████████| 1/1 [00:00<00:00, 43.71it/s, batch loss=0.534]
epoch 23: 100%|██████████| 1/1 [00:00<00:00, 45.19it/s, batch loss=0.517]
epoch 24: 100%|██████████| 1/1 [00:00<00:00, 43.49it/s, batch loss=0.501]
epoch 25: 100%|██████████| 1/1 [00:00<00:00, 46.03it/s, batch loss=0.485]
epoch 25: 100%|██████████| 5/5 [00:00<00:00, 552.54it/s]


0.8116656659595628


epoch 26: 100%|██████████| 1/1 [00:00<00:00, 44.35it/s, batch loss=0.47]
epoch 27: 100%|██████████| 1/1 [00:00<00:00, 40.81it/s, batch loss=0.454]
epoch 28: 100%|██████████| 1/1 [00:00<00:00, 42.66it/s, batch loss=0.44]
epoch 29: 100%|██████████| 1/1 [00:00<00:00, 43.49it/s, batch loss=0.426]
epoch 30: 100%|██████████| 1/1 [00:00<00:00, 42.64it/s, batch loss=0.413]
epoch 30: 100%|██████████| 5/5 [00:00<00:00, 555.10it/s]


0.8251139822617777


epoch 31: 100%|██████████| 1/1 [00:00<00:00, 41.44it/s, batch loss=0.401]
epoch 32: 100%|██████████| 1/1 [00:00<00:00, 43.48it/s, batch loss=0.389]
epoch 33: 100%|██████████| 1/1 [00:00<00:00, 43.92it/s, batch loss=0.377]
epoch 34: 100%|██████████| 1/1 [00:00<00:00, 42.19it/s, batch loss=0.367]
epoch 35: 100%|██████████| 1/1 [00:00<00:00, 44.89it/s, batch loss=0.356]
epoch 35: 100%|██████████| 5/5 [00:00<00:00, 620.53it/s]


0.826497140542408


epoch 36: 100%|██████████| 1/1 [00:00<00:00, 45.03it/s, batch loss=0.345]
epoch 37: 100%|██████████| 1/1 [00:00<00:00, 43.13it/s, batch loss=0.335]
epoch 38: 100%|██████████| 1/1 [00:00<00:00, 42.43it/s, batch loss=0.324]
epoch 39: 100%|██████████| 1/1 [00:00<00:00, 41.54it/s, batch loss=0.315]
epoch 40: 100%|██████████| 1/1 [00:00<00:00, 45.05it/s, batch loss=0.305]
epoch 40: 100%|██████████| 5/5 [00:00<00:00, 613.81it/s]


0.8247355978442672


epoch 41: 100%|██████████| 1/1 [00:00<00:00, 43.78it/s, batch loss=0.296]
epoch 42: 100%|██████████| 1/1 [00:00<00:00, 44.54it/s, batch loss=0.287]
epoch 43: 100%|██████████| 1/1 [00:00<00:00, 43.14it/s, batch loss=0.279]
epoch 44: 100%|██████████| 1/1 [00:00<00:00, 41.96it/s, batch loss=0.27]
epoch 45: 100%|██████████| 1/1 [00:00<00:00, 45.24it/s, batch loss=0.263]
epoch 45: 100%|██████████| 5/5 [00:00<00:00, 525.88it/s]


0.8233587290245713


epoch 46: 100%|██████████| 1/1 [00:00<00:00, 43.79it/s, batch loss=0.255]
epoch 47: 100%|██████████| 1/1 [00:00<00:00, 45.05it/s, batch loss=0.248]
epoch 48: 100%|██████████| 1/1 [00:00<00:00, 45.18it/s, batch loss=0.24]
epoch 49: 100%|██████████| 1/1 [00:00<00:00, 42.88it/s, batch loss=0.233]
epoch 50: 100%|██████████| 1/1 [00:00<00:00, 44.48it/s, batch loss=0.227]
epoch 50: 100%|██████████| 5/5 [00:00<00:00, 614.14it/s]

0.8243328745303161





In [24]:
class MLPTestDataset(Dataset):
    """Wraps precomputed feature vectors that have no labels."""

    def __init__(self, features):
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        feature = self.features[index]
        return {'features': feature}

In [25]:
# Wrap the extracted test features for batched inference
MLP_test_dataset = MLPTestDataset(test_features)
MLP_test_dataloader = DataLoader(MLP_test_dataset, batch_size=128)

# Predicted class indices for the test set (`epoch` only labels the progress bar)
predictions = predict(classifier, MLP_test_dataloader, epoch=0)

epoch 0: 100%|██████████| 30/30 [00:00<00:00, 1055.92it/s]


In [26]:
from copy import copy
# Inverse of classes_dict: encoded label -> class name
decoder = dict(zip(classes_dict.values(), classes_dict.keys()))

# Copy of the test listing with the predicted indices, then decoded to names
submission = df_test.assign(label=predictions)
submission['label'] = submission['label'].map(decoder)
submission.to_csv('submission.csv', index=None)