In [1]:
import os
import time
import random
import warnings
import typing as tp
import pathlib
from contextlib import contextmanager

import cv2
import librosa

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import IPython.display as ipd

import torch.nn as nn
import torch.optim as optim

from torchvision import datasets, models, transforms
from torch.optim.lr_scheduler import ReduceLROnPlateau


In [2]:
import python_speech_features as psf
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
# Data root (relative to the notebook) and the audio sub-folder that get_image_path resolves files in.
data_path = pathlib.Path("../data")
audios_path = data_path.joinpath("all_audio_resampled")

In [4]:
# Main training table plus the two additional tables.
train = pd.read_csv(data_path.joinpath('Train.csv'))
train_extra = pd.read_csv(data_path.joinpath('train_add.csv'))
train_extra_2 = pd.read_csv(data_path.joinpath('train_add_20201029.csv'))

# Integer codes follow the order in which labels first appear in Train.csv.
code2label = dict(enumerate(train.label.unique().tolist()))
label2code = {word: idx for idx, word in code2label.items()}

In [5]:
def get_image_path(audio_path):
    """Map an audio path from the CSV files to the matching .wav file under `audios_path`.

    Only the last '/'-separated component is used, cut at its first '.'; the result is
    the absolute path (as a string) of `<that name>.wav` inside `audios_path`.
    """
    base_name = audio_path.rsplit("/", 1)[-1]
    stem = base_name.partition(".")[0]
    return str(audios_path.resolve() / f"{stem}.wav")

# Add the resolved .wav path to each of the three tables.
for frame in (train, train_extra, train_extra_2):
    frame["image_fn"] = frame.fn.apply(get_image_path)

In [6]:
# Stack the three tables row-wise under a fresh 0..n-1 index.
train_df = pd.concat((train, train_extra, train_extra_2), axis=0, ignore_index=True)

In [7]:
### Build a validation split that includes all classes.


# Number of rows available for each label.
vcs = train_df.label.value_counts()

## Per-label validation quota (implemented by num_for_val below):
# 25 or more examples -> take 3
# 12 to 24 examples   -> take 2
# fewer than 12       -> take 1

def num_for_val(num_examples):
    """Return how many rows of a label go to one validation chunk, given the label's row count.

    25 or more rows -> 3, 12 to 24 rows -> 2, anything smaller -> 1.
    """
    for threshold, quota in ((25, 3), (12, 2)):
        if num_examples >= threshold:
            return quota
    return 1

# Attach, to every row, the size of its class and the per-chunk validation quota.
train_df["num_examples"] = train_df.label.map(vcs.to_dict())
train_df["num_for_val"] = train_df.num_examples.apply(num_for_val)

# Shuffle the rows of every label reproducibly and number them 0..n-1 ("rank").
# The per-label frames are collected in a list and concatenated once: growing a frame with
# pd.concat inside the loop is quadratic, and concatenating onto an empty DataFrame relies
# on dtype handling that recent pandas versions deprecate.
random.seed(12)
per_label_frames = []
for label in train_df.label.unique():
    tmp = train_df.loc[train_df.label == label].copy()
    # One random draw per row, in row order, so the shuffle only depends on the seed above.
    tmp["dummy"] = tmp.label.apply(lambda _: random.random())
    tmp = tmp.sort_values(by="dummy")
    tmp["rank"] = range(tmp.shape[0])
    per_label_frames.append(tmp)

train_df_new = pd.concat(per_label_frames).reset_index(drop=True)
# The first `num_for_val` shuffled rows of each label form the validation subset.
train_df_new["val_subset"] = train_df_new.num_for_val > train_df_new["rank"]
train_df_new = train_df_new.drop("dummy", axis=1)

In [8]:
# Every label must show up on both sides of the val_subset split.
labels_in_val = set(train_df_new.loc[train_df_new.val_subset].label.unique())
labels_in_train = set(train_df_new.loc[~train_df_new.val_subset].label.unique())
assert labels_in_val == labels_in_train

In [9]:
# Consecutive groups of `num_for_val` ranked rows of a label share one fold id (0, 1, 2, ...).
train_df_new["val_fold"] = train_df_new["rank"].floordiv(train_df_new["num_for_val"])

In [10]:
# Number of rows assigned to each of the first ten fold ids.
for f in range(10):
    rows_in_fold = (train_df_new.val_fold == f).sum()
    print(f, rows_in_fold)

0 451
1 451
2 451
3 430
4 413
5 410
6 407
7 391
8 381
9 337


In [11]:
N_FOLDS = 3

# Fold i validates on the rows whose val_fold equals i and trains on every other row
# (including rows whose val_fold is N_FOLDS or larger).
fold_ids = range(N_FOLDS)
val_folds = [train_df_new.loc[train_df_new.val_fold == fold_id].copy() for fold_id in fold_ids]
train_folds = [train_df_new.loc[train_df_new.val_fold != fold_id].copy() for fold_id in fold_ids]

In [12]:
# Sample rate handed to librosa when the dataset classes load an audio file.
SR = 22050

In [13]:
from data_processing import new_generate_spec, new_build_image, normalize, MEAN, STD
from transforms import *

In [14]:
class AudioConfig:
    """Settings object given to the dataset classes, which pass it on to `new_build_image`."""
    # NOTE(review): apart from `sr`, these fields are only consumed inside `new_build_image`
    # (imported from data_processing), so the meanings below are assumptions — confirm there.
    n_fft = 512  # presumably the FFT window length in samples
    hop_size = 32  # presumably the hop between successive frames, in samples
    pad_center = True  # presumably whether frames are centre-padded
    trim = True  # presumably whether silence is trimmed from the clip
    max_len_sec = 2.6  # presumably the maximum clip length in seconds
    sr = 22050  # sample rate; also passed to PitchShift. Same value as the module-level SR.
    img_size = 299  # presumably the side length of the produced image
    
conf = AudioConfig()

In [15]:
# The two waveform augmentations available during training.
aug_noise = AddNoise(0, 0.06)
aug_pitch = PitchShift((-7, 7), sr=conf.sr)

# With probability 0.45 a sample goes through OneOf over the two augmentations.
one_augmentation = OneOf([aug_noise, aug_pitch])
train_transforms = UseWithProb(one_augmentation, prob=0.45)

In [16]:
class SpectrogramDataset(data.Dataset):
    """Dataset that turns labelled audio files into channel-first images.

    Every entry of `file_list` is a `[file_path, label]` pair. An item is produced by
    loading the file with librosa at the module-level sample rate `SR`, applying the
    optional `transform` to the waveform, rendering it with `new_build_image(audio, config)`
    and, when `normalize` is true, normalising it with `MEAN` / `STD`.

    `__getitem__` returns `(image, label2code[label])`.
    """

    def __init__(
        self,
        file_list: tp.List[tp.List[str]],
        config,
        transform=None,
        normalize=True
    ):
        self.file_list = file_list  # list of list: [file_path, label]
        self.transform = transform
        self.normalize = normalize
        self.config = config

    def __len__(self):
        return len(self.file_list)

    def _load_image(self, idx: int):
        """Build the image for entry `idx`, with axis 2 moved to the front."""
        fn, _ = self.file_list[idx]
        audio, _ = librosa.load(fn, sr=SR)

        if self.transform:
            audio = self.transform(audio)

        image = new_build_image(audio, self.config)

        if self.normalize:
            norm_image = normalize(np.array(image), mean=MEAN, std=STD)
        else:
            # Make sure np.moveaxis receives an ndarray even when normalisation is skipped
            # (the normalised path already converts with np.array).
            norm_image = np.asarray(image)

        return np.moveaxis(norm_image, 2, 0)

    def __getitem__(self, idx: int):
        image = self._load_image(idx)
        return image, label2code[self.file_list[idx][1]]


class SpectrogramTestDataset(SpectrogramDataset):
    """Unlabelled variant of `SpectrogramDataset`.

    Takes the same constructor arguments (the second element of every `file_list` entry is
    ignored) and `__getitem__` returns only the image.
    """

    def __getitem__(self, idx: int):
        return self._load_image(idx)

In [17]:
# One dataset per fold: training data goes through the waveform augmentations, validation does not.
sdf_train_list = []
for t in train_folds:
    train_records = t[["image_fn", "label"]].values.tolist()
    sdf_train_list.append(
        SpectrogramDataset(train_records, conf, transform=train_transforms, normalize=True)
    )

sdf_val_list = []
for v in val_folds:
    val_records = v[["image_fn", "label"]].values.tolist()
    sdf_val_list.append(SpectrogramDataset(val_records, conf, normalize=True))

# Sizes of the training datasets, a separator, then sizes of the validation datasets.
for s in sdf_train_list:
    print(len(s))

print("=========================")

for s in sdf_val_list:
    print(len(s))

4258
4258
4258
451
451
451


In [18]:
def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Blend the loss against both mixup targets: `lam` weights `y_a`, `1 - lam` weights `y_b`."""
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

In [19]:
def mixup_data(x, y, alpha=1.0, use_cuda=True):
    '''Returns mixed inputs, pairs of targets, and lambda.

    Args:
        x: batch tensor; samples are mixed along dimension 0.
        y: targets, indexed along dimension 0 with the same permutation as `x`.
        alpha: parameter of the Beta(alpha, alpha) distribution `lam` is drawn from.
            With `alpha <= 0` no mixing happens (`lam` is 1).
        use_cuda: kept for backward compatibility and ignored. The permutation is always
            placed on the device of `x` (and of `y`), so the function works on CPU and on
            any GPU instead of unconditionally calling `.cuda()`.

    Returns:
        `(mixed_x, y_a, y_b, lam)` where `mixed_x = lam * x + (1 - lam) * x[perm]`,
        `y_a` is `y`, `y_b` is `y[perm]` and `lam` is a Python number.
    '''
    if alpha > 0:
        lam = float(np.random.beta(alpha, alpha))
    else:
        lam = 1

    batch_size = x.size(0)
    # Drawn on the CPU (same random stream as before) and then moved next to the data.
    index = torch.randperm(batch_size)

    mixed_x = lam * x + (1 - lam) * x[index.to(x.device)]
    y_a, y_b = y, y[index.to(y.device)]
    return mixed_x, y_a, y_b, lam


def train_mixup_epoch(log_interval, mixup_prob, model, device, criterion, train_loader, optimizer, epoch):
    """Train `model` for one pass over `train_loader`, applying mixup to a random share of batches.

    Args:
        log_interval: a progress line is printed for every batch whose index is a multiple
            of this value (batch 0 always qualifies).
        mixup_prob: probability, per batch, of training on a mixup of the batch.
        model: network to optimise; put into train mode.
        device: device the batches are moved to.
        criterion: loss taking `(output, target)`.
        train_loader: iterable of `(inputs, target)` batches.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in the progress line.

    Returns:
        The loss of the last batch as a float (not an epoch average), or NaN when the
        loader yields no batches.
    """
    model.train()
    loss = None
    for batch_idx, (inputs, target) in enumerate(train_loader):
        use_mixup = random.random() < mixup_prob
        inputs, target = inputs.type(torch.FloatTensor).to(device), target.to(device)

        if use_mixup:
            # Follow the device the batch actually lives on instead of assuming CUDA.
            inputs, y_a, y_b, lam = mixup_data(inputs, target, use_cuda=inputs.is_cuda)
        optimizer.zero_grad()
        output = model(inputs)
        if use_mixup:
            loss = mixup_criterion(criterion, output, y_a, y_b, lam)
        else:
            loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(inputs), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
    if loss is None:
        # Nothing was trained on; report that explicitly rather than failing on an unbound name.
        return float("nan")
    return loss.item()

            
def test(model, device, criterion, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.type(torch.FloatTensor).to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    
    accuracy = 100. * correct / len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        accuracy))

    return test_loss, accuracy

In [20]:
def set_seed(seed: int = 42):
    """Seed Python's `random`, NumPy and PyTorch (CPU and CUDA) and set PYTHONHASHSEED."""
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)


def train_on_fold(
    model,
    train_loader,
    val_loader,
    fold,
    criterion,
    optimizer,
    lr_scheduler,
    results_folder,
    epochs,
    device=None
):
    """Train `model` on one fold and return it with the best-validation-loss weights loaded.

    Training runs in two phases driven by the same loop: first a mixup phase
    (mixup probability 0.667), then 15 epochs without mixup. After every epoch the model
    is evaluated on `val_loader`; whenever the validation loss improves, the weights are
    written to `tmp/{results_folder}/best_run_{fold}.pth`, and the scheduler is stepped
    with the validation loss.

    Args:
        model: network to train (already on the target device).
        train_loader / val_loader: loaders for the fold.
        fold: fold number; selects the random seed (9, 99, 999 for folds 0-2, continuing
            the same pattern for higher folds) and names the checkpoint file.
        criterion, optimizer, lr_scheduler: training objects; the scheduler must accept
            `step(val_loss)`.
        results_folder: sub-folder of `tmp/` (must exist) for the checkpoints.
        epochs: number of mixup epochs; `None` keeps the default of 65.
        device: device for the batches; defaults to the device of the model's parameters.

    Returns:
        `model`, with the best checkpoint of this run loaded. If no epoch produced a
        usable (non-NaN) validation loss, a warning is issued and the model is returned
        with its final weights.
    """
    seed_dict = {
        0: 9,
        1: 99,
        2: 999
    }
    set_seed(seed_dict.get(fold, 10 ** (fold + 1) - 1))

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    mixup_epochs = 65 if epochs is None else int(epochs)
    # (number of epochs, mixup probability) for each training phase.
    phases = [(mixup_epochs, 0.667), (15, 0.0)]
    best_path = f"tmp/{results_folder}/best_run_{fold}.pth"

    t0 = time.time()

    best_loss = float("inf")
    best_acc = 0
    checkpoint_saved = False

    save_each_epoch = False

    # `ep` keeps counting across phases so per-epoch checkpoints of the second phase do
    # not overwrite those of the first.
    ep = 0
    for n_epochs, mixup_prob in phases:
        for _ in range(n_epochs):
            train_mixup_epoch(1e10, mixup_prob, model, device, criterion, train_loader, optimizer, ep)
            cur_loss, cur_acc = test(model, device, criterion, val_loader)

            if save_each_epoch:
                torch.save(model.state_dict(), f"tmp/{results_folder}/model_ep_{fold}_{ep}.pth")

            if cur_loss < best_loss:
                torch.save(model.state_dict(), best_path)
                best_loss = cur_loss
                best_acc = cur_acc
                checkpoint_saved = True

            lr_scheduler.step(cur_loss)
            print("Training so far {} minutes".format((time.time() - t0) / 60))
            print("="*20)
            ep += 1

    print("time spent training: {} minutes".format((time.time() - t0) / 60))
    print("BEST LOSS:", best_loss)
    print("BEST ACC:", best_acc)

    if checkpoint_saved:
        model.load_state_dict(torch.load(best_path, map_location=device))
    else:
        # Do not silently load a stale file left behind by an earlier run.
        warnings.warn(
            f"no checkpoint was written for fold {fold}; returning the model with its final weights"
        )

    return model


def predict_on_val(
    model,
    val_loader,
    device="cuda"
):
    """Return the predicted class index for every sample of a labelled loader.

    Args:
        model: network producing one row of class scores per sample.
        val_loader: iterable of `(inputs, target)` batches; the targets are ignored.
        device: device the inputs are moved to.

    Returns:
        1-D numpy array with the arg-max class of each sample, in loader order.

    The model is switched to eval mode for the duration of the call and its previous
    mode is restored afterwards.
    """
    sfm = nn.Softmax(dim=1)  # explicit class dimension (the implicit choice is deprecated)
    predictions = list()
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, _ in val_loader:
                inputs = inputs.type(torch.FloatTensor).to(device)
                # Move each batch to host memory right away instead of piling up device tensors.
                predictions.append(sfm(model(inputs)).cpu().numpy())
    finally:
        model.train(was_training)

    predictions = np.concatenate(predictions)
    preds = np.argmax(predictions, axis=1)
    return preds


def get_val_outputs(model, test_loader, device="cuda"):
    """Return the raw model outputs (no softmax) for every sample of a labelled loader.

    Args:
        model: network to run.
        test_loader: iterable of `(inputs, target)` batches; the targets are ignored.
        device: device the inputs are moved to.

    Returns:
        numpy array with the batch outputs concatenated along axis 0, in loader order.

    The model is evaluated in eval mode (so layers such as dropout are inactive) and its
    previous mode is restored afterwards.
    """
    outputs_list = list()
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, _ in test_loader:
                inputs = inputs.type(torch.FloatTensor).to(device)
                # Move each batch to host memory right away instead of piling up device tensors.
                outputs_list.append(model(inputs).cpu().numpy())
    finally:
        model.train(was_training)
    outputs_list = np.concatenate(outputs_list)
    return outputs_list

def get_predictions(model, test_loader, device="cuda"):
    """Return softmax class probabilities for every sample of an unlabelled loader.

    Args:
        model: network producing one row of class scores per sample.
        test_loader: iterable yielding input batches only (no targets).
        device: device the inputs are moved to.

    Returns:
        2-D numpy array of probabilities (each row sums to 1), in loader order.

    The model is evaluated in eval mode and its previous mode is restored afterwards.
    """
    sfm = nn.Softmax(dim=1)  # explicit class dimension (the implicit choice is deprecated)
    predictions = list()
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs in test_loader:
                inputs = inputs.type(torch.FloatTensor).to(device)
                # Move each batch to host memory right away instead of piling up device tensors.
                predictions.append(sfm(model(inputs)).cpu().numpy())
    finally:
        model.train(was_training)
    predictions = np.concatenate(predictions)
    return predictions

In [21]:
# model_ft.load_state_dict(torch.load(f"tmp/{tmp_folder_name}/best_run_{FOLD}.pth"))

In [22]:
def _per_sample_cross_entropy(logits, targets):
    """Return the cross-entropy of every sample as a 1-D float32 numpy array.

    `logits` holds one row of raw model outputs per sample and `targets` the matching
    integer class codes. Computed in one vectorised call instead of one tensor per row.
    """
    with torch.no_grad():
        losses = F.cross_entropy(
            torch.as_tensor(np.asarray(logits), dtype=torch.float32),
            torch.as_tensor(np.asarray(targets), dtype=torch.long),
            reduction="none",
        )
    return losses.numpy()


n_classes = len(label2code)
device = torch.device("cuda")

results_folder = "resnext_aug_mixup_repr_folds"
# Also creates tmp/ when missing and lets the cell be re-run.
os.makedirs(f"tmp/{results_folder}", exist_ok=True)
BS = 32

# The submission template and its loader are the same for every fold, so build them once
# (this also fails fast, before any training, if the template is missing).
sample_subm = pd.read_csv("Submission1.csv")
sample_subm["image_fn"] = sample_subm.fn.apply(get_image_path)
subm_dataset = SpectrogramTestDataset([[path, None] for path in sample_subm.image_fn.values], conf)
subm_loader = torch.utils.data.DataLoader(subm_dataset, batch_size=16)

for FOLD in range(N_FOLDS):
    print("PROCESSING FOLD", FOLD)

    # Fresh ImageNet-pretrained backbone with a new classification head for every fold.
    model_ft = models.resnext50_32x4d(pretrained=True)
    model_ft.fc = nn.Sequential(
        nn.Linear(model_ft.fc.in_features, n_classes)
    )
    model_ft = model_ft.to(device)

    # Sum reduction: test() divides the accumulated loss by the dataset size.
    criterion = nn.CrossEntropyLoss(reduction="sum")
    optimizer_ft = optim.Adam(model_ft.parameters(), lr=3e-4)
    lr_scheduler = ReduceLROnPlateau(optimizer_ft, 'min', patience=4, factor=0.5, verbose=True, min_lr=1.5e-5)

    train_loader = torch.utils.data.DataLoader(sdf_train_list[FOLD], batch_size=BS, shuffle=True,
                                               num_workers=6, drop_last=False, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(sdf_val_list[FOLD], batch_size=BS, drop_last=False, pin_memory=True,
                                             num_workers=6)

    # full train cycle with one fold for cross-val:
    model_ft = train_on_fold(
        model_ft,
        train_loader,
        val_loader,
        FOLD,
        criterion,
        optimizer_ft,
        lr_scheduler,
        results_folder,
        None
    )

    # predict from one model:
    preds = predict_on_val(
        model_ft,
        val_loader,
        device=device
    )
    val_folds[FOLD]["preds"] = [code2label[c] for c in preds]

    # Save val predictions from one model, together with the loss of every sample:
    val_out = get_val_outputs(model_ft, val_loader, device=device)
    gt = val_folds[FOLD].label.map(label2code).values
    val_folds[FOLD]["loss"] = _per_sample_cross_entropy(val_out, gt)
    val_folds[FOLD].reset_index(drop=True).sort_values(by="loss", ascending=True).to_csv(
        f"tmp/{results_folder}/val_loss_{FOLD}.csv", index=False
    )

    ## for debug, save predictions for train as well:
    ### debug train audios:
    # Use an un-augmented view of the training rows: the training dataset applies random
    # noise / pitch augmentation, which would make these per-sample losses noisy and
    # different on every run.
    train_debug_dataset = SpectrogramDataset(
        train_folds[FOLD][["image_fn", "label"]].values.tolist(), conf, normalize=True
    )
    train_debug_loader = torch.utils.data.DataLoader(train_debug_dataset, batch_size=BS, shuffle=False,
                                                     num_workers=6, drop_last=False, pin_memory=True)
    train_out = get_val_outputs(model_ft, train_debug_loader, device=device)
    gt = train_folds[FOLD].label.map(label2code).values

    train_folds[FOLD]["loss"] = _per_sample_cross_entropy(train_out, gt)
    train_folds[FOLD]["preds"] = [code2label[c] for c in np.argmax(train_out, axis=1)]
    train_folds[FOLD].reset_index(drop=True).sort_values(by="loss", ascending=False).to_csv(
        f"tmp/{results_folder}/train_loss_{FOLD}.csv", index=False
    )

    ## MAKE SUBMISSION (from one model):
    preds = get_predictions(model_ft, subm_loader, device=device)

    fold_subm = sample_subm.drop("image_fn", axis=1).copy()
    for c in fold_subm.columns:
        if c == "fn":
            continue
        # Every remaining column is a label name; fill it with that class's probability.
        c_idx = label2code[c]
        fold_subm[c] = preds[:, c_idx]

    fold_subm.to_csv(f'tmp/{results_folder}/subm_{FOLD}.csv', index=False)

PROCESSING FOLD 0

Test set: Average loss: 5.1759, Accuracy: 9/451 (2%)

Training so far 1.5153255303700766 minutes

Test set: Average loss: 4.1223, Accuracy: 39/451 (9%)

Training so far 3.0794251402219137 minutes

Test set: Average loss: 3.0141, Accuracy: 109/451 (24%)

Training so far 4.640584393342336 minutes

Test set: Average loss: 2.4041, Accuracy: 187/451 (41%)

Training so far 6.194824632008871 minutes

Test set: Average loss: 2.1588, Accuracy: 209/451 (46%)

Training so far 7.736872899532318 minutes

Test set: Average loss: 1.7141, Accuracy: 276/451 (61%)

Training so far 9.285128859678904 minutes

Test set: Average loss: 1.4866, Accuracy: 291/451 (65%)

Training so far 10.829892679055531 minutes

Test set: Average loss: 1.4261, Accuracy: 302/451 (67%)

Training so far 12.382770935694376 minutes

Test set: Average loss: 1.2637, Accuracy: 307/451 (68%)

Training so far 13.93421666622162 minutes

Test set: Average loss: 1.1649, Accuracy: 326/451 (72%)

Training so far 15.485812


Test set: Average loss: 0.5470, Accuracy: 403/451 (89%)

Training so far 77.31701451539993 minutes

Test set: Average loss: 0.5564, Accuracy: 400/451 (89%)

Training so far 78.87322177092234 minutes

Test set: Average loss: 0.5513, Accuracy: 402/451 (89%)

Training so far 80.42570188442866 minutes

Test set: Average loss: 0.5759, Accuracy: 395/451 (88%)

Training so far 81.97701182762782 minutes

Test set: Average loss: 0.5757, Accuracy: 394/451 (87%)

Training so far 83.52642560799917 minutes

Test set: Average loss: 0.5647, Accuracy: 392/451 (87%)

Epoch    55: reducing learning rate of group 0 to 3.7500e-05.
Training so far 85.07850575049719 minutes

Test set: Average loss: 0.5532, Accuracy: 398/451 (88%)

Training so far 86.62722442150115 minutes

Test set: Average loss: 0.5393, Accuracy: 400/451 (89%)

Training so far 88.17887292702993 minutes

Test set: Average loss: 0.5488, Accuracy: 399/451 (88%)

Training so far 89.72539122899373 minutes

Test set: Average loss: 0.5443, Accur



PROCESSING FOLD 1

Test set: Average loss: 4.5344, Accuracy: 25/451 (6%)

Training so far 1.5400746703147887 minutes

Test set: Average loss: 3.7370, Accuracy: 54/451 (12%)

Training so far 3.082967495918274 minutes

Test set: Average loss: 2.7583, Accuracy: 150/451 (33%)

Training so far 4.63525630235672 minutes

Test set: Average loss: 2.2827, Accuracy: 199/451 (44%)

Training so far 6.192296330134074 minutes

Test set: Average loss: 2.0577, Accuracy: 220/451 (49%)

Training so far 7.736361793677012 minutes

Test set: Average loss: 1.6227, Accuracy: 284/451 (63%)

Training so far 9.288157935937246 minutes

Test set: Average loss: 1.4419, Accuracy: 299/451 (66%)

Training so far 10.8269167582194 minutes

Test set: Average loss: 1.3076, Accuracy: 320/451 (71%)

Training so far 12.379441356658935 minutes

Test set: Average loss: 1.1965, Accuracy: 329/451 (73%)

Training so far 13.920428649584453 minutes

Test set: Average loss: 1.1393, Accuracy: 329/451 (73%)

Training so far 15.4762066


Test set: Average loss: 0.6624, Accuracy: 396/451 (88%)

Training so far 77.29701682329178 minutes

Test set: Average loss: 0.6607, Accuracy: 389/451 (86%)

Training so far 78.84668300946554 minutes

Test set: Average loss: 0.6909, Accuracy: 392/451 (87%)

Training so far 80.38284516334534 minutes

Test set: Average loss: 0.6849, Accuracy: 394/451 (87%)

Training so far 81.9317910472552 minutes

Test set: Average loss: 0.7016, Accuracy: 393/451 (87%)

Training so far 83.47660151720046 minutes

Test set: Average loss: 0.7016, Accuracy: 394/451 (87%)

Training so far 85.01809465487798 minutes

Test set: Average loss: 0.6794, Accuracy: 394/451 (87%)

Epoch    56: reducing learning rate of group 0 to 3.7500e-05.
Training so far 86.56346210241318 minutes

Test set: Average loss: 0.6466, Accuracy: 394/451 (87%)

Training so far 88.10194772084554 minutes

Test set: Average loss: 0.6630, Accuracy: 396/451 (88%)

Training so far 89.6516918738683 minutes

Test set: Average loss: 0.6575, Accurac


Test set: Average loss: 1.0865, Accuracy: 358/451 (79%)

Training so far 27.807952133814492 minutes

Test set: Average loss: 0.9718, Accuracy: 370/451 (82%)

Training so far 29.350396720568337 minutes

Test set: Average loss: 1.0613, Accuracy: 354/451 (78%)

Training so far 30.901277140776315 minutes

Test set: Average loss: 0.9594, Accuracy: 368/451 (82%)

Training so far 32.45164190928141 minutes

Test set: Average loss: 1.0916, Accuracy: 364/451 (81%)

Training so far 34.00664301713308 minutes

Test set: Average loss: 0.9610, Accuracy: 371/451 (82%)

Training so far 35.536835730075836 minutes

Test set: Average loss: 1.0178, Accuracy: 354/451 (78%)

Training so far 37.07965727647146 minutes

Test set: Average loss: 0.9615, Accuracy: 366/451 (81%)

Training so far 38.61759218374888 minutes

Test set: Average loss: 0.8939, Accuracy: 371/451 (82%)

Training so far 40.17730489571889 minutes

Test set: Average loss: 0.9437, Accuracy: 364/451 (81%)

Training so far 41.73939150571823 minu


Test set: Average loss: 0.7449, Accuracy: 389/451 (86%)

Training so far 102.07149009307226 minutes

Test set: Average loss: 0.7619, Accuracy: 390/451 (86%)

Training so far 103.61487487951915 minutes

Test set: Average loss: 0.7679, Accuracy: 387/451 (86%)

Training so far 105.15392274856568 minutes

Test set: Average loss: 0.7538, Accuracy: 391/451 (87%)

Training so far 106.70159174601237 minutes

Test set: Average loss: 0.7581, Accuracy: 391/451 (87%)

Training so far 108.25562470356623 minutes

Test set: Average loss: 0.7065, Accuracy: 392/451 (87%)

Training so far 109.80693054199219 minutes

Test set: Average loss: 0.7894, Accuracy: 390/451 (86%)

Training so far 111.34856045246124 minutes

Test set: Average loss: 0.7868, Accuracy: 389/451 (86%)

Training so far 112.88322848081589 minutes

Test set: Average loss: 0.8113, Accuracy: 387/451 (86%)

Training so far 114.42320979038874 minutes

Test set: Average loss: 0.8108, Accuracy: 391/451 (87%)

Training so far 115.9616791764895

In [26]:
# Accuracies in percent (presumably the per-fold "BEST ACC" values — TODO confirm), shown as fractions.
# A plain loop is used so the cell does not also display a list of None values.
for accuracy_pct in [88.24833702882484, 87.58314855875831, 86.9179600886918]:
    print(accuracy_pct / 100)

0.8824833702882483
0.8758314855875831
0.869179600886918


[None, None, None]

In [23]:
## MAKE ONE SUBMISSION FROM ALL SUBS:
import pandas as pd
import numpy as np

# Folder holding the per-fold submission files.
s = f'tmp/{results_folder}/'
preds_to_average = [
    f"subm_{f}.csv"
    for f in range(N_FOLDS)
]

# Start from the first fold's predictions and add the others column by column.
source_pred = pd.read_csv(s + preds_to_average[0])
pred_cols = source_pred.drop("fn", axis=1).columns.values
for file in preds_to_average[1:]:
    tmp = pd.read_csv(s + file)
    # The frames are added row by row, so they must list the same files in the same order.
    if not tmp["fn"].equals(source_pred["fn"]):
        raise ValueError(f"{file} does not list the same 'fn' rows as {preds_to_average[0]}")
    source_pred[pred_cols] += tmp[pred_cols]

# Turn the sum into the mean over folds.
source_pred[pred_cols] /= len(preds_to_average)

In [24]:
# Averaged probabilities of every row should still add up to 1.
row_sums = source_pred[pred_cols].sum(axis=1).values
np.all(np.isclose(row_sums, np.ones(source_pred.shape[0])))

True

In [25]:
# Write the fold-averaged submission next to the per-fold files.
merged_path = f'tmp/{results_folder}/{results_folder}_merged.csv'
source_pred.to_csv(merged_path, index=False)