In [1]:
import os
import time
import random
import warnings
import typing as tp
import pathlib
from contextlib import contextmanager

import cv2
import librosa

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import IPython.display as ipd

import torch.nn as nn
import torch.optim as optim

from torchvision import datasets, models, transforms
from torch.optim.lr_scheduler import ReduceLROnPlateau


In [2]:
import python_speech_features as psf
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
# Root of the data directory, relative to the notebook's working directory.
data_path = pathlib.Path("..") / "data"
# Directory the .wav clips are resolved against.
audios_path = data_path.joinpath("all_audio_resampled")

In [4]:
# Main training table followed by the two supplementary tables.
train, train_extra, train_extra_2 = (
    pd.read_csv(data_path / csv_name)
    for csv_name in ("Train.csv", "train_add.csv", "train_add_20201029.csv")
)

# Class ids follow the first-appearance order of labels in `train` only.
class_names = train.label.unique().tolist()
label2code = dict(zip(class_names, range(len(class_names))))
code2label = dict(enumerate(class_names))

In [5]:
def get_image_path(audio_path):
    """Map a `fn` entry to the absolute path of its .wav file under `audios_path`.

    Only the last "/"-separated component is used, cut at its first ".".
    """
    base_name = audio_path.rsplit("/", 1)[-1]
    stem = base_name.partition(".")[0]
    return str(audios_path.resolve() / f"{stem}.wav")

# Add the resolved audio path to each of the three training tables.
for source_frame in (train, train_extra, train_extra_2):
    source_frame["image_fn"] = source_frame.fn.apply(get_image_path)

In [6]:
# Stack the three tables row-wise under a fresh 0..n-1 index.
train_df = pd.concat((train, train_extra, train_extra_2), axis=0, ignore_index=True)

In [7]:
### build validation that includes all classes:


# Number of examples available for each label.
vcs = train_df["label"].value_counts()

## validation quota per label:
# 25 or more examples -> take 3
# 12 to 24 examples   -> take 2
# fewer than 12       -> take 1

def num_for_val(num_examples):
    """Return how many examples of a label are reserved for validation.

    The quota depends on the label's total example count: 3 for 25 or more,
    2 for 12 or more, otherwise 1.
    """
    for threshold, quota in ((25, 3), (12, 2)):
        if num_examples >= threshold:
            return quota
    return 1

# Attach to every row its label's total count and that label's validation quota.
train_df["num_examples"] = train_df.label.map(vcs.to_dict())
train_df["num_for_val"] = train_df.num_examples.apply(num_for_val)

# Shuffle the rows of each label reproducibly and record each row's position
# ("rank") inside its label group. The groups are collected in a list and
# concatenated once: growing a frame with pd.concat inside the loop is
# quadratic, and starting from an empty DataFrame relies on concat's
# deprecated handling of empty entries.
random.seed(12)
shuffled_groups = []
for label in train_df.label.unique():
    tmp = train_df.loc[train_df.label == label].copy()
    # One random sort key per row; drawn in the same order as before so the
    # resulting split is unchanged.
    tmp["dummy"] = tmp.label.apply(lambda _: random.random())
    tmp = tmp.sort_values(by="dummy")
    tmp["rank"] = range(tmp.shape[0])
    shuffled_groups.append(tmp)

train_df_new = pd.concat(shuffled_groups).reset_index(drop=True)
# The first `num_for_val` shuffled rows of each label form the validation subset.
train_df_new["val_subset"] = train_df_new.num_for_val > train_df_new["rank"]
train_df_new = train_df_new.drop("dummy", axis=1)

In [8]:
# Every label must appear on both sides of the validation split.
labels_in_val = set(train_df_new.loc[train_df_new.val_subset, "label"].unique())
labels_in_train = set(train_df_new.loc[~train_df_new.val_subset, "label"].unique())
assert labels_in_val == labels_in_train

In [9]:
# Fold id = within-label rank divided (floor) by the label's validation quota.
train_df_new["val_fold"] = train_df_new["rank"].floordiv(train_df_new["num_for_val"])

In [10]:
# Row count of the first ten fold ids.
for fold_id in range(10):
    fold_size = (train_df_new["val_fold"] == fold_id).sum()
    print(fold_id, fold_size)

0 451
1 451
2 451
3 430
4 413
5 410
6 407
7 391
8 381
9 337


In [11]:
N_FOLDS = 3

# For fold i: validation = rows whose val_fold is i, training = all other rows.
val_folds = [
    train_df_new.loc[train_df_new.val_fold == fold_id].copy()
    for fold_id in range(N_FOLDS)
]
train_folds = [
    train_df_new.loc[train_df_new.val_fold != fold_id].copy()
    for fold_id in range(N_FOLDS)
]

In [12]:
# Sample rate passed to librosa when the datasets load audio.
SR = 22_050

In [13]:
from data_processing import new_generate_spec, new_build_image, normalize, MEAN, STD
from transforms import *

In [14]:
class AudioConfig:
    """Settings bundle handed to the datasets, which pass it on to `new_build_image`.

    How each field is interpreted is decided in `data_processing`, which is not
    visible in this notebook.
    """

    # audio settings
    sr = 22050
    max_len_sec = 2.6
    trim = True

    # spectrogram settings
    n_fft = 512
    hop_size = 32
    pad_center = True

    # output size; presumably the image side in pixels (confirm in data_processing)
    img_size = 299


conf = AudioConfig()

In [15]:
# Waveform augmentations; the classes come from the star-import of `transforms`.
aug_noise = AddNoise(0, 0.07)
# aug_ts = TimeStretch((0.5, 2))
aug_pitch = PitchShift((-5, 5), sr=conf.sr)

# The pair is wrapped in OneOf, which is in turn wrapped in UseWithProb(prob=0.5).
one_of_augs = OneOf([aug_noise, aug_pitch])
train_transforms = UseWithProb(one_of_augs, prob=0.5)

In [16]:
class _SpectrogramBase(data.Dataset):
    """Shared pipeline of both datasets: wav file -> waveform transform -> image.

    The two public datasets previously carried identical copies of this code;
    they now differ only in what `__getitem__` returns.
    """

    def __init__(
        self,
        file_list: tp.List[tp.List[str]],
        config,
        transform=None,
        normalize=True
    ):
        self.file_list = file_list  # list of list: [file_path, label]
        self.transform = transform  # optional callable applied to the raw waveform
        self.normalize = normalize  # whether to apply normalize(..., MEAN, STD)
        self.config = config        # forwarded to new_build_image

    def __len__(self):
        return len(self.file_list)

    def _build_item(self, idx: int):
        """Return (image moved to channel-first layout, raw label entry) for `idx`."""
        fn, word = self.file_list[idx]
        # Load at the rate in the config that also drives image building, so the
        # two cannot drift apart; fall back to the notebook-level SR when the
        # config carries no `sr`.
        sample_rate = getattr(self.config, "sr", SR)
        audio, _ = librosa.core.load(fn, sr=sample_rate)

        if self.transform:
            audio = self.transform(audio)

        image = new_build_image(audio, self.config)

        if self.normalize:
            norm_image = normalize(np.array(image), mean=MEAN, std=STD)
        else:
            norm_image = image

        # Axis 2 is moved to the front (assumes an HWC image; confirm in data_processing).
        return np.moveaxis(norm_image, 2, 0), word


class SpectrogramDataset(_SpectrogramBase):
    """Labelled dataset: each item is (image array, integer class id)."""

    def __getitem__(self, idx: int):
        image, word = self._build_item(idx)
        return image, label2code[word]


class SpectrogramTestDataset(_SpectrogramBase):
    """Unlabelled dataset: each item is the image array only.

    Entries of `file_list` still have two elements; the second one is ignored.
    """

    def __getitem__(self, idx: int):
        image, _ = self._build_item(idx)
        return image

In [17]:
def _fold_records(fold_df):
    """Return the [[image_fn, label], ...] rows of one fold frame."""
    return fold_df[["image_fn", "label"]].values.tolist()


# Training datasets receive the waveform augmentations.
sdf_train_list = []
for fold_train_df in train_folds:
    sdf_train_list.append(
        SpectrogramDataset(_fold_records(fold_train_df), conf,
                           transform=train_transforms, normalize=True)
    )

# Validation datasets are built without a transform.
sdf_val_list = []
for fold_val_df in val_folds:
    sdf_val_list.append(
        SpectrogramDataset(_fold_records(fold_val_df), conf, normalize=True)
    )

for fold_dataset in sdf_train_list:
    print(len(fold_dataset))

print("=========================")

for fold_dataset in sdf_val_list:
    print(len(fold_dataset))

4258
4258
4258
451
451
451


In [18]:
def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Blend the loss against both mixup targets with weights `lam` and `1 - lam`."""
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

In [19]:
def mixup_data(x, y, alpha=1.0, use_cuda=True):
    '''Returns mixed inputs, pairs of targets, and lambda.

    Args:
        x: batch tensor; the first dimension is the batch dimension.
        y: targets, indexable along the same batch dimension.
        alpha: Beta(alpha, alpha) parameter used to draw lambda; when
            alpha <= 0 no mixing happens (lambda is 1).
        use_cuda: kept for backward compatibility and no longer consulted.
            The permutation is placed on the device of the tensor it indexes,
            so the function works for CPU and CUDA batches alike (it used to
            call `.cuda()` unconditionally with the default value).

    Returns:
        (mixed_x, y_a, y_b, lam), where mixed_x = lam * x + (1 - lam) * x[perm],
        y_a is y and y_b is y[perm].
    '''
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1

    batch_size = x.size()[0]
    # Drawn on the CPU generator exactly as before (so seeded runs shuffle the
    # same way), then moved next to the tensors being indexed.
    index = torch.randperm(batch_size)

    mixed_x = lam * x + (1 - lam) * x[index.to(x.device)]
    y_a, y_b = y, y[index.to(y.device)]
    return mixed_x, y_a, y_b, lam


def train_mixup_epoch(log_interval, mixup_prob, model, device, criterion, train_loader, optimizer, epoch):
    """Run one training epoch, applying mixup to each batch with probability `mixup_prob`.

    Args:
        log_interval: a progress line is printed when batch_idx % log_interval == 0.
        mixup_prob: per-batch probability of using mixup (0 disables it).
        model: module to train; switched to train mode here.
        device: device the batches are moved to.
        criterion: loss callable taking (output, target).
        train_loader: iterable of (data, target) batches exposing `.dataset`.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in the progress line.

    Returns:
        The loss value of the last batch only (not an epoch average).

    Raises:
        ValueError: if `train_loader` yields no batches (this used to surface
            as an UnboundLocalError on the return statement).
    """
    model.train()
    loss = None
    for batch_idx, (data, target) in enumerate(train_loader):
        # The draw happens before any tensor work so the `random` stream is
        # consumed exactly once per batch.
        use_mixup = random.random() < mixup_prob
        data, target = data.float().to(device), target.to(device)

        if use_mixup:
            # Follow the batch's actual device instead of mixup_data's CUDA default.
            data, y_a, y_b, lam = mixup_data(data, target, use_cuda=data.is_cuda)
        optimizer.zero_grad()
        output = model(data)
        if use_mixup:
            loss = mixup_criterion(criterion, output, y_a, y_b, lam)
        else:
            loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
    if loss is None:
        raise ValueError("train_loader produced no batches")
    return loss.item()

            
def test(model, device, criterion, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.type(torch.FloatTensor).to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    
    accuracy = 100. * correct / len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        accuracy))

    return test_loss, accuracy

In [20]:
def set_seed(seed: int = 42):
    """Seed Python's `random`, NumPy and PyTorch (CPU and CUDA) with `seed`.

    Also writes the seed to the PYTHONHASHSEED environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for seed_fn in seeders:
        seed_fn(seed)


def train_on_fold(
    model,
    train_loader,
    val_loader,
    fold,
    criterion,
    optimizer,
    lr_scheduler,
    results_folder,
    epochs
):
    """Train `model` on one fold and return it with the best checkpoint loaded.

    Training runs in two phases: a mixup phase (mixup probability 0.667)
    followed by 13 epochs without mixup. After every epoch the model is
    evaluated on `val_loader`; whenever the validation loss improves, the
    weights are written to tmp/{results_folder}/best_run_{fold}.pth.

    Epochs are numbered continuously across both phases. The second phase used
    to restart at 0, which made its per-epoch checkpoint names collide with
    those of the first phase.

    Args:
        model: module to train (already on the notebook-level `device`).
        train_loader: loader passed to `train_mixup_epoch`.
        val_loader: loader passed to `test`.
        fold: fold id; selects the seed (only 0, 1 and 2 are defined).
        criterion: loss used for training and validation.
        optimizer: optimizer stepped by `train_mixup_epoch`.
        lr_scheduler: stepped with the validation loss after every epoch.
        results_folder: sub-folder of `tmp/` for checkpoints; must exist.
        epochs: length of the mixup phase. `None` keeps the historical 65
            epochs (this argument was previously ignored).

    Returns:
        `model`, with the best-validation-loss weights of this run loaded when
        at least one checkpoint was written, otherwise with its final weights.

    Raises:
        KeyError: if `fold` has no entry in the seed table.
    """
    seed_dict = {
        0: 9,
        1: 99,
        2: 999
    }
    set_seed(seed_dict[fold])

    mixup_epochs = 65 if epochs is None else int(epochs)
    plain_epochs = 13
    # (mixup probability, number of epochs) of each phase, in execution order.
    phases = [(0.667, mixup_epochs), (0.0, plain_epochs)]

    save_each_epoch = False
    best_path = f"tmp/{results_folder}/best_run_{fold}.pth"

    t0 = time.time()

    best_loss = 1e5
    best_acc = 0
    # Guards against loading a stale file left behind by an earlier run.
    saved_best = False

    # NOTE(review): the removed `patience` / `max_patience` variables were
    # never read, so no early stopping ever took place; none is added here.
    ep = 0
    for mixup_prob, n_epochs in phases:
        for _ in range(n_epochs):
            train_mixup_epoch(1e10, mixup_prob, model, device, criterion, train_loader, optimizer, ep)
            cur_loss, cur_acc = test(model, device, criterion, val_loader)

            if save_each_epoch:
                torch.save(model.state_dict(), f"tmp/{results_folder}/model_ep_{fold}_{ep}.pth")

            if cur_loss < best_loss:
                torch.save(model.state_dict(), best_path)
                best_loss = cur_loss
                best_acc = cur_acc
                saved_best = True

            lr_scheduler.step(cur_loss)
            print("Training so far {} minutes".format((time.time() - t0) / 60))
            print("="*20)
            ep += 1

    print("time spent training: {} minutes".format((time.time() - t0) / 60))
    print("BEST LOSS:", best_loss)
    print("BEST ACC:", best_acc)

    if saved_best:
        model.load_state_dict(torch.load(best_path))
    else:
        print("No checkpoint was written for fold", fold, "- keeping the final weights")

    return model


def predict_on_val(
    model,
    val_loader,
    device="cuda"
):
    """Return the predicted class index for every sample of `val_loader`.

    `val_loader` must yield (inputs, target) pairs; the targets are ignored.
    The model is put in eval mode for the pass (so dropout and batch-norm do
    not behave as in training) and its previous mode is restored afterwards.

    Returns:
        1-D numpy array of argmax class indices, in loader order.
    """
    # dim=1 is what the implicit choice resolved to for 2-D logits; stating it
    # removes the deprecation warning.
    sfm = nn.Softmax(dim=1)
    predictions = list()

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, _ in val_loader:
                inputs = inputs.float().to(device)
                predictions.append(sfm(model(inputs)).cpu().numpy())
    finally:
        model.train(was_training)

    predictions = np.concatenate(predictions)
    preds = np.argmax(predictions, axis=1)
    return preds


def get_val_outputs(model, test_loader, device="cuda"):
    """Return the raw model outputs for every sample of `test_loader`.

    `test_loader` must yield (inputs, target) pairs; the targets are ignored.
    The model is put in eval mode for the pass and its previous mode is
    restored afterwards, so the outputs no longer depend on the mode the
    caller happened to leave the model in.

    Returns:
        numpy array with the per-batch outputs concatenated along axis 0.
    """
    outputs_list = list()

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, _ in test_loader:
                inputs = inputs.float().to(device)
                outputs_list.append(model(inputs).cpu().numpy())
    finally:
        model.train(was_training)

    outputs_list = np.concatenate(outputs_list)
    return outputs_list

def get_predictions(model, test_loader, device="cuda"):
    """Return softmax probabilities for every sample of an unlabelled loader.

    `test_loader` must yield input batches only (no targets). The model is put
    in eval mode for the pass and its previous mode is restored afterwards.

    Returns:
        numpy array of per-row probabilities (softmax over dim 1 of the model
        output), with batches concatenated along axis 0.
    """
    # Explicit dim: the implicit choice is deprecated; for 2-D outputs it was 1.
    sfm = nn.Softmax(dim=1)
    predictions = list()

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs in test_loader:
                inputs = inputs.float().to(device)
                predictions.append(sfm(model(inputs)).cpu().numpy())
    finally:
        model.train(was_training)

    predictions = np.concatenate(predictions)
    return predictions

In [21]:
# model_ft.load_state_dict(torch.load(f"tmp/{tmp_folder_name}/best_run_{FOLD}.pth"))

In [22]:
n_classes = len(label2code)
device = torch.device("cuda")

results_folder = "dens161_aug_mixup_repr_folds"
# makedirs with exist_ok: also creates `tmp/` when missing and lets the cell
# be re-run without failing on the existing folder.
os.makedirs(f"tmp/{results_folder}", exist_ok=True)
BS = 16


def per_sample_loss(logits, targets):
    """Cross-entropy of every row of `logits` against `targets`, as a float array.

    Gives the same values as applying the sum-reduced criterion to one sample
    at a time, in a single vectorised call.
    """
    return F.cross_entropy(
        torch.as_tensor(logits, dtype=torch.float32),
        torch.as_tensor(targets, dtype=torch.long),
        reduction="none",
    ).numpy()


# The submission template and its dataset do not depend on the fold, so they
# are built once; a missing file now fails before any training starts.
sample_subm = pd.read_csv("Submission1.csv")
sample_subm["image_fn"] = sample_subm.fn.apply(get_image_path)
subm_dataset = SpectrogramTestDataset([[path, None] for path in sample_subm.image_fn.values ], conf)

for FOLD in range(N_FOLDS):
    print("PROCESSING FOLD", FOLD)

    # NOTE(review): the new classifier layer is initialised before
    # train_on_fold calls set_seed, so its initial weights are not covered by
    # the per-fold seed.
    model_ft = models.densenet161(pretrained=True)
    model_ft.classifier = nn.Sequential(
        nn.Linear(model_ft.classifier.in_features, n_classes)
    )
    model_ft = model_ft.to(device)

    # reduction="sum": `test` divides the accumulated loss by the dataset size.
    criterion = nn.CrossEntropyLoss(reduction="sum")
    optimizer_ft = optim.Adam(model_ft.parameters(), lr=3e-4)
    lr_scheduler = ReduceLROnPlateau(optimizer_ft, 'min', patience=4, factor=0.5, verbose=True, min_lr=1.5e-5)

    train_loader = torch.utils.data.DataLoader(sdf_train_list[FOLD], batch_size=BS, shuffle=True,
                                               num_workers=6, drop_last=False, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(sdf_val_list[FOLD], batch_size=BS, drop_last=False, pin_memory=True,
                                        num_workers=6)

    # full train cycle with one fold for cross-val:
    model_ft = train_on_fold(
        model_ft,
        train_loader,
        val_loader,
        FOLD,
        criterion,
        optimizer_ft,
        lr_scheduler,
        results_folder,
        None
    )

    # predict from one model:
    preds = predict_on_val(
        model_ft,
        val_loader
    )
    val_folds[FOLD]["preds"] = [code2label[c] for c in preds]

    # Save val predictions from one model, with the loss of every sample:
    val_out = get_val_outputs(model_ft, val_loader)
    gt = val_folds[FOLD].label.map(label2code).values
    val_losses = per_sample_loss(val_out, gt)
    val_folds[FOLD]["loss"] = val_losses
    val_folds[FOLD].reset_index(drop=True).sort_values(by="loss", ascending=True).to_csv(
        f"tmp/{results_folder}/val_loss_{FOLD}.csv", index=False
    )

    ## for debug, save predictions for train as well:
    ### debug train audios:
    # NOTE(review): sdf_train_list was built with transform=train_transforms,
    # so these outputs are computed on augmented clips; confirm that is intended.
    train_debug_loader = torch.utils.data.DataLoader(sdf_train_list[FOLD], batch_size=BS, shuffle=False,
                                               num_workers=6, drop_last=False, pin_memory=True)
    train_out = get_val_outputs(model_ft, train_debug_loader)
    gt = train_folds[FOLD].label.map(label2code).values
    train_losses = per_sample_loss(train_out, gt)
    train_folds[FOLD]["loss"] = train_losses
    train_folds[FOLD]["preds"] = [code2label[c] for c in np.argmax(train_out, axis=1)]
    train_folds[FOLD].reset_index(drop=True).sort_values(by="loss", ascending=False).to_csv(
        f"tmp/{results_folder}/train_loss_{FOLD}.csv", index=False
    )

    ## MAKE SUBMISSION (from one model):
    subm_loader = torch.utils.data.DataLoader(subm_dataset, batch_size=16)

    preds = get_predictions(model_ft, subm_loader)

    # Every label column is overwritten with this fold's probabilities.
    for c in sample_subm.columns:
        if c in {"fn", "image_fn"}:
            continue
        c_idx = label2code[c]
        sample_subm[c] = preds[:, c_idx]

    sample_subm.drop("image_fn", axis=1).to_csv(f'tmp/{results_folder}/subm_{FOLD}.csv', index=False)

PROCESSING FOLD 0

Test set: Average loss: 5.1194, Accuracy: 7/451 (2%)

Training so far 2.05168004433314 minutes

Test set: Average loss: 4.9568, Accuracy: 10/451 (2%)

Training so far 4.0801282326380415 minutes

Test set: Average loss: 3.8566, Accuracy: 55/451 (12%)

Training so far 6.1255564053853355 minutes

Test set: Average loss: 2.8535, Accuracy: 146/451 (32%)

Training so far 8.157116504510244 minutes

Test set: Average loss: 2.4045, Accuracy: 184/451 (41%)

Training so far 10.184798630078634 minutes

Test set: Average loss: 1.9216, Accuracy: 244/451 (54%)

Training so far 12.212976503372193 minutes

Test set: Average loss: 1.6596, Accuracy: 264/451 (59%)

Training so far 14.238748053709665 minutes

Test set: Average loss: 1.4577, Accuracy: 286/451 (63%)

Training so far 16.266295731067657 minutes

Test set: Average loss: 1.3686, Accuracy: 298/451 (66%)

Training so far 18.2956493973732 minutes

Test set: Average loss: 1.1476, Accuracy: 335/451 (74%)

Training so far 20.3207957


Test set: Average loss: 0.5301, Accuracy: 404/451 (90%)

Training so far 99.90884263515473 minutes

Test set: Average loss: 0.4968, Accuracy: 403/451 (89%)

Training so far 101.9752784371376 minutes

Test set: Average loss: 0.5307, Accuracy: 402/451 (89%)

Training so far 104.04954788684844 minutes

Test set: Average loss: 0.5057, Accuracy: 401/451 (89%)

Training so far 106.12460040648779 minutes

Test set: Average loss: 0.5114, Accuracy: 403/451 (89%)

Training so far 108.21576320330301 minutes

Test set: Average loss: 0.5162, Accuracy: 400/451 (89%)

Training so far 110.28395759661993 minutes

Test set: Average loss: 0.5136, Accuracy: 403/451 (89%)

Epoch    55: reducing learning rate of group 0 to 1.8750e-05.
Training so far 112.33400932947795 minutes

Test set: Average loss: 0.4966, Accuracy: 404/451 (90%)

Training so far 114.37134552399317 minutes

Test set: Average loss: 0.5146, Accuracy: 405/451 (90%)

Training so far 116.41065649191539 minutes

Test set: Average loss: 0.4896



PROCESSING FOLD 1

Test set: Average loss: 4.8791, Accuracy: 13/451 (3%)

Training so far 2.0217489361763 minutes

Test set: Average loss: 3.9367, Accuracy: 48/451 (11%)

Training so far 4.046665239334106 minutes

Test set: Average loss: 2.9569, Accuracy: 109/451 (24%)

Training so far 6.073913860321045 minutes

Test set: Average loss: 2.3332, Accuracy: 210/451 (47%)

Training so far 8.099962306022643 minutes

Test set: Average loss: 1.9101, Accuracy: 259/451 (57%)

Training so far 10.123463082313538 minutes

Test set: Average loss: 2.0869, Accuracy: 232/451 (51%)

Training so far 12.145927826563517 minutes

Test set: Average loss: 1.4968, Accuracy: 302/451 (67%)

Training so far 14.166549062728881 minutes

Test set: Average loss: 1.5435, Accuracy: 292/451 (65%)

Training so far 16.185982330640158 minutes

Test set: Average loss: 1.2962, Accuracy: 311/451 (69%)

Training so far 18.21221826473872 minutes

Test set: Average loss: 1.1159, Accuracy: 332/451 (74%)

Training so far 20.238152

Training so far 99.03923189242681 minutes

Test set: Average loss: 0.7523, Accuracy: 386/451 (86%)

Training so far 101.05741116205851 minutes

Test set: Average loss: 0.7019, Accuracy: 392/451 (87%)

Training so far 103.08222225109736 minutes

Test set: Average loss: 0.6980, Accuracy: 395/451 (88%)

Training so far 105.10726848045985 minutes

Test set: Average loss: 0.7615, Accuracy: 387/451 (86%)

Training so far 107.12340687115987 minutes

Test set: Average loss: 0.6592, Accuracy: 395/451 (88%)

Training so far 109.15424285332362 minutes

Test set: Average loss: 0.6812, Accuracy: 395/451 (88%)

Training so far 111.17146476507187 minutes

Test set: Average loss: 0.6886, Accuracy: 396/451 (88%)

Training so far 113.18970869382223 minutes

Test set: Average loss: 0.7045, Accuracy: 395/451 (88%)

Training so far 115.21259454488754 minutes

Test set: Average loss: 0.7213, Accuracy: 390/451 (86%)

Training so far 117.23500219186147 minutes

Test set: Average loss: 0.7456, Accuracy: 389/45


Test set: Average loss: 0.8277, Accuracy: 369/451 (82%)

Training so far 38.45468186934789 minutes

Test set: Average loss: 0.9610, Accuracy: 366/451 (81%)

Training so far 40.47860901355743 minutes

Test set: Average loss: 0.9020, Accuracy: 369/451 (82%)

Training so far 42.499258784453076 minutes

Test set: Average loss: 1.0058, Accuracy: 364/451 (81%)

Training so far 44.51802523136139 minutes

Test set: Average loss: 0.8661, Accuracy: 376/451 (83%)

Training so far 46.53637586037318 minutes

Test set: Average loss: 0.8900, Accuracy: 370/451 (82%)

Epoch    24: reducing learning rate of group 0 to 1.5000e-04.
Training so far 48.55705139239629 minutes

Test set: Average loss: 0.7960, Accuracy: 382/451 (85%)

Training so far 50.5784632841746 minutes

Test set: Average loss: 0.8221, Accuracy: 380/451 (84%)

Training so far 52.591485973199205 minutes

Test set: Average loss: 0.8665, Accuracy: 382/451 (85%)

Training so far 54.6091385046641 minutes

Test set: Average loss: 0.8257, Accur


Test set: Average loss: 0.8104, Accuracy: 390/451 (86%)

Training so far 133.41190232833227 minutes

Test set: Average loss: 0.7948, Accuracy: 389/451 (86%)

Training so far 135.43503175179163 minutes

Test set: Average loss: 0.7858, Accuracy: 389/451 (86%)

Training so far 137.46165594259898 minutes

Test set: Average loss: 0.7897, Accuracy: 390/451 (86%)

Training so far 139.48603877226512 minutes

Test set: Average loss: 0.8108, Accuracy: 391/451 (87%)

Training so far 141.50764235655467 minutes

Test set: Average loss: 0.8072, Accuracy: 390/451 (86%)

Training so far 143.53094732761383 minutes

Test set: Average loss: 0.8239, Accuracy: 393/451 (87%)

Training so far 145.54829260905584 minutes

Test set: Average loss: 0.8355, Accuracy: 392/451 (87%)

Training so far 147.56217298905054 minutes

Test set: Average loss: 0.8236, Accuracy: 390/451 (86%)

Training so far 149.5838489731153 minutes

Test set: Average loss: 0.8432, Accuracy: 387/451 (86%)

Training so far 151.60710186560948

In [1]:
# Hard-coded percentages (presumably per-fold validation accuracies; confirm
# against the training logs) printed as fractions. A plain loop replaces the
# list comprehension, which was used only for its side effects and left a
# meaningless [None, None, None] as the cell value.
for acc_percent in [86.0310421286031, 88.470066518847, 86.0310421286031]:
    print(acc_percent / 100)

0.8603104212860311
0.88470066518847
0.8603104212860311


[None, None, None]

In [24]:
## MAKE ONE SUBMISSION FROM ALL SUBS:
import pandas as pd
import numpy as np

s = f'tmp/{results_folder}/'
preds_to_average = [
    f"subm_{f}.csv"
    for f in range(N_FOLDS)
]

# Load every per-fold submission once (all_subs used to be created but never filled).
all_subs = [pd.read_csv(s + file) for file in preds_to_average]

# Start from a copy of the first file so all_subs[0] keeps its original values.
source_pred = all_subs[0].copy()
pred_cols = source_pred.drop("fn", axis=1).columns.values
for tmp in all_subs[1:]:
    # The frames are added row by row, so the clips must be listed in the same order.
    assert tmp["fn"].equals(source_pred["fn"]), "submission files list clips in a different order"
    source_pred[pred_cols] += tmp[pred_cols]

# Arithmetic mean of the per-fold values.
source_pred[pred_cols] /= len(preds_to_average)

In [25]:
# True when the prediction columns of every row sum to 1 (within np.isclose tolerance).
np.allclose(source_pred[pred_cols].sum(axis=1).values, np.ones(source_pred.shape[0]))

True

In [26]:
# Write the averaged submission next to the per-fold files.
merged_path = f'tmp/{results_folder}/{results_folder}_merged.csv'
source_pred.to_csv(merged_path, index=False)