In [1]:
import os
import time
import random
import warnings
import typing as tp
import pathlib
from contextlib import contextmanager

import cv2
import librosa

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import IPython.display as ipd

import torch.nn as nn
import torch.optim as optim

from torchvision import datasets, models, transforms
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingLR
import torchaudio


  '"sox" backend is being deprecated. '


In [2]:
import python_speech_features as psf
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
# Root of the competition data and the folder holding the resampled audio clips.
data_path = pathlib.Path("..", "data")
audios_path = data_path.joinpath("all_audio_resampled")

In [4]:
# Load the original training table plus the two additional batches of labelled clips.
train = pd.read_csv(data_path / 'Train.csv')
train_extra = pd.read_csv(data_path / 'train_add.csv')
train_extra_2 = pd.read_csv(data_path / 'train_add_20201029.csv')

# Integer class codes follow the order in which labels first appear in Train.csv.
code2label = dict(enumerate(train.label.unique().tolist()))
label2code = {word: idx for idx, word in code2label.items()}

In [5]:
def get_image_path(audio_path):
    """Map a CSV `fn` entry to the absolute path of its resampled .wav file.

    Only the base name is kept: everything up to the last "/" and everything
    from the first "." onwards is discarded. The result is returned as a string
    located under the resolved `audios_path` directory.
    """
    base_name = audio_path.rpartition("/")[2]
    stem = base_name.partition(".")[0]
    return str(audios_path.resolve() / f"{stem}.wav")

# Attach the resolved .wav path to every table (the column keeps its historical name "image_fn").
for _frame in (train, train_extra, train_extra_2):
    _frame["image_fn"] = _frame.fn.apply(get_image_path)

In [6]:
# Stack the three tables into a single frame with a fresh 0..n-1 index.
train_df = pd.concat([train, train_extra, train_extra_2], axis=0, ignore_index=True)

In [7]:
### build validation that includes all classes:


# Number of examples available for each label.
vcs = train_df["label"].value_counts()

## validation quota per class:
#   25 or more examples -> take 3
#   12 to 24 examples   -> take 2
#   fewer than 12       -> take 1

def num_for_val(num_examples):
    """Return how many clips of a class go to validation, given the class size."""
    for threshold, quota in ((25, 3), (12, 2)):
        if num_examples >= threshold:
            return quota
    return 1

# Per-row class size and the validation quota derived from it.
train_df["num_examples"] = train_df.label.map(vcs.to_dict())
train_df["num_for_val"] = train_df.num_examples.apply(num_for_val)

# Shuffle the rows of every class independently and give each row its rank
# inside the class. The seed and the order of the random draws (labels in
# order of first appearance, one draw per row) define the split, so they must
# not change.
random.seed(12)
per_label_frames = []
for label in train_df.label.unique():
    tmp = train_df.loc[train_df.label == label].copy()
    tmp["dummy"] = tmp.label.apply(lambda _: random.random())
    tmp = tmp.sort_values(by="dummy")
    tmp["rank"] = range(tmp.shape[0])
    per_label_frames.append(tmp)

# Concatenate once instead of growing a DataFrame inside the loop
# (quadratic copying, and concatenating with an empty frame is deprecated).
train_df_new = pd.concat(per_label_frames).reset_index(drop=True)
# The first `num_for_val` shuffled rows of each class form the validation subset.
train_df_new["val_subset"] = train_df_new.num_for_val > train_df_new["rank"]
train_df_new = train_df_new.drop("dummy", axis=1)

In [8]:
# Sanity check: the validation subset and the remainder must contain exactly the same set of labels.
_val_labels = set(train_df_new.loc[train_df_new.val_subset, "label"].unique())
_rest_labels = set(train_df_new.loc[~train_df_new.val_subset, "label"].unique())
assert _val_labels == _rest_labels

In [9]:
# Fold id of each row: within a class, consecutive groups of `num_for_val` ranks share one id.
train_df_new["val_fold"] = train_df_new["rank"].floordiv(train_df_new["num_for_val"])

In [10]:
# Row count of each of the first ten fold ids.
for f in range(10):
    fold_mask = train_df_new["val_fold"].eq(f)
    print(f, fold_mask.sum())

0 451
1 451
2 451
3 430
4 413
5 410
6 407
7 391
8 381
9 337


In [11]:
N_FOLDS = 3

# Fold i validates on the rows whose val_fold equals i and trains on every other row
# (rows with val_fold >= N_FOLDS therefore always stay in training).
val_folds = [
    train_df_new.loc[train_df_new.val_fold == fold_id].copy()
    for fold_id in range(N_FOLDS)
]
train_folds = [
    train_df_new.loc[train_df_new.val_fold != fold_id].copy()
    for fold_id in range(N_FOLDS)
]

In [12]:
# Sample-rate constant (presumably Hz). NOTE(review): not referenced by any other
# visible cell; AudioConfig.sr carries the same value and is what the dataset uses.
SR = 22050

In [13]:
# from reproduce.src.data_processing import new_generate_spec, new_build_image, normalize, MEAN, STD

In [14]:
class MelSpecDataset(data.Dataset):
    """Dataset that turns audio files into standardised 3-channel mel-spectrogram tensors.

    Each item is produced by loading the file with torchaudio, taking its first
    channel, computing a mel spectrogram in decibels, applying the optional
    transform, min-max scaling the result to [0, 1], repeating it over three
    channels and standardising with fixed per-channel mean / std constants.

    Args:
        audio_paths: sequence of audio file paths.
        labels: sequence of targets with the same length as ``audio_paths``
            (its entries are not returned when ``is_test`` is True).
        config: object exposing ``n_mels``, ``sr``, ``n_fft`` and ``hop_size``.
        transforms: optional callable applied to the 2-D dB spectrogram.
        is_test: when True, ``__getitem__`` returns only the image tensor.
    """

    def __init__(self,
                 audio_paths,
                 labels,
                 config,
                 transforms=None,
                 is_test=False
    ):
        self.audio_paths = audio_paths
        self.labels = labels
        self.config = config
        self.tr = transforms
        self.is_test = is_test
        self.n_channels = 3

        assert len(self.audio_paths) == len(self.labels)

        # Built once here instead of on every __getitem__ call: the mel filter
        # bank and the constants below do not depend on the sample.
        self._mel = torchaudio.transforms.MelSpectrogram(
            n_mels=self.config.n_mels, sample_rate=self.config.sr,
            n_fft=self.config.n_fft, hop_length=self.config.hop_size
        )
        self._to_db = torchaudio.transforms.AmplitudeToDB()
        self._mean = torch.Tensor([0.485, 0.456, 0.406]).reshape(3, 1, 1)
        self._std = torch.Tensor([0.229, 0.224, 0.225]).reshape(3, 1, 1)

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        """Return ``image`` (test mode) or ``(image, label)`` for sample ``idx``."""
        # NOTE(review): the sample rate reported by the file is not compared with
        # config.sr; the files are presumably already resampled - confirm.
        waveform, _sample_rate = torchaudio.load(self.audio_paths[idx])
        specgram = self._to_db(self._mel(waveform[0]))

        if self.tr is not None:
            specgram = self.tr(specgram)

        # Min-max scale to [0, 1]. A constant spectrogram (e.g. a silent clip)
        # has a zero range; skip the division there instead of producing NaNs.
        specgram = specgram - specgram.min()
        peak = specgram.max()
        if peak > 0:
            specgram = specgram / peak

        image = torch.stack([specgram for _ in range(self.n_channels)])
        image = (image - self._mean) / self._std

        if self.is_test:
            return image
        return image, self.labels[idx]


In [15]:
from torchvision import transforms as tr

class PadToSize:
    """Zero-pad or crop a tensor along its LAST dimension to a fixed width.

    Only ``target_size[-1]`` is used; all other dimensions are left untouched.
    Inputs that are already wide enough are cropped to the first
    ``target_size[-1]`` entries; narrower inputs are zero-padded on both sides
    (the extra element goes to the right when the padding is odd).
    """
    def __init__(self, target_size):
        self.target_size = target_size

    def __call__(self, img2d):
        target_width = self.target_size[-1]
        current_width = img2d.shape[-1]

        if current_width >= target_width:
            # Ellipsis keeps the crop on the last axis for any rank; the former
            # `[:, :width]` only addressed the last axis for 2-D inputs.
            return img2d[..., :target_width]

        to_pad = target_width - current_width
        pad_left = to_pad // 2
        pad_right = to_pad - pad_left
        return torch.nn.functional.pad(img2d, pad=(pad_left, pad_right))

class AudioConfig:
    """Mel-spectrogram settings shared by every dataset in this notebook."""
    sr = 22050        # passed to MelSpectrogram as `sample_rate`
    n_fft = 512
    hop_size = 128    # passed to MelSpectrogram as `hop_length`
    n_mels = 64
    # (n_mels, width); PadToSize only reads the last entry.
    img_size = (n_mels, 440)

config = AudioConfig()

# Single-step pipeline: pad or crop every spectrogram to config.img_size[-1] frames.
# NOTE(review): this name shadows the `transforms` module imported from torchvision
# in the first cell; later cells rely on the name, so it is kept as is.
transforms = tr.Compose([PadToSize(target_size=config.img_size)])

In [17]:
def _build_fold_datasets(fold_frames):
    """Wrap every fold DataFrame in a MelSpecDataset with integer-coded labels."""
    return [
        MelSpecDataset(
            frame.image_fn.values,
            [label2code[name] for name in frame.label.values],
            config=config,
            transforms=transforms,
        )
        for frame in fold_frames
    ]


tr_spec_ds_list = _build_fold_datasets(train_folds)
val_spec_ds_list = _build_fold_datasets(val_folds)

# Dataset sizes: training folds first, then validation folds.
for s in tr_spec_ds_list:
    print(len(s))

print("=========================")

for s in val_spec_ds_list:
    print(len(s))

4258
4258
4258
451
451
451


In [18]:
def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Blend the loss against both mixup targets with weights `lam` and `1 - lam`."""
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b


def mixup_data(x, y, alpha=1.0, use_cuda=True):
    '''Returns mixed inputs, pairs of targets, and lambda.

    Args:
        x: batch of inputs; samples are indexed along dimension 0.
        y: batch of targets aligned with `x`.
        alpha: Beta(alpha, alpha) parameter for the mixing coefficient;
            `alpha <= 0` disables mixing (lambda is fixed to 1).
        use_cuda: kept for backward compatibility and ignored. The permutation
            is always placed on `x.device`, so the function also works on
            CPU-only machines and with CPU tensors.

    Returns:
        (mixed_x, y_a, y_b, lam) where `mixed_x = lam * x + (1 - lam) * x[perm]`,
        `y_a = y` and `y_b = y[perm]`.
    '''
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1

    batch_size = x.size(0)
    # Drawn from the CPU generator (as before) and then moved next to the data.
    index = torch.randperm(batch_size).to(x.device)

    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam


def train_mixup_epoch(log_interval, mixup_prob, model, device, criterion, train_loader, optimizer, epoch):
    """Train `model` for one pass over `train_loader`, applying mixup to random batches.

    Args:
        log_interval: progress is printed for batches whose index is a multiple of it.
        mixup_prob: probability that a given batch is trained with mixup.
        model: network to train (switched to train mode).
        device: device the batches are moved to.
        criterion: loss callable `criterion(output, target)`.
        train_loader: iterable of `(data, target)` batches exposing `.dataset`.
        optimizer: optimizer stepping the model parameters.
        epoch: epoch number, used for logging only.

    Returns:
        The loss of the LAST batch as a Python float (not an epoch average).

    Raises:
        ValueError: if `train_loader` yields no batches.
    """
    model.train()
    loss = None
    for batch_idx, (inputs, target) in enumerate(train_loader):
        # One draw per batch decides whether mixup is used.
        use_mixup = random.random() < mixup_prob
        inputs, target = inputs.float().to(device), target.to(device)

        if use_mixup:
            # Tell mixup_data where the batch actually lives instead of relying on
            # its `use_cuda=True` default, which breaks CPU runs.
            inputs, y_a, y_b, lam = mixup_data(inputs, target, use_cuda=inputs.is_cuda)

        optimizer.zero_grad()
        output = model(inputs)
        if use_mixup:
            loss = mixup_criterion(criterion, output, y_a, y_b, lam)
        else:
            loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(inputs), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    if loss is None:
        raise ValueError("train_loader produced no batches")
    return loss.item()

            
def test(model, device, criterion, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.type(torch.FloatTensor).to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    
    accuracy = 100. * correct / len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        accuracy))

    return test_loss, accuracy

In [19]:
def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) RNGs and export PYTHONHASHSEED."""
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)


def train_on_fold(
    model,
    train_loader,
    val_loader,
    fold,
    criterion,
    optimizer,
    lr_scheduler,
    results_folder,
    epochs
):
    """Train `model` on one fold and return it with its best checkpoint loaded.

    Training has two phases:
      1. the main phase with mixup probability 0.8, using `optimizer` and
         stepping `lr_scheduler` once per epoch;
      2. 15 fine-tuning epochs without mixup, using a fresh Adam optimizer
         (lr=2e-5) over the parameters of `model`.
    After every epoch the model is evaluated on `val_loader`; whenever the
    validation loss improves, the weights are saved to
    ``tmp/{results_folder}/best_run_{fold}.pth``.

    Args:
        model: network to train; its current device is used for the batches.
        train_loader, val_loader: loaders for the training / validation split.
        fold: integer fold index; it selects the seed (9, 99, 999, ...) and
            names the checkpoint file.
        criterion: loss used for both training and validation.
        optimizer: optimizer for the main phase.
        lr_scheduler: scheduler stepped once per main-phase epoch.
        results_folder: sub-folder of ``tmp/`` receiving the checkpoints (must exist).
        epochs: number of main-phase epochs; ``None`` keeps the historical 50.

    Returns:
        `model` with the best (lowest validation loss) weights loaded.
    """
    # Same seeds as the former {0: 9, 1: 99, 2: 999} table, extended to any fold index.
    set_seed(10 ** (fold + 1) - 1)

    # Use the device the model lives on instead of a notebook-level global.
    device = next(model.parameters()).device
    main_epochs = 50 if epochs is None else epochs
    finetune_epochs = 15
    best_path = f"tmp/{results_folder}/best_run_{fold}.pth"
    # Debug switch. NOTE(review): snapshot names restart at 0 in the second
    # phase, so fine-tuning snapshots overwrite the first ones when enabled.
    save_each_epoch = False

    t0 = time.time()
    best_loss = float("inf")
    best_acc = 0
    # Epochs since the last improvement; only reported, no early stopping is applied.
    patience = 0

    def run_phase(n_epochs, mixup_prob, phase_optimizer, scheduler=None):
        """Run `n_epochs` train/validate rounds and keep the best checkpoint up to date."""
        nonlocal best_loss, best_acc, patience
        for ep in range(n_epochs):
            train_mixup_epoch(1e10, mixup_prob, model, device, criterion, train_loader, phase_optimizer, ep)
            cur_loss, cur_acc = test(model, device, criterion, val_loader)

            if save_each_epoch:
                torch.save(model.state_dict(), f"tmp/{results_folder}/model_ep_{ep}.pth")

            if cur_loss < best_loss:
                torch.save(model.state_dict(), best_path)
                best_loss = cur_loss
                best_acc = cur_acc
                patience = 0
            else:
                patience += 1
                print("patience:", patience)

            if scheduler is not None:
                scheduler.step()
            print("Training so far {} minutes".format((time.time() - t0) / 60))
            print("="*20)

    # Phase 1: mixup training with the caller's optimizer and scheduler.
    run_phase(main_epochs, 0.8, optimizer, lr_scheduler)

    # Phase 2: low-LR fine-tuning without mixup. The optimizer is built from the
    # `model` argument (previously it silently used the global `model_ft`).
    finetune_optimizer = optim.Adam(model.parameters(), lr=2e-5)
    run_phase(finetune_epochs, 0.0, finetune_optimizer)

    print("time spent training: {} minutes".format((time.time() - t0) / 60))
    print("BEST LOSS:", best_loss)
    print("BEST ACC:", best_acc)

    model.load_state_dict(torch.load(best_path))

    return model


def predict_on_val(
    model,
    val_loader,
    device="cuda"
):
    """Return the predicted class index for every sample of `val_loader`.

    Args:
        model: network to evaluate; it is switched to eval mode.
        val_loader: iterable of `(inputs, target)` batches (targets are ignored).
        device: device the inputs are moved to.

    Returns:
        1-D numpy array with the argmax class of each sample, in loader order.
    """
    # Inference must not depend on whatever mode the caller left the model in.
    model.eval()
    # Explicit class dimension: constructing Softmax without `dim` is deprecated.
    sfm = nn.Softmax(dim=1)
    predictions = list()
    with torch.no_grad():
        for inputs, _ in val_loader:
            inputs = inputs.float().to(device)
            predictions.append(sfm(model(inputs)).cpu().numpy())

    predictions = np.concatenate(predictions)
    return np.argmax(predictions, axis=1)


def get_val_outputs(model, test_loader, device="cuda"):
    """Return the raw model outputs (no softmax) for every sample of `test_loader`.

    Args:
        model: network to evaluate; it is switched to eval mode.
        test_loader: iterable of `(inputs, target)` batches (targets are ignored).
        device: device the inputs are moved to.

    Returns:
        2-D numpy array of shape (n_samples, n_outputs), in loader order.
    """
    # Inference must not depend on whatever mode the caller left the model in.
    model.eval()
    outputs_list = list()
    with torch.no_grad():
        for inputs, _target in test_loader:
            inputs = inputs.float().to(device)
            outputs_list.append(model(inputs).cpu().numpy())
    return np.concatenate(outputs_list)

def get_predictions(model, test_loader, device="cuda"):
    """Return softmax probabilities for every sample of an unlabelled loader.

    Args:
        model: network to evaluate; it is switched to eval mode.
        test_loader: iterable yielding input batches only (no targets).
        device: device the inputs are moved to.

    Returns:
        2-D numpy array of shape (n_samples, n_outputs) whose rows sum to 1.
    """
    # Inference must not depend on whatever mode the caller left the model in.
    model.eval()
    # Explicit class dimension: constructing Softmax without `dim` is deprecated.
    sfm = nn.Softmax(dim=1)
    predictions = list()
    with torch.no_grad():
        for inputs in test_loader:
            inputs = inputs.float().to(device)
            predictions.append(sfm(model(inputs)).cpu().numpy())
    return np.concatenate(predictions)

In [20]:
# model_ft.load_state_dict(torch.load(f"tmp/{tmp_folder_name}/best_run_{FOLD}.pth"))

In [21]:
import warnings
warnings.simplefilter("ignore", UserWarning)

In [24]:
n_classes = len(label2code)
device = torch.device("cuda")
BS = 128

results_folder = "melspec_resnet34_repr"
# makedirs + exist_ok: creates "tmp" when missing and does not fail when the
# cell is run again (os.mkdir raised in both situations).
os.makedirs(f"tmp/{results_folder}", exist_ok=True)

# The submission template and its dataset do not depend on the fold: build them once.
sample_subm = pd.read_csv("Submission1.csv")
sample_subm["image_fn"] = sample_subm.fn.apply(get_image_path)
subm_dataset = MelSpecDataset(
    sample_subm.image_fn.values,
    [None for _ in sample_subm.image_fn.values],
    config=config,
    transforms=transforms,
    is_test=True
)
subm_loader = torch.utils.data.DataLoader(subm_dataset, batch_size=BS)

for FOLD in range(N_FOLDS):
    print("PROCESSING FOLD", FOLD)
    # NOTE(review): the pretrained head is kept as is (1000 outputs) and
    # `n_classes` is never used, so softmax runs over 1000 logits. Replacing
    # `fc` would change the reproduced results - confirm before touching.
    model_ft = models.resnet34(pretrained=True)
    model_ft = model_ft.to(device)

    # reduction="sum" matters: test() divides the accumulated loss by the dataset size.
    criterion = nn.CrossEntropyLoss(reduction="sum")
    optimizer_ft = optim.Adam(model_ft.parameters(), lr=5e-4)
    cosine_lrsche = CosineAnnealingLR(optimizer_ft, T_max=35, eta_min=2e-5, verbose=True)

    train_loader = torch.utils.data.DataLoader(tr_spec_ds_list[FOLD], batch_size=BS, shuffle=True,
                                               num_workers=6, drop_last=False, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_spec_ds_list[FOLD], batch_size=BS, drop_last=False, pin_memory=True,
                                        num_workers=6)

    # full train cycle with one fold for cross-val:
    model_ft = train_on_fold(
        model_ft,
        train_loader,
        val_loader,
        FOLD,
        criterion,
        optimizer_ft,
        cosine_lrsche,
        results_folder,
        None
    )

    # predict from one model:
    preds = predict_on_val(
        model_ft,
        val_loader
    )
    val_folds[FOLD]["preds"] = [code2label[c] for c in preds]

    # Save val predictions from one model, with the per-sample cross-entropy
    # computed in one vectorised call instead of one criterion call per row.
    val_out = get_val_outputs(model_ft, val_loader)
    gt = val_folds[FOLD].label.map(label2code).values
    val_losses = F.cross_entropy(
        torch.as_tensor(val_out), torch.as_tensor(gt, dtype=torch.long), reduction="none"
    ).numpy()
    val_folds[FOLD]["loss"] = val_losses
    val_folds[FOLD].reset_index(drop=True).sort_values(by="loss", ascending=True).to_csv(
        f"tmp/{results_folder}/val_loss_{FOLD}.csv", index=False
    )

    ## for debug, save predictions for train as well
    ## (unshuffled loader so that outputs line up with the rows of train_folds[FOLD]):
    train_debug_loader = torch.utils.data.DataLoader(tr_spec_ds_list[FOLD], batch_size=BS, shuffle=False,
                                               num_workers=4, drop_last=False, pin_memory=True)
    train_out = get_val_outputs(model_ft, train_debug_loader)
    gt = train_folds[FOLD].label.map(label2code).values
    train_losses = F.cross_entropy(
        torch.as_tensor(train_out), torch.as_tensor(gt, dtype=torch.long), reduction="none"
    ).numpy()
    train_folds[FOLD]["loss"] = train_losses
    train_folds[FOLD]["preds"] = [code2label[c] for c in np.argmax(train_out, axis=1)]
    train_folds[FOLD].reset_index(drop=True).sort_values(by="loss", ascending=False).to_csv(
        f"tmp/{results_folder}/train_loss_{FOLD}.csv", index=False
    )

    ## MAKE SUBMISSION (from one model):
    preds = get_predictions(model_ft, subm_loader)

    # Every label column of the template receives the probability of its class code.
    for c in sample_subm.columns:
        if c in {"fn", "image_fn"}:
            continue
        c_idx = label2code[c]
        sample_subm[c] = preds[:, c_idx]

    sample_subm.drop("image_fn", axis=1).to_csv(f'tmp/{results_folder}/subm_{FOLD}.csv', index=False)

PROCESSING FOLD 1
Adjusting learning rate of group 0 to 5.0000e-04.

Test set: Average loss: 5.2415, Accuracy: 22/451 (5%)

Adjusting learning rate of group 0 to 4.9903e-04.
Training so far 0.14472198486328125 minutes

Test set: Average loss: 3.8227, Accuracy: 94/451 (21%)

Adjusting learning rate of group 0 to 4.9614e-04.
Training so far 0.28986985683441163 minutes

Test set: Average loss: 2.1459, Accuracy: 223/451 (49%)

Adjusting learning rate of group 0 to 4.9135e-04.
Training so far 0.43508367935816444 minutes

Test set: Average loss: 1.9347, Accuracy: 267/451 (59%)

Adjusting learning rate of group 0 to 4.8470e-04.
Training so far 0.5819736083348592 minutes

Test set: Average loss: 1.6516, Accuracy: 287/451 (64%)

Adjusting learning rate of group 0 to 4.7623e-04.
Training so far 0.7296151002248128 minutes

Test set: Average loss: 1.4165, Accuracy: 322/451 (71%)

Adjusting learning rate of group 0 to 4.6603e-04.
Training so far 0.8730579098065694 minutes

Test set: Average loss: 1


Test set: Average loss: 1.0221, Accuracy: 369/451 (82%)

patience: 13
Adjusting learning rate of group 0 to 2.3857e-05.
Training so far 5.437586236000061 minutes

Test set: Average loss: 0.9459, Accuracy: 370/451 (82%)

patience: 14
Adjusting learning rate of group 0 to 2.8649e-05.
Training so far 5.584317914644877 minutes

Test set: Average loss: 1.0058, Accuracy: 370/451 (82%)

patience: 15
Adjusting learning rate of group 0 to 3.5304e-05.
Training so far 5.730286939938863 minutes

Test set: Average loss: 1.0060, Accuracy: 375/451 (83%)

patience: 16
Adjusting learning rate of group 0 to 4.3767e-05.
Training so far 5.877453859647115 minutes

Test set: Average loss: 0.9930, Accuracy: 367/451 (81%)

patience: 17
Adjusting learning rate of group 0 to 5.3972e-05.
Training so far 6.023726634184519 minutes

Test set: Average loss: 1.0120, Accuracy: 372/451 (82%)

patience: 18
Adjusting learning rate of group 0 to 6.5836e-05.
Training so far 6.166677272319793 minutes

Test set: Average los


Test set: Average loss: 1.2861, Accuracy: 340/451 (75%)

patience: 2
Adjusting learning rate of group 0 to 3.9222e-04.
Training so far 1.62764630317688 minutes

Test set: Average loss: 1.1338, Accuracy: 354/451 (78%)

Adjusting learning rate of group 0 to 3.7373e-04.
Training so far 1.7753538052241007 minutes

Test set: Average loss: 1.2333, Accuracy: 360/451 (80%)

patience: 1
Adjusting learning rate of group 0 to 3.5433e-04.
Training so far 1.9240917046864827 minutes

Test set: Average loss: 1.1633, Accuracy: 366/451 (81%)

patience: 2
Adjusting learning rate of group 0 to 3.3416e-04.
Training so far 2.0672542532285054 minutes

Test set: Average loss: 1.0300, Accuracy: 368/451 (82%)

Adjusting learning rate of group 0 to 3.1341e-04.
Training so far 2.216125778357188 minutes

Test set: Average loss: 1.1000, Accuracy: 363/451 (80%)

patience: 1
Adjusting learning rate of group 0 to 2.9222e-04.
Training so far 2.3597185055414838 minutes

Test set: Average loss: 1.1131, Accuracy: 366/45


Test set: Average loss: 1.1290, Accuracy: 371/451 (82%)

patience: 24
Adjusting learning rate of group 0 to 1.4627e-04.
Training so far 6.888757638136545 minutes

Test set: Average loss: 1.2979, Accuracy: 352/451 (78%)

patience: 25
Adjusting learning rate of group 0 to 1.6567e-04.
Training so far 7.0360921422640486 minutes

Test set: Average loss: 1.1617, Accuracy: 364/451 (81%)

patience: 26
Adjusting learning rate of group 0 to 1.8584e-04.
Training so far 7.185854025681814 minutes

Test set: Average loss: 1.1923, Accuracy: 364/451 (81%)

patience: 27
Adjusting learning rate of group 0 to 2.0659e-04.
Training so far 7.330053508281708 minutes

Test set: Average loss: 0.8193, Accuracy: 381/451 (84%)

Training so far 7.478242917855581 minutes

Test set: Average loss: 0.8111, Accuracy: 384/451 (85%)

Training so far 7.629748845100403 minutes

Test set: Average loss: 0.8179, Accuracy: 383/451 (85%)

patience: 1
Training so far 7.776867059866587 minutes

Test set: Average loss: 0.8161, Ac

In [25]:
## MAKE ONE SUBMISSION FROM ALL SUBS:
import pandas as pd
import numpy as np

s = f'tmp/{results_folder}/'
preds_to_average = [
    f"subm_{f}.csv"
    for f in range(N_FOLDS)
]

# Start from the first fold's submission and add the others column by column.
source_pred = pd.read_csv(s + preds_to_average[0])
pred_cols = source_pred.drop("fn", axis=1).columns.values
for file in preds_to_average[1:]:
    tmp = pd.read_csv(s + file)
    # The sum is positional: every fold file must list the same clips in the same order.
    assert tmp["fn"].equals(source_pred["fn"]), f"{file} rows do not match {preds_to_average[0]}"
    source_pred[pred_cols] += tmp[pred_cols]

# Turn the sum into the mean over folds.
source_pred[pred_cols] /= len(preds_to_average)

In [30]:
# Persist the fold-averaged submission next to the per-fold files.
merged_path = f'tmp/{results_folder}/{results_folder}_merged.csv'
source_pred.to_csv(merged_path, index=False)