In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('..')
import os
import pickle
import torch
import pandas as pd
from torch import nn
from functools import partial
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader, random_split, Subset
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import pytorch_lightning as pl

from models.models import MLPModel, ResNetBigger

In [3]:
from IPython.display import clear_output
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from constants import cloud_data_path, audioset_data_path
from dataset import SwitchBoardLaughterDataset
from audio_utils import featurize_mfcc, featurize_melspec

In [4]:
class System(pl.LightningModule):
    def __init__(self, model_name, model_hparams={}, optimizer_name='adam', optimizer_hparams={}):
        """
        Inputs:
            model_name - Name of the model/CNN to run. Used for creating the model (see function below)
            model_hparams - Hyperparameters for the model, as dictionary.
            optimizer_name - Name of the optimizer to use. Currently supported: Adam, SGD
            optimizer_hparams - Hyperparameters for the optimizer, as dictionary. This includes learning rate, weight decay, etc.
        """
        super().__init__()

        # Exports the hyperparameters to a YAML file, and create "self.hparams" namespace
        self.save_hyperparameters()

        self.model = {
            'mlp': MLPModel(),
            'resnet': ResNetBigger(linear_layer_size=64, filter_sizes=[64,32,16,16])
        }[model_name]

    def forward(self, x):
        # in lightning, forward defines the prediction/inference actions
        return self.model(x)

    def training_step(self, batch, batch_idx):
        # training_step defined the train loop.
        # It is independent of forward
        X, Y = batch

        output = self.model(X).squeeze()
        loss = F.binary_cross_entropy_with_logits(output, Y.float())

        # Logging to TensorBoard by default
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=.001)
        return optimizer

    def validation_step(self, batch, batch_idx):
        X, Y = batch

        output = self.model(X).squeeze()
        val_loss = F.binary_cross_entropy_with_logits(output, Y.float())
        self.log('val_loss', val_loss)

        return (output, Y.squeeze())

    def validation_epoch_end(self, validation_step_outputs):
        all_outputs = torch.cat([o[0] for o in validation_step_outputs]).cpu()
        all_labels = torch.cat([o[1] for o in validation_step_outputs]).cpu()

        try:
            val_auc = roc_auc_score(all_labels, all_outputs)
            self.log('val_auc', val_auc)
        except ValueError:
            pass

    def test_step(self, batch, batch_idx):
        X, Y = batch

        output = self.model(X).squeeze()

        return (output, Y.squeeze())

    def test_epoch_end(self, test_step_outputs):
        all_outputs = torch.cat([o[0] for o in test_step_outputs]).cpu()
        all_labels = torch.cat([o[1] for o in test_step_outputs]).cpu()

        self.test_results = {'proba': all_outputs, 'labels': all_labels}
        try:
            test_auc = roc_auc_score(all_labels, all_outputs)
            self.test_results['auc'] = test_auc
            self.log('test_auc', test_auc)
        except ValueError:
            pass

In [5]:
def do_fold(train_ds, test_ds, model_name='resnet', trainer_params={}):
    # data loaders
    data_loader_train = torch.utils.data.DataLoader(
        train_ds, batch_size=100, shuffle=True, num_workers=10,
        collate_fn=None)
    data_loader_val = torch.utils.data.DataLoader(
        test_ds, batch_size=100, shuffle=False, num_workers=10,
        collate_fn=None)

    system = System(model_name)
    trainer_fn = partial(pl.Trainer, **trainer_params)
    trainer = trainer_fn(
        callbacks=[EarlyStopping(monitor="val_loss", mode="min")] + trainer_params.get('callbacks', []),
        accelerator='gpu',
        log_every_n_steps=1,
        max_epochs=-1)
    trainer.fit(system, data_loader_train, data_loader_val)

    trainer.test(system, data_loader_val)
    return system.test_results

In [6]:
def get_metrics(outputs, labels, type='binary'):
    if type == 'binary':
        proba = torch.sigmoid(outputs)
        pred = (proba > 0.5)

        correct = pred.eq(outputs.bool()).sum().item()
        return {
            'auc': roc_auc_score(labels, proba),
            'correct': correct
        }
    elif type == 'regression':
        return {
            'mse': torch.nn.functional.mse_loss(outputs, labels, reduction='mean'),
            'l1': torch.nn.functional.l1_loss(outputs, labels, reduction='mean')
        }

In [7]:
def do_run(dataset, model_name, metrics_name='binary'):
    
    seed = 22
    cv_splits = KFold(n_splits=2, random_state=seed, shuffle=True).split(range(len(ds)))

    outputs = torch.empty((len(ds),))
    for f, (train_idx, test_idx) in enumerate(cv_splits):
        # create datasets    
        train_ds = Subset(dataset, train_idx)
        test_ds = Subset(dataset, test_idx)

        fold_outputs = do_fold(train_ds, test_ds, model_name)
        outputs[test_idx] = fold_outputs['proba'].cpu()
        clear_output(wait=True)

    labels = torch.Tensor(ds.get_all_labels())
    run_metrics = get_metrics(outputs, labels, metrics_name)
    return outputs, run_metrics

In [23]:
# dataset loading
audioset_examples = pd.read_csv('./data/audioset/examples.csv')
audioset_audios = pickle.load(open(os.path.join(audioset_data_path, 'audioset_audios.pkl'), 'rb'))

In [24]:
asds = SwitchBoardLaughterDataset(
    df=audioset_examples,
    audios=audioset_audios,
    feature_fn=partial(featurize_melspec, hop_length=186),
    sr=8000,
    subsample_length=1,
    id_column='yt_id',
    label_column='laughter')

df: 19354, audios: 15947, not found: 3407
df: 15947, audios: 15947, not found: 3407


In [26]:
asds[0][0].shape

(1, 44, 128)

# Train the audioset model

In [5]:
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import ShuffleSplit

In [6]:
# set seeds
pl.utilities.seed.seed_everything(22)

Global seed set to 22


22

In [7]:
# saves top-K checkpoints based on "val_loss" metric
checkpoint_callback = ModelCheckpoint(
    save_top_k=3,
    monitor="val_loss",
    mode="min",
    dirpath="./pretrained_audioset/",
    filename="audioset-{epoch:02d}-{val_loss:.2f}",
)

In [8]:
train_idx, test_idx = next(iter(ShuffleSplit(n_splits=1, test_size=0.15, random_state=22).split(range(len(ds)))))
train_ds = Subset(ds, train_idx)
test_ds = Subset(ds, test_idx)
fold_outputs = do_fold(train_ds, test_ds, 'resnet',
    trainer_params={'callbacks': [checkpoint_callback]})

NameError: name 'ds' is not defined

# Make audios into a single file

In [9]:
import librosa
from constants import laughter_data_path
import audio_utils

In [10]:
laughter_data_path

'/mnt/c/Users/Jose/gdrive/data/lared_laughter/laughter_data/ml_datasets/tight'

In [46]:
audios_path = os.path.join(laughter_data_path, 'audio')
all_audioset_files = librosa.util.find_files(audios_path, ext=['wav'])
audios = audio_utils.parallel_load_audio_batch(all_audioset_files, n_processes=8, sr=8000)

100%|██████████| 504/504 [00:04<00:00, 102.11it/s]


In [49]:
for i in range(len(audios)):
    audios[i] = librosa.resample(audios[i], orig_sr=44100, target_sr=8000)

  audios[i] = librosa.resample(audios[i], 44100, 8000)


In [50]:
h = {}
for i in range(len(all_audioset_files)):
    f = all_audioset_files[i]
    f = os.path.basename(f)[:-4]
    h[f] = audios[i]

In [51]:
with open(os.path.join(laughter_data_path, "lared_audios.pkl"), "wb") as f:
    pickle.dump(h, f)

# Extract features for the LaRed dataset

In [60]:
from utils import FeatureRecorder

In [86]:
# load the model
model = System.load_from_checkpoint('./pretrained_audioset/audioset-epoch=09-val_loss=0.46.ckpt')
recorder = FeatureRecorder()
model.model.bn2.register_forward_hook(recorder.get_hook('resnet_bigger_bn2'))

training with dropout=0.5
training with dropout=0.5


<torch.utils.hooks.RemovableHandle at 0x7f700c8ea790>

In [87]:
examples = pd.read_csv('../dataset/computational_examples.csv')
examples = examples[examples['condition'] == 'av']
audios_path = os.path.join(laughter_data_path, "lared_audios.pkl")
audios = pickle.load(open(audios_path, 'rb'))

In [88]:
examples.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,person,cam,hit_id,condition,calibration,hash,ini_time,end_time,...,gt_offset,gt_laughter,is_laughter,confidence,intensity,attempt,pressed_key,onset,offset,rating_hash
4,1058,1058,25,1,f4c9842cec7be99eeaaea36d0c7d077c4d5d94596dc731...,av,False,1170917790b51bc5a8dacacc4d8ed8c410b7ea6bb7ea4b...,7360.29,7361.54,...,4.420238,True,True,7,7,0,True,2.792656,3.893757,bf6cd2aeaf7c77c2c2ff873e6f603b7d46cd64c74e9ebd...
5,2715,2715,25,1,eecc0cf5d634ce45a98cbbda30c922f2a2cfcb1877124c...,av,False,1170917790b51bc5a8dacacc4d8ed8c410b7ea6bb7ea4b...,7360.29,7361.54,...,4.420238,True,True,4,5,0,True,2.692556,4.160691,5e161ddc0b4b35ca47cf769cf612f6d48015bb3b95763e...
10,582,582,35,3,4198c11729cea33268040a725998f16478a6564d4af091...,av,False,11bc9d8aca57ab2aef4c5305b080fa49c08665d9e94190...,2216.02,2216.54,...,3.92886,True,True,7,2,0,True,2.85939,4.227424,9a3c519923fbcec61d8195147439f87b23d34e4e777ef5...
11,2430,2430,35,3,a9760ede24043c59a0151b09a46e866fa43f74bd60b682...,av,False,11bc9d8aca57ab2aef4c5305b080fa49c08665d9e94190...,2216.02,2216.54,...,3.92886,True,True,7,3,0,True,2.759289,3.493357,f82d2bc978d5b249847debaa987922931259c56f12b1af...
16,1001,1001,1,4,f4c9842cec7be99eeaaea36d0c7d077c4d5d94596dc731...,av,False,c1d181e74dbdbce1e51d7d0bfd6e036913896dd1f22856...,3346.3,3347.7,...,3.255518,True,True,7,7,0,True,2.158689,3.193057,955ec127e6f86edf3dbb800ad217e18b1f1ec380090a4b...


In [89]:
# load the dataset
ds = SwitchBoardLaughterDataset(
    df=examples,
    audios=audios,
    feature_fn=partial(featurize_melspec, hop_length=186),
    sr=8000,
    subsample_length=1.0,
    id_column='hash',
    label_column='pressed_key')
data_loader_train = torch.utils.data.DataLoader(
        ds, batch_size=100, shuffle=False, num_workers=10,
        collate_fn=None)

df: 672, audios: 504, not found: 0
df: 672, audios: 504, not found: 0


In [90]:
model.eval()
recorder.clear()
for X, y in data_loader_train:
    y_hat = model(X)
recorder.store_as_dict('./features/resnet_bigger.pkl', dict_keys=examples.hash.to_list())

# inspect features

In [1]:
import pickle

In [2]:
features = pickle.load(open('./features/resnet_bigger.pkl', 'rb'))

In [3]:
features['resnet_bigger_bn2']['cbc382abda5165dd26a7ca9e05e7ed3000933489864cb9ed1c25fd4da2a25d19'].shape

(64,)

In [12]:
outputs, metrics = do_run(ds, 'resnet', 'binary')

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type         | Params
---------------------------------------
0 | model | ResNetBigger | 221 K 
---------------------------------------
221 K     Trainable params
0         Non-trainable params
221 K     Total params
0.887     Total estimated model params size (MB)


training with dropout=0.5
training with dropout=0.5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

In [130]:
metrics

{'auc': 0.8161846056582899, 'correct': 308}

In [131]:
outputs, metrics = do_run(ds, 'alexnet', 'binary')

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type      | Params
------------------------------------
0 | model | MyAlexNet | 180 K 
------------------------------------
180 K     Trainable params
0         Non-trainable params
180 K     Total params
0.724     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_auc            0.8045634920634921
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [132]:
metrics

{'auc': 0.8096451714872768, 'correct': 326}