In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to local project files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [18]:
import os

# Debugging aid: ask CUDA to run kernel launches synchronously.
# NOTE(review): presumably this has to be set before the process first touches
# CUDA to have any effect, and it slows GPU work - confirm it is still needed.
os.environ.update(CUDA_LAUNCH_BLOCKING="1")

In [15]:
from CLAPWrapper import CLAPWrapper
from utils.dataset import *
from utils.activations import save_dataset_activations

import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
def seed_everything(seed: int) -> None:
    """Seed every random number generator used here and make cuDNN deterministic.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), then configures cuDNN so repeated runs select the same
    algorithms.

    Args:
        seed: Value applied to every generator.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    # Hash randomisation of the running interpreter is fixed at startup; this
    # export only affects Python processes spawned from here on.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not just the current one; this is silently
    # ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode autotunes convolution algorithms and may choose different
    # ones between runs, which defeats reproducibility - keep it disabled.
    torch.backends.cudnn.benchmark = False

In [5]:
# Fix all RNG seeds up front so later cells are repeatable.
seed_everything(seed=42)

In [6]:
# Pick the compute device: the GPU when PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [7]:
# Maps each audio-encoder module path to the callable paired with it.
# NOTE(review): presumably the callable is applied to that module's output
# when activations are recorded - confirm in utils.activations.
module_activation_dict = {
    # Conv blocks 1-6: each gets its own pass-through Identity instance.
    **{
        f'audio_encoder.base.conv_block{block_idx}': nn.Identity()
        for block_idx in range(1, 7)
    },
    # Fully connected layer of the base network.
    'audio_encoder.base.fc1': F.relu,
    # Projection head.
    'audio_encoder.projection.linear1': F.gelu,
    'audio_encoder.projection.linear2': nn.Identity(),
}

In [8]:
def conv(i):
    """Return the module path of conv block ``i`` in the audio encoder base."""
    return f'audio_encoder.base.conv_block{i}'


def proj(i):
    """Return the module path of linear layer ``i`` in the projection head."""
    return f'audio_encoder.projection.linear{i}'


# Module path of the base network's fully connected layer.
fc = 'audio_encoder.base.fc1'

# Module paths grouped into tuples.
# NOTE(review): presumably each tuple is processed in its own pass (the
# progress output reports "Run k/4") - confirm in save_dataset_activations.
module_list = [
    (conv(1), ),
    (conv(2), conv(3)),
    (conv(4), conv(5), conv(6)),
    (fc, proj(1), proj(2)),
]

In [30]:
# Checkpoint file handed to the CLAP wrapper.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
weights_path = "/scratch/pratyaksh.g/clap/CLAP_weights_2022_microsoft.pth"
# Request CUDA from the wrapper only when DEVICE resolved to the GPU.
clap_model = CLAPWrapper(weights_path, use_cuda=(DEVICE == "cuda"))

In [31]:
# Switch the wrapped CLAP network to evaluation mode so that layers such as
# BatchNorm use their tracked running statistics. As the last expression in
# the cell, the call also echoes the module tree as cell output.
clap_model.clap.eval()

CLAP(
  (audio_encoder): AudioEncoder(
    (base): Cnn14(
      (spectrogram_extractor): Spectrogram(
        (stft): STFT(
          (conv_real): Conv1d(1, 513, kernel_size=(1024,), stride=(320,), bias=False)
          (conv_imag): Conv1d(1, 513, kernel_size=(1024,), stride=(320,), bias=False)
        )
      )
      (logmel_extractor): LogmelFilterBank()
      (bn0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv_block1): ConvBlock(
        (conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (conv_block2): ConvBlock(
        (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
 

In [32]:
# Build the dataset whose activations will be recorded.
# NOTE(review): SpeechCommandsDataset presumably comes from the wildcard
# import of utils.dataset, and class_limit=50 presumably caps something per
# class or the number of classes - confirm against utils.dataset.
dataset = SpeechCommandsDataset(class_limit=50)

In [36]:
# Hand the dataset, the CLAP wrapper, the module->activation mapping and the
# grouped module paths to the project helper.
# NOTE(review): presumably this runs the dataset through the model and writes
# the selected modules' activations to disk - confirm in utils.activations.
save_dataset_activations(dataset, clap_model, module_activation_dict, module_list)

Run 1/4:   0%|          | 0/1750 [00:00<?, ?it/s]

Run 2/4:   0%|          | 0/1750 [00:00<?, ?it/s]

Run 3/4:   0%|          | 0/1750 [00:00<?, ?it/s]