In [3]:
# Convenient for importing modules from the parent directory
import sys
sys.path.append("../../")

In [5]:
import os
import torch
import torchaudio
import numpy as np
import src.models as models
from src import dataloader
from src.utilities.stats import calculate_stats
from IPython.display import Audio, display
import csv
import warnings

class Namespace:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Arguments about the data
data_args = Namespace(
    num_mel_bins = 128,
    target_length = 1024,
    mean = -5.0767093,
    std = 4.4533687,
)

# Arguments about the model
model_args = Namespace(
    model_type = 'base',
    n_classes = 309,
    imagenet_pretrain = False,
    imagenet_pretrain_path = None,
    aum_pretrain = True,
    aum_pretrain_path = 'models/aum-base_audioset-vggsound.pth',
    aum_variant = 'Fo-Bi',
    device = 'cuda',
)

In [8]:
# Initilize the model

# Embedding dimension
if 'base' in model_args.model_type:
    embed_dim = 768
elif 'small' in model_args.model_type:
    embed_dim = 384
elif 'tiny' in model_args.model_type:
    embed_dim = 192

# AuM block type
bimamba_type = {
    'Fo-Fo': 'none', 
    'Fo-Bi': 'v1', 
    'Bi-Bi': 'v2'
}.get(
    model_args.aum_variant, 
    None
)

AuM = models.AudioMamba(
    spectrogram_size=(data_args.num_mel_bins, data_args.target_length),
    patch_size=(16, 16),
    strides=(16, 16),
    embed_dim=embed_dim,
    num_classes=model_args.n_classes,
    imagenet_pretrain=model_args.imagenet_pretrain,
    imagenet_pretrain_path=model_args.imagenet_pretrain_path,
    aum_pretrain=model_args.aum_pretrain,
    aum_pretrain_path=model_args.aum_pretrain_path,
    bimamba_type=bimamba_type,
)

<All keys matched successfully>
Resize function is resample_patch_embed
Initializing FlexiPatchEmbed with the following parameters:
patch_size=(16, 16), in_chans=1, embed_dim=768, bias=True, norm_layer=None, flatten=True, proj_load=yes, resize_func=resample_patch_embed
The resize function is resample_patch_embed
Loading projection weights!
The shapes of the current projection: bias=torch.Size([768]), weight=torch.Size([768, 1, 16, 16])
The shapes of the loaded projection: bias=torch.Size([768]), weight=torch.Size([768, 1, 16, 16])
Initializing FlexiPosEmbed with the following parameters:
input_size=(128, 1024), pos_grid_size=(8, 64), embed_dim=768, pos_embed_load=torch.Size([1, 513, 768]), pos_grid_size_load=(8, 64), n_prefix_tokens=1, pos_embed_prefix=True
Loading position embedding!
The shape of the current grid size: (8, 64)
The shape of the loaded grid size: (8, 64)


In [9]:
model = AuM.from_pretrained("Robzy/audiomamba")

<All keys matched successfully>
Resize function is resample_patch_embed
Initializing FlexiPatchEmbed with the following parameters:
patch_size=[16, 16], in_chans=1, embed_dim=768, bias=True, norm_layer=None, flatten=True, proj_load=yes, resize_func=resample_patch_embed
The resize function is resample_patch_embed
Loading projection weights!
The shapes of the current projection: bias=torch.Size([768]), weight=torch.Size([768, 1, 16, 16])
The shapes of the loaded projection: bias=torch.Size([768]), weight=torch.Size([768, 1, 16, 16])
Initializing FlexiPosEmbed with the following parameters:
input_size=[128, 1024], pos_grid_size=(8, 64), embed_dim=768, pos_embed_load=torch.Size([1, 513, 768]), pos_grid_size_load=(8, 64), n_prefix_tokens=1, pos_embed_prefix=True
Loading position embedding!
The shape of the current grid size: (8, 64)
The shape of the loaded grid size: (8, 64)


In [15]:
from transformers import PretrainedConfig
from typing import List


class AuMConfig(PretrainedConfig):
    model_type = "mamba"

    def __init__(
        self,
        num_mel_bins: int = 128,
        target_length: int = 1024,
        mean: float = -5.0767093,
        std: float = 4.4533687,
        model_type: str = 'base',
        n_classes: int = 309,
        imagenet_pretrain: bool = False,
        imagenet_pretrain_path: bool = None,
        aum_pretrain: bool = True,
        aum_pretrain_path: str = 'models/aum-base_audioset-vggsound.pth',
        aum_variant: str = 'Fo-Bi',
        device: str = 'cuda',
        **kwargs,
    ):

        # Embedding dimension
        if 'base' in model_type:
            embed_dim = 768
        elif 'small' in model_type:
            embed_dim = 384
        elif 'tiny' in model_type:
            embed_dim = 192
        
        # AuM block type
        bimamba_type = {
            'Fo-Fo': 'none', 
            'Fo-Bi': 'v1', 
            'Bi-Bi': 'v2'
        }.get(
            aum_variant, 
            None
        )
            
        self.spectrogram_size=(num_mel_bins, target_length)
        self.patch_size=(16, 16)
        self.strides=(16, 16)
        self.embed_dim=embed_dim
        self.num_classes=n_classes
        self.imagenet_pretrain=imagenet_pretrain
        self.imagenet_pretrain_path=imagenet_pretrain_path
        self.aum_pretrain=aum_pretrain
        self.aum_pretrain_path=aum_pretrain_path
        self.bimamba_type=bimamba_type

        super().__init__(**kwargs)

In [30]:
config = AuMConfig()
config.push_to_hub(repo_id="Robzy/audiomamba")

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/Robzy/audiomamba/commit/313271b51437dececf87e776dcabfbab29cb68ad', commit_message='Upload config', commit_description='', oid='313271b51437dececf87e776dcabfbab29cb68ad', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Robzy/audiomamba', endpoint='https://huggingface.co', repo_type='model', repo_id='Robzy/audiomamba'), pr_revision=None, pr_num=None)

In [33]:
from transformers import PreTrainedModel
from timm.models.resnet import BasicBlock, Bottleneck, ResNet

class AudioMambaModel(PreTrainedModel):
    config_class = AuMConfig()

    def __init__(self, config):
        super().__init__(config)
        self.model = models.AudioMamba(
            spectrogram_size = config.spectrogram_size,
            patch_size = config.patch_size,
            strides = config.strides,
            embed_dim = config.embed_dim,
            num_classes = config.num_classes,
            imagenet_pretrain = config.imagenet_pretrain,
            imagenet_pretrain_path = config.imagenet_pretrain_path,
            aum_pretrain = config.aum_pretrain,
            aum_pretrain_path = config.aum_pretrain_path,
            bimamba_type = config.bimamba_type,
        )

    def forward(self, tensor):
        return self.model(tensor)

In [34]:
model = AudioMambaModel(config)
model.push_to_hub(repo_id="Robzy/audiomamba")

<All keys matched successfully>
Resize function is resample_patch_embed
Initializing FlexiPatchEmbed with the following parameters:
patch_size=(16, 16), in_chans=1, embed_dim=768, bias=True, norm_layer=None, flatten=True, proj_load=yes, resize_func=resample_patch_embed
The resize function is resample_patch_embed
Loading projection weights!
The shapes of the current projection: bias=torch.Size([768]), weight=torch.Size([768, 1, 16, 16])
The shapes of the loaded projection: bias=torch.Size([768]), weight=torch.Size([768, 1, 16, 16])
Initializing FlexiPosEmbed with the following parameters:
input_size=(128, 1024), pos_grid_size=(8, 64), embed_dim=768, pos_embed_load=torch.Size([1, 513, 768]), pos_grid_size_load=(8, 64), n_prefix_tokens=1, pos_embed_prefix=True
Loading position embedding!
The shape of the current grid size: (8, 64)
The shape of the loaded grid size: (8, 64)


model.safetensors: 100%|██████████| 368M/368M [00:22<00:00, 16.3MB/s] 


CommitInfo(commit_url='https://huggingface.co/Robzy/audiomamba/commit/c2585a1f1da62f8ea3fba91388e490f645fd8394', commit_message='Upload model', commit_description='', oid='c2585a1f1da62f8ea3fba91388e490f645fd8394', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Robzy/audiomamba', endpoint='https://huggingface.co', repo_type='model', repo_id='Robzy/audiomamba'), pr_revision=None, pr_num=None)

In [36]:
from transformers import AutoConfig, AutoModel, AutoModelForAudioClassification

AutoConfig.register("audiomamba", AutoConfig)
AutoModel.register(AuMConfig, AutoConfig)
AutoModelForAudioClassification.register(AuMConfig, AutoModelForAudioClassification)

In [None]:
from transformers import AutoConfig, AutoModel, AutoModelForImageClassification

AutoConfig.register("resnet", ResnetConfig)
AutoModel.register(ResnetConfig, ResnetModel)
AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification)