<a href="https://colab.research.google.com/github/harryfrzz/nanoEmbed/blob/main/nanoEmbed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch torchvision torchaudio
!pip install tranformers
!pip install open-clip-torch
!pip install pillow
!pip install timm
!pip install librosa

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import open_clip
import librosa
from transformers import ASTFeatureExtractor, ASTModel
from timm.utils import reparameterize_model
# Pick the compute device once; the rest of the notebook reads the
# module-level `device`. GPU details are reported only when CUDA is present.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB\n")
else:
    device = torch.device("cpu")

KeyboardInterrupt: 

In [None]:
class ProjectionLayer(nn.Module):
    """Small MLP head that maps audio embeddings to the text-embedding width.

    The stack is ``Linear(ast_dim, clip_dim) -> LayerNorm(clip_dim) -> ReLU
    -> Linear(clip_dim, clip_dim)``.

    Args:
        ast_dim: Size of the last dimension of the incoming audio embeddings.
        clip_dim: Size of the last dimension of the produced embeddings.
    """

    def __init__(self, ast_dim=768, clip_dim=768):
        super().__init__()
        stages = [
            nn.Linear(ast_dim, clip_dim),
            nn.LayerNorm(clip_dim),
            nn.ReLU(),
            nn.Linear(clip_dim, clip_dim),
        ]
        # The attribute stays named `projection` so state_dict keys
        # (projection.0.weight, ...) are unchanged.
        self.projection = nn.Sequential(*stages)

    def forward(self, ast_embeddings):
        """Run ``ast_embeddings`` through the projection stack and return the result."""
        projected = self.projection(ast_embeddings)
        return projected

NameError: name 'nn' is not defined

In [None]:
def load_models():
    """Load the pretrained encoders and build a fresh projection head.

    Returns:
        A 5-tuple ``(ast_feature_extractor, ast_model, mobileclip_model,
        tokenizer, projection_layer)``. Both encoders are switched to eval
        mode; only the projection layer is moved to the module-level
        ``device``.
    """
    ast_checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"
    clip_arch = 'MobileCLIP2-S3'

    # Audio side: feature extractor + AST encoder from the same checkpoint.
    ast_feature_extractor = ASTFeatureExtractor.from_pretrained(ast_checkpoint)
    ast_model = ASTModel.from_pretrained(ast_checkpoint)
    ast_model.eval()

    # Text side: only the model and tokenizer are used; the two extra values
    # returned alongside the model are not needed in this notebook.
    mobileclip_model, _, _ = open_clip.create_model_and_transforms(clip_arch, pretrained='dfndr2b')
    mobileclip_model.eval()
    mobileclip_model = reparameterize_model(mobileclip_model)
    tokenizer = open_clip.get_tokenizer(clip_arch)

    # Trainable head bridging the two embedding spaces.
    projection_layer = ProjectionLayer(ast_dim=768, clip_dim=768).to(device)

    return ast_feature_extractor, ast_model, mobileclip_model, tokenizer, projection_layer

In [None]:
def get_ast_embeddings(audio_path, feature_extractor, model):
    """Embed one audio file with the AST encoder.

    Args:
        audio_path: Path of the audio file to load. This is the file that is
            actually read (no fixed path is substituted).
        feature_extractor: Callable turning a waveform into model inputs; it
            is invoked with ``sampling_rate=...`` and ``return_tensors="pt"``.
            If it exposes a ``sampling_rate`` attribute, the audio is
            resampled to that rate; otherwise 16000 Hz is used.
        model: Encoder called as ``model(**inputs)``; its output must expose
            ``last_hidden_state``.

    Returns:
        The hidden state at sequence position 0 for each batch item, i.e.
        ``last_hidden_state[:, 0, :]``, computed without gradient tracking.
    """
    # Resample to the rate the extractor expects so the two always agree.
    target_sr = getattr(feature_extractor, "sampling_rate", 16000)
    waveform, sampling_rate = librosa.load(audio_path, sr=target_sr)
    inputs = feature_extractor(waveform, sampling_rate=sampling_rate, return_tensors="pt")

    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    # Position 0 of the sequence axis is used as the clip-level summary vector.
    return outputs.last_hidden_state[:, 0, :]

In [None]:
def get_text_embeddings(text_list, tokenizer, model):
    """Encode a list of strings into text embeddings without tracking gradients.

    Args:
        text_list: Strings to encode; passed to ``tokenizer`` as-is.
        tokenizer: Callable that converts ``text_list`` into model input.
        model: Object exposing ``encode_text``.

    Returns:
        Whatever ``model.encode_text`` returns for the tokenized input.
    """
    token_ids = tokenizer(text_list)
    with torch.no_grad():
        return model.encode_text(token_ids)

In [None]:
def prepare_batch_data(audio_text_pairs, ast_feature_extractor, ast_model, tokenizer, mobileclip_model):
    """Embed every (audio_path, text) pair and stack the results.

    Args:
        audio_text_pairs: Iterable of ``(audio_path, text_description)`` tuples.
        ast_feature_extractor, ast_model: Forwarded to ``get_ast_embeddings``.
        tokenizer, mobileclip_model: Forwarded to ``get_text_embeddings``.

    Returns:
        ``(audio_embeddings, text_embeddings)``, each the per-pair embeddings
        concatenated along dimension 0 in input order.
    """
    # For each pair the audio embedding is computed first, then the text one.
    per_pair = [
        (
            get_ast_embeddings(audio_path, ast_feature_extractor, ast_model),
            get_text_embeddings([text_description], tokenizer, mobileclip_model),
        )
        for audio_path, text_description in audio_text_pairs
    ]

    audio_rows = [audio_row for audio_row, _ in per_pair]
    text_rows = [text_row for _, text_row in per_pair]
    return torch.cat(audio_rows, dim=0), torch.cat(text_rows, dim=0)


In [None]:
def _alignment_loss(projection_layer, criterion, batch_audio, batch_text_norm):
    """Loss between L2-normalized projected audio and already-normalized text rows."""
    projected_audio = nn.functional.normalize(projection_layer(batch_audio), dim=-1)
    # Every row is a matching (audio, text) pair, so all targets are +1.
    target = torch.ones(len(batch_audio), device=batch_audio.device)
    return criterion(projected_audio, batch_text_norm, target)


def train_projection_layer(projection_layer, ast_model, ast_feature_extractor,
                          mobileclip_model, tokenizer, audio_text_pairs,
                          epochs=100, lr=1e-3, batch_size=16):
    """Train ``projection_layer`` so projected audio embeddings align with text embeddings.

    Audio and text embeddings are computed once up front; only the projection
    layer's parameters are optimized (Adam + ``nn.CosineEmbeddingLoss`` with
    all-positive targets). Mixed precision is used when CUDA is available.

    Args:
        projection_layer: Module to train; expected to live on the module-level ``device``.
        ast_model, ast_feature_extractor: Forwarded to the audio embedding helpers.
        mobileclip_model, tokenizer: Forwarded to the text embedding helpers.
        audio_text_pairs: Non-empty sequence of ``(audio_path, text)`` tuples.
        epochs: Number of passes over the pre-computed embeddings.
        lr: Adam learning rate.
        batch_size: Positive number of pairs per optimization step.

    Returns:
        ``projection_layer``. After a full training run it is in eval mode; if
        the projected-audio and text widths differ it is returned untrained.

    Raises:
        ValueError: If ``audio_text_pairs`` is empty or ``batch_size`` < 1.
    """
    import contextlib
    import time

    if len(audio_text_pairs) == 0:
        raise ValueError("audio_text_pairs must contain at least one (audio_path, text) pair")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    optimizer = optim.Adam(projection_layer.parameters(), lr=lr)
    criterion = nn.CosineEmbeddingLoss()

    use_amp = torch.cuda.is_available()
    scaler = torch.amp.GradScaler('cuda') if use_amp else None

    projection_layer.train()

    # Shape sanity check on the first pair; no gradients are needed for it.
    with torch.no_grad():
        test_ast = get_ast_embeddings(audio_text_pairs[0][0], ast_feature_extractor, ast_model).to(device)
        print(f"1. AST embedding shape:           {test_ast.shape} (device: {test_ast.device})")

        test_projected = projection_layer(test_ast)
        print(f"2. Projected audio shape:         {test_projected.shape} (device: {test_projected.device})")

        test_text = get_text_embeddings([audio_text_pairs[0][1]], tokenizer, mobileclip_model).to(device)
        print(f"3. Text embedding shape:          {test_text.shape} (device: {test_text.device})")

        test_projected_norm = nn.functional.normalize(test_projected, dim=-1)
        test_text_norm = nn.functional.normalize(test_text, dim=-1)
        print(f"4. Normalized projected audio:    {test_projected_norm.shape}")
        print(f"5. Normalized text:               {test_text_norm.shape}")

    print("=" * 60)

    if test_projected_norm.shape[-1] != test_text_norm.shape[-1]:
        print("\n❌ DIMENSION MISMATCH!")
        return projection_layer

    print("\n✅ Dimensions match! Starting GPU training with batching...")
    print("=" * 60 + "\n")

    # Pre-compute all embeddings once; the encoders are not trained here.
    audio_embeddings, text_embeddings = prepare_batch_data(
        audio_text_pairs, ast_feature_extractor, ast_model, tokenizer, mobileclip_model
    )
    audio_embeddings = audio_embeddings.to(device)
    # Text targets never change, so normalize them a single time.
    text_embeddings_norm = nn.functional.normalize(text_embeddings.to(device), dim=-1)

    num_samples = len(audio_embeddings)
    start_time = time.perf_counter()

    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0

        # Fresh shuffle of the sample order each epoch.
        indices = torch.randperm(num_samples)

        for start in range(0, num_samples, batch_size):
            batch_indices = indices[start:start + batch_size]
            batch_audio = audio_embeddings[batch_indices]
            batch_text_norm = text_embeddings_norm[batch_indices]

            optimizer.zero_grad()

            # Same forward pass either way; autocast only wraps it under AMP.
            amp_context = torch.amp.autocast('cuda') if use_amp else contextlib.nullcontext()
            with amp_context:
                loss = _alignment_loss(projection_layer, criterion, batch_audio, batch_text_norm)

            if scaler is not None:
                # Backward with gradient scaling.
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        if (epoch + 1) % 10 == 0:
            avg_loss = total_loss / num_batches
            # Guard against a zero reading on very fast runs.
            elapsed = max(time.perf_counter() - start_time, 1e-9)
            epochs_per_sec = (epoch + 1) / elapsed
            print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}, Speed: {epochs_per_sec:.2f} epochs/sec")

    total_time = time.perf_counter() - start_time
    projection_layer.eval()
    print(f"\n✅ Training complete in {total_time:.2f} seconds!")
    return projection_layer

In [None]:
def audio_text_similarity(audio_path, text_list, ast_feature_extractor, ast_model,
                         projection_layer, mobileclip_model, tokenizer):
    """Score how well each candidate text matches one audio file.

    Both the projected audio embedding and the text embeddings are
    L2-normalized, so the logits are scaled cosine similarities and are not
    biased by differing text-embedding norms.

    Args:
        audio_path: Path of the audio file to score.
        text_list: Candidate descriptions.
        ast_feature_extractor, ast_model: Forwarded to ``get_ast_embeddings``.
        projection_layer: Module mapping audio embeddings into the text space.
        mobileclip_model, tokenizer: Forwarded to ``get_text_embeddings``.

    Returns:
        Tensor of softmax probabilities over ``text_list`` (last dimension),
        one row per audio embedding row.
    """
    ast_embedding = get_ast_embeddings(audio_path, ast_feature_extractor, ast_model).to(device)
    projected_audio = nn.functional.normalize(projection_layer(ast_embedding), dim=-1)

    text_features = get_text_embeddings(text_list, tokenizer, mobileclip_model).to(device)
    # Normalize the text side too, matching how training compares the two.
    text_features = nn.functional.normalize(text_features, dim=-1)

    # 100.0 acts as a fixed logit scale before the softmax over candidates.
    similarity = (100.0 * projected_audio @ text_features.T).softmax(dim=-1)

    return similarity

In [None]:
if __name__ == "__main__":
    # Load the pretrained encoders and an untrained projection head.
    ast_feature_extractor, ast_model, mobileclip_model, tokenizer, projection_layer = load_models()

    # (audio file, caption) pairs used to fit the projection head.
    audio_text_pairs = [
        ("/content/meow.wav", "a cat meowing"),
        ("/content/bark.wav", "a dog barking"),
        ("/content/gunfire.wav","gunfire from a gun"),
        ("/content/gunfire.wav","sound of a gun firing"),
        ("/content/gunfire.wav","a gunshot"),
        ("/content/meow.wav", "sound of a cat"),
        ("/content/bark.wav", "a dog's bark"),
    ]

    # Fit the projection head on the pairs above.
    projection_layer = train_projection_layer(
        projection_layer, ast_model, ast_feature_extractor,
        mobileclip_model, tokenizer, audio_text_pairs, epochs=100
    )

    print("\n" + "=" * 60)
    print("🧪 TESTING AUDIO-TEXT SIMILARITY")
    print("=" * 60)

    # Score one clip against a few candidate captions.
    audio_path = "/content/meow.wav"
    text_options = ["a cat meowing", "a dog barking", "a bird chirping", "gun fire"]

    similarities = audio_text_similarity(
        audio_path, text_options, ast_feature_extractor, ast_model,
        projection_layer, mobileclip_model, tokenizer
    )

    print("\nAudio-Text Similarities:")
    print("-" * 60)
    # Row 0 holds the probabilities for the single audio clip.
    for text, prob in zip(text_options, similarities[0]):
        print(f"  {text:30s} : {prob.item():.4f}")
    print("=" * 60)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

open_clip_model.safetensors:   0%|          | 0.00/996M [00:00<?, ?B/s]

1. AST embedding shape:           torch.Size([1, 768]) (device: cuda:0)
2. Projected audio shape:         torch.Size([1, 768]) (device: cuda:0)
3. Text embedding shape:          torch.Size([1, 768]) (device: cuda:0)
4. Normalized projected audio:    torch.Size([1, 768])
5. Normalized text:               torch.Size([1, 768])

✅ Dimensions match! Starting GPU training with batching...

📦 Pre-computing embeddings for batch training...
Epoch [10/100], Loss: 0.1951, Speed: 8.84 epochs/sec
Epoch [20/100], Loss: 0.1789, Speed: 17.01 epochs/sec
Epoch [30/100], Loss: 0.1752, Speed: 24.04 epochs/sec
Epoch [40/100], Loss: 0.1740, Speed: 30.47 epochs/sec
Epoch [50/100], Loss: 0.1737, Speed: 35.49 epochs/sec
Epoch [60/100], Loss: 0.1735, Speed: 39.48 epochs/sec
Epoch [70/100], Loss: 0.1735, Speed: 44.48 epochs/sec
Epoch [80/100], Loss: 0.1735, Speed: 48.13 epochs/sec
Epoch [90/100], Loss: 0.1735, Speed: 52.52 epochs/sec
Epoch [100/100], Loss: 0.1735, Speed: 56.62 epochs/sec

‚úÖ Training compl