In [2]:
#Import Statements

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Conv2D, Conv1D, MaxPooling2D, MaxPooling1D, Dense, 
    Dropout, Flatten, Reshape, LSTM, Bidirectional, 
    GlobalAveragePooling1D, GlobalAveragePooling2D, 
    Input, Lambda
)
import keras.backend as K
import numpy as np
import matplotlib.pyplot as plt
import torch
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
import os


# Hide every CUDA device so the frameworks imported above fall back to the CPU.
# NOTE(review): this runs after the framework imports; it relies on CUDA not
# having been initialised yet -- confirm, or move it above the imports.
os.environ.update({"CUDA_VISIBLE_DEVICES": "-1"})

# CPU-only training: keep mini-batches small.
BATCH_SIZE = 4

# Report the TensorFlow build in use.
print(f"TensorFlow version: {tf.__version__}")
# List the GPUs TensorFlow can see.
print(f"GPU available: {tf.config.list_physical_devices('GPU')}")

  from .autonotebook import tqdm as notebook_tqdm


TensorFlow version: 2.19.0
GPU available: []


## Convolutional Recurrent Neural Networks (CRNN)
A CRNN combines both convolutional layers (for feature extraction from spectrograms) and recurrent layers (to capture temporal dependencies). 

In [3]:
def build_crnn_model(input_shape, num_classes):
    """Build a convolutional-recurrent classifier.

    Three Conv2D/MaxPooling2D stages extract features, the resulting feature
    map is flattened into a sequence whose feature size is the channel count,
    two bidirectional LSTMs summarise that sequence, and a softmax layer
    produces class probabilities.

    Args:
        input_shape: Shape of one sample without the batch axis, as accepted
            by ``Input(shape=...)``. Must describe a rank-3 sample so that
            the Conv2D layers receive rank-4 batches.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    model = Sequential()

    # Declare the input with an explicit Input layer rather than passing
    # ``input_shape`` to the first Conv2D; Keras recommends this form for
    # Sequential models and warns about the keyword variant.
    model.add(Input(shape=input_shape))

    # CNN layers for feature extraction
    model.add(Conv2D(32, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(128, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))

    # Reshape for recurrent layers: both spatial axes are merged into a single
    # sequence axis and the channel axis becomes the per-step feature vector.
    channels = model.output_shape[-1]
    model.add(Reshape((-1, channels)))

    # RNN layers: the first returns the full sequence so the second can
    # consume it; the second returns only its final state.
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Bidirectional(LSTM(64)))

    # Output layer
    model.add(Dense(num_classes, activation='softmax'))

    return model

## Attention-Based Audio Classification
Attention mechanisms extend traditional neural networks by allowing the model to focus on the most relevant parts of the input:

In [11]:
class AttentionLayer(tf.keras.layers.Layer):
    """Scaled dot-product attention over a ``[query, key, value]`` triple.

    Defined at module level (rather than inside the builder function) so the
    class can be referenced, e.g. through ``custom_objects``, when a saved
    model is reloaded.
    """

    def __init__(self, **kwargs):
        # Forward standard layer arguments (name, dtype, trainable, ...) so
        # the layer can be rebuilt from the config Keras stores for it.
        super().__init__(**kwargs)

    def call(self, inputs):
        """Return ``softmax(Q K^T / sqrt(d_k)) V`` for ``inputs = [Q, K, V]``."""
        query, key, value = inputs
        # Scaled dot-product attention
        attention_scores = tf.matmul(query, key, transpose_b=True)
        # Cast the scale to the score dtype so the division also works when
        # the layer computes in something other than float32.
        key_depth = tf.cast(tf.shape(key)[-1], attention_scores.dtype)
        attention_scores = attention_scores / tf.math.sqrt(key_depth)
        attention_weights = tf.nn.softmax(attention_scores, axis=-1)
        return tf.matmul(attention_weights, value)


def build_attention_audio_model(input_shape, num_classes):
    """Build a CNN + self-attention classifier.

    Two Conv2D/MaxPooling2D stages extract features, the feature map is
    flattened into a sequence, dense projections of that sequence serve as
    query/key/value for scaled dot-product self-attention, and the attended
    sequence is average-pooled before the softmax output.

    Args:
        input_shape: Shape of one sample without the batch axis, as accepted
            by ``Input(shape=...)``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled functional ``Model``.
    """
    # Input layer
    inputs = Input(shape=input_shape)

    # CNN feature extraction
    x = Conv2D(32, (3, 3), activation='relu')(inputs)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)

    # Reshape for sequence processing: merge the spatial axes into one
    # sequence axis, keeping channels as the feature dimension.
    reshape_layer = Reshape((-1, x.shape[3]))(x)

    # Self-attention mechanism: three independent projections of the sequence
    query = Dense(64)(reshape_layer)
    key = Dense(64)(reshape_layer)
    value = Dense(64)(reshape_layer)

    # Apply the custom attention layer
    context_vector = AttentionLayer()([query, key, value])

    # Global pooling and classification
    x = GlobalAveragePooling1D()(context_vector)
    outputs = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=outputs)
    return model

## Transfer Learning with Audio Transformers
Pre-trained audio transformers like Wav2Vec2 represent a significant extension beyond traditional models:

In [5]:
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
import torch.nn as nn

class AudioClassifier(nn.Module):
    """Wav2Vec2 backbone followed by a small MLP classification head.

    Args:
        num_classes: Number of output logits.
        model_name: Checkpoint identifier passed to
            ``Wav2Vec2Model.from_pretrained``.
        freeze_backbone: When True (default) the whole Wav2Vec2 model is
            treated as a fixed feature extractor: all of its parameters are
            marked non-trainable and it is run without gradient tracking, so
            only the classification head learns. When False, only the
            backbone's ``feature_extractor`` sub-module is frozen and the
            rest of the backbone receives gradients.
    """

    def __init__(self, num_classes, model_name="facebook/wav2vec2-base",
                 freeze_backbone=True):
        super().__init__()
        self.freeze_backbone = freeze_backbone

        # Load pre-trained Wav2Vec2 model
        self.wav2vec = Wav2Vec2Model.from_pretrained(model_name, local_files_only=False)

        # Keep ``requires_grad`` consistent with how forward() runs the
        # backbone: parameters that never see a gradient are marked frozen.
        frozen_module = self.wav2vec if freeze_backbone else self.wav2vec.feature_extractor
        for param in frozen_module.parameters():
            param.requires_grad = False

        # Size the head from the checkpoint's config instead of hard-coding
        # the hidden width of one particular checkpoint.
        hidden_size = self.wav2vec.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        """Return class logits for the batch ``x`` fed to the Wav2Vec2 model.

        The backbone's ``last_hidden_state`` is averaged over dimension 1
        before being passed to the classification head.
        """
        # Track gradients through the backbone only when it is trainable and
        # the caller has not already disabled autograd.
        track_grad = torch.is_grad_enabled() and not self.freeze_backbone
        with torch.set_grad_enabled(track_grad):
            features = self.wav2vec(x).last_hidden_state
        # Global pooling
        pooled = features.mean(dim=1)
        # Classification
        return self.classifier(pooled)

## Siamese Networks for Speaker Verification
Siamese networks represent an extension that's particularly useful for speaker verification:

In [15]:
def build_siamese_network(input_shape):
    """Build a siamese network that scores a pair of inputs.

    Both inputs pass through the same Conv1D encoder (shared weights). The
    element-wise absolute difference of the two 128-dimensional embeddings is
    mapped to a single sigmoid unit.

    Args:
        input_shape: Shape of one sample without the batch axis, as accepted
            by ``Input(shape=...)``; both inputs use this shape.

    Returns:
        An uncompiled functional ``Model`` taking ``[input_a, input_b]`` and
        producing one sigmoid output per pair.
    """
    # Base network. The input is declared with an explicit Input layer, the
    # form Keras recommends for Sequential models.
    base_network = Sequential([
        Input(shape=input_shape),
        Conv1D(64, 3, activation='relu'),
        MaxPooling1D(3),
        Conv1D(128, 3, activation='relu'),
        MaxPooling1D(3),
        Conv1D(256, 3, activation='relu'),
        GlobalAveragePooling1D(),
        Dense(128, activation='relu')
    ])

    # Create twin inputs
    input_a = Input(shape=input_shape)
    input_b = Input(shape=input_shape)

    # Process both inputs through the same network (shared weights)
    processed_a = base_network(input_a)
    processed_b = base_network(input_b)

    # Element-wise absolute difference between the two embeddings.
    # ``tf.abs`` is used instead of ``keras.backend.abs``: the Keras 3 backend
    # module no longer exposes math helpers, and because ``output_shape`` is
    # given the lambda is not traced at build time, so a missing attribute
    # would only surface once data flows through the model.
    distance = Lambda(
        lambda pair: tf.abs(pair[0] - pair[1]),
        output_shape=lambda shapes: shapes[0]  # same shape as the first embedding
    )([processed_a, processed_b])

    # Output prediction
    prediction = Dense(1, activation='sigmoid')(distance)

    # Connect the inputs with the outputs
    siamese_net = Model(inputs=[input_a, input_b], outputs=prediction)

    return siamese_net

## Self-Supervised Contrastive Learning

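Note that the data augmentation defined here (random noise and time masking, applied on the fly to create different views of the same input) is different from the data augmentation that was applied to the dataset before running the models.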

In [7]:
def contrastive_loss(y_true, y_pred, margin=1.0):
    """Contrastive loss for the self-supervised model.

    Computes
    ``mean(y_true * y_pred**2 + (1 - y_true) * max(margin - y_pred, 0)**2)``,
    so entries with ``y_true == 1`` are penalised by the squared prediction
    and entries with ``y_true == 0`` are penalised only while the prediction
    is below ``margin``.

    Args:
        y_true: Pair labels (0 or 1); cast to the dtype of ``y_pred``.
        y_pred: Predicted values, broadcastable with ``y_true``.
        margin: Threshold beyond which ``y_true == 0`` entries stop
            contributing to the loss.

    Returns:
        Scalar tensor with the mean loss.
    """
    # TensorFlow ops are used directly: the Keras 3 ``keras.backend`` module
    # no longer provides ``square``/``maximum``/``mean``.
    y_pred = tf.convert_to_tensor(y_pred)
    # Integer labels would otherwise fail to multiply with float predictions.
    y_true = tf.cast(y_true, y_pred.dtype)

    square_pred = tf.square(y_pred)
    margin_square = tf.square(tf.maximum(margin - y_pred, 0))
    return tf.reduce_mean(y_true * square_pred + (1 - y_true) * margin_square)

def data_augmentation(x, noise_stddev=0.1, max_mask_fraction=0.2):
    """Create an augmented view of ``x`` with additive noise and time masking.

    Gaussian noise is added to every element, then one contiguous span along
    axis 1 is zeroed. The same span is used for every sample in the batch.

    Args:
        x: Floating-point tensor indexed as ``(batch, time_steps, features)``.
        noise_stddev: Standard deviation of the additive Gaussian noise.
        max_mask_fraction: Exclusive upper bound on the masked span length,
            as a fraction of ``time_steps``.

    Returns:
        Tensor with the same shape and dtype as ``x``.
    """
    x = tf.convert_to_tensor(x)

    # Add random noise in the input's own dtype so non-float32 inputs work.
    x_aug = x + tf.random.normal(
        shape=tf.shape(x), mean=0.0, stddev=noise_stddev, dtype=x.dtype
    )

    # Random time masking
    time_steps = tf.shape(x)[1]
    # An int32 tensor cannot be multiplied by a Python float, so the bound is
    # computed in float32 and truncated back to an integer.
    max_mask_length = tf.cast(
        tf.cast(time_steps, tf.float32) * max_mask_fraction, tf.int32
    )
    # Integer uniform sampling needs minval < maxval; short inputs would
    # otherwise yield an empty range. A bound of 1 means "mask nothing".
    max_mask_length = tf.maximum(max_mask_length, 1)
    mask_length = tf.random.uniform([], 0, max_mask_length, dtype=tf.int32)
    # maxval is exclusive: the +1 lets the masked span reach the final step.
    mask_start = tf.random.uniform(
        [], 0, time_steps - mask_length + 1, dtype=tf.int32
    )

    # 1 outside [mask_start, mask_start + mask_length), 0 inside; broadcast
    # over the batch and feature axes.
    positions = tf.range(time_steps)
    keep = tf.logical_or(
        positions < mask_start, positions >= mask_start + mask_length
    )
    mask = tf.cast(keep, x.dtype)[tf.newaxis, :, tf.newaxis]

    return x_aug * mask

In [8]:
class ContrastiveAudioEncoder(tf.keras.Model):
    """Convolutional encoder trained with an in-batch contrastive objective.

    ``fit`` expects each batch to be an ``(anchor, positive)`` pair of
    tensors; row ``i`` of ``positive`` is treated as the match for row ``i``
    of ``anchor`` and every other row in the batch as a negative.
    """

    def __init__(self, **kwargs):
        # Forward standard Model arguments such as ``name``.
        super().__init__(**kwargs)
        # Encoder architecture: three conv stages, global pooling, and a
        # linear projection to a 128-dimensional embedding.
        self.encoder = Sequential([
            Conv2D(32, 3, activation='relu', padding='same'),
            MaxPooling2D(),
            Conv2D(64, 3, activation='relu', padding='same'),
            MaxPooling2D(),
            Conv2D(128, 3, activation='relu', padding='same'),
            GlobalAveragePooling2D(),
            Dense(128)
        ])
        # Running mean of the training loss, so the value reported by fit()
        # is averaged over the epoch instead of being the last batch's loss.
        self.loss_tracker = tf.keras.metrics.Mean(name="loss")

    @property
    def metrics(self):
        """Metrics Keras resets at the start of every epoch."""
        return [self.loss_tracker]

    def call(self, x):
        """Return the embedding of ``x`` produced by the encoder."""
        return self.encoder(x)

    def data_augmentation(self, x):
        """Return ``x`` plus Gaussian noise (stddev 0.1) in ``x``'s dtype."""
        noise = tf.random.normal(
            shape=tf.shape(x), mean=0.0, stddev=0.1, dtype=x.dtype
        )
        return x + noise

    def contrastive_loss(self, anchor, positive, temperature=0.1):
        """Temperature-scaled cross-entropy over in-batch cosine similarities.

        Each anchor row is classified against all positive rows, with the
        same-index row as the correct class. The loss is computed in the
        anchor-to-positive direction only.

        Args:
            anchor: Embeddings of shape ``(batch, dim)``.
            positive: Embeddings of shape ``(batch, dim)`` aligned with
                ``anchor`` row by row.
            temperature: Divisor applied to the similarity logits.

        Returns:
            Scalar tensor with the mean cross-entropy.
        """
        # Normalize the embeddings so the dot product is a cosine similarity
        anchor = tf.math.l2_normalize(anchor, axis=1)
        positive = tf.math.l2_normalize(positive, axis=1)

        # Pairwise cosine similarity, shape (batch, batch)
        similarity = tf.matmul(anchor, positive, transpose_b=True) / temperature

        # The matching pair for row i sits on the diagonal (column i)
        batch_size = tf.shape(anchor)[0]
        labels = tf.range(batch_size)
        loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels, logits=similarity))

        return loss

    def train_step(self, data):
        """Run one optimisation step on an ``(anchor, positive)`` batch.

        Returns:
            ``{"loss": ...}`` holding the running mean of the loss.
        """
        # Unpack the data
        anchor, positive = data

        # Apply data augmentation to create slightly different views
        anchor_aug = self.data_augmentation(anchor)
        positive_aug = self.data_augmentation(positive)

        with tf.GradientTape() as tape:
            # Get the encodings
            anchor_encoding = self.encoder(anchor_aug, training=True)
            positive_encoding = self.encoder(positive_aug, training=True)

            # Calculate contrastive loss
            loss = self.contrastive_loss(anchor_encoding, positive_encoding)

        # Get gradients and update weights
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

### Running the Models

In [9]:
# CRNN demo: build, compile and summarise the model.
# One sample is a 128 x 128 mel spectrogram with a single channel.
input_shape = (128, 128, 1)
# Binary classification (Jeevan vs Not_Jeevan or English vs Not_English).
num_classes = 2

crnn_model = build_crnn_model(input_shape=input_shape, num_classes=num_classes)
crnn_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
crnn_model.summary()

# Training sketch -- enable once `train_dataset` and `val_dataset` exist:
# history = crnn_model.fit(
#    train_dataset,
#    validation_data=val_dataset,
#    epochs=10
# )

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [12]:
# Attention demo: build, compile and summarise the model.
# Uses `input_shape` and `num_classes` as defined in the CRNN example cell.
attention_model = build_attention_audio_model(
    input_shape=input_shape,
    num_classes=num_classes,
)
attention_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
attention_model.summary()

In [16]:
# Example usage of the PyTorch Wav2Vec2 model
# This requires PyTorch data preparation
import torch

# Example input dimensions. The batch size comes from the notebook-wide
# BATCH_SIZE constant instead of a second hard-coded literal.
batch_size = BATCH_SIZE
input_length = 16000  # 1 second of audio at 16kHz

# Random stand-in for a batch of raw waveforms (replace with real audio data)
sample_input = torch.randn(batch_size, input_length)

# Initialize the model
classifier = AudioClassifier(num_classes=2)

# Forward pass: one row of logits per waveform
outputs = classifier(sample_input)
print(f"Output shape: {outputs.shape}")

Output shape: torch.Size([4, 2])


In [17]:
# Siamese demo: build, compile and summarise the pair-scoring model.
# Inputs here are 1D feature sequences such as MFCCs, shaped
# (time_steps, num_features): 128 time steps with 13 MFCC features.
# NOTE(review): this rebinds the global `input_shape` used by the CRNN and
# attention cells; re-run the CRNN cell before re-running those.
input_shape = (128, 13)

siamese_model = build_siamese_network(input_shape=input_shape)
siamese_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
siamese_model.summary()

In [18]:
# Example usage of Contrastive Learning model
contrastive_model = ContrastiveAudioEncoder()
contrastive_model.compile(optimizer='adam')

# Random stand-ins for real spectrograms, shaped (batch, height, width, 1).
# The batch size comes from the notebook-wide BATCH_SIZE constant instead of
# a second hard-coded literal.
batch_size = BATCH_SIZE
height, width = 128, 128
example_spectrograms = tf.random.normal((batch_size, height, width, 1))

# Show that the model can process inputs: one embedding per spectrogram
embeddings = contrastive_model(example_spectrograms)
print(f"Embedding shape: {embeddings.shape}")

Embedding shape: (4, 128)
