In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llmlnknk/train modified/farend_scaled2.wav
/kaggle/input/llmlnknk/train modified/farend_scaled7.wav
/kaggle/input/llmlnknk/train modified/farend_scaled18.wav
/kaggle/input/llmlnknk/train modified/farend_scaled3.wav
/kaggle/input/llmlnknk/train modified/farend_scaled6.wav
/kaggle/input/llmlnknk/train modified/farend_scaled23.wav
/kaggle/input/llmlnknk/train modified/farend_scaled5.wav
/kaggle/input/llmlnknk/train modified/farend_scaled17.wav
/kaggle/input/llmlnknk/train modified/farend_scaled11.wav
/kaggle/input/llmlnknk/train modified/farend_scaled8.wav
/kaggle/input/llmlnknk/train modified/farend_scaled4.wav
/kaggle/input/llmlnknk/train modified/farend_scaled16.wav
/kaggle/input/llmlnknk/train modified/farend_scaled25.wav
/kaggle/input/llmlnknk/train modified/farend_scaled13.wav
/kaggle/input/llmlnknk/train modified/farend_scaled24.wav
/kaggle/input/llmlnknk/train modified/farend_scaled22.wav
/kaggle/input/llmlnknk/train modified/farend_scaled10.wav
/kaggle/input/llmlnkn

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import torch.nn.functional as F

# Check if a GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Custom dataset
class InMemoryDataset(Dataset):
    def __init__(self, farend_speech, nearend_mic_signal, nearend_speech):
        self.farend_speech = farend_speech
        self.nearend_mic_signal = nearend_mic_signal
        self.nearend_speech = nearend_speech

    def __len__(self):
        return len(self.farend_speech)

    def __getitem__(self, idx):
        return (self.nearend_mic_signal[idx], self.nearend_speech[idx])

# Squeeze-and-Excitation (SE) Block
class SEBlock(nn.Module):
    def __init__(self, channels, reduction=8):  # Adjusted reduction for stronger attention
        super(SEBlock, self).__init__()
        self.fc1 = nn.Linear(channels, channels // reduction, bias=False)
        self.fc2 = nn.Linear(channels // reduction, channels, bias=False)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        batch_size, channels, _, _ = x.size()
        y = F.adaptive_avg_pool2d(x, 1).view(batch_size, channels)

        y = self.fc1(y)
        y = self.relu(y)
        y = self.fc2(y)
        y = self.sigmoid(y)

        y = y.view(batch_size, channels, 1, 1)
        return x * y.expand_as(x)

# Enhanced Model with SE, Multihead Attention, and LSTM
class EnhancedAttentionLSTMModel(nn.Module):
    def __init__(self):
        super(EnhancedAttentionLSTMModel, self).__init__()
        
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)  # First convolutional layer
        self.se1 = SEBlock(16)  # SE Block after conv1

        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)  # Second convolutional layer
        self.se2 = SEBlock(32)  # SE Block after conv2
        
        self.pool = nn.MaxPool2d(2, 2)  # Max pooling layer
        
        # Increase the number of LSTM layers and hidden units
        self.lstm = nn.LSTM(input_size=32 * 16 * 16, hidden_size=256, num_layers=3, batch_first=True, bidirectional=True)
        
        # Multihead Attention layer
        self.multihead_attn = nn.MultiheadAttention(embed_dim=256 * 2, num_heads=4)  # Bidirectional LSTM has hidden_size*2
        
        # Fully connected layers
        self.fc1 = nn.Linear(256 * 2, 128)  # LSTM output to FC1
        self.fc2 = nn.Linear(128, 1)  # Output layer
        self.dropout = nn.Dropout(0.3)  # Dropout for regularization

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # Apply conv1 and pool
        x = self.se1(x)  # Apply SE Block after conv1
        
        x = self.pool(F.relu(self.conv2(x)))  # Apply conv2 and pool
        x = self.se2(x)  # Apply SE Block after conv2

        # Flatten the output and prepare for LSTM
        batch_size = x.size(0)
        x = x.view(batch_size, -1)  # Flatten the feature maps

        # Prepare input for LSTM by adding sequence dimension
        x = x.unsqueeze(1)  # Shape becomes (batch_size, 1, features)

        # LSTM layer
        lstm_out, _ = self.lstm(x)  # LSTM output
        
        # Multihead Attention: Apply attention mechanism to focus on important timesteps
        attn_output, _ = self.multihead_attn(lstm_out, lstm_out, lstm_out)  # Self-attention

        # Apply fully connected layers with dropout
        x = self.dropout(F.relu(self.fc1(attn_output[:, 0, :])))  # Use only the first token for final classification
        
        return self.fc2(x)  # Output layer

# Function to calculate power
def calculate_power(signal):
    return torch.mean(signal ** 2)

# Normalize the dataset
def normalize(data):
    mean = np.mean(data)
    std = np.std(data)
    return (data - mean) / std

# Sample data for demonstration purposes
num_samples = 1000  # Adjust as necessary
farend_speech = normalize(np.random.rand(num_samples, 64, 64).astype(np.float32))
nearend_mic_signal = normalize(np.random.rand(num_samples, 64, 64).astype(np.float32))
nearend_speech = normalize(np.random.rand(num_samples, 1).astype(np.float32))

# Create dataset and dataloader
dataset = InMemoryDataset(farend_speech, nearend_mic_signal, nearend_speech)
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)  # Smaller batch size for better convergence

# Instantiate model, define loss and optimizer
model = EnhancedAttentionLSTMModel().to(device)
criterion = nn.MSELoss()  # Use appropriate loss function
optimizer = optim.Adam(model.parameters(), lr=0.00005)  # Adjusted learning rate for better performance

# Training loop
num_epochs = 200  # Increase the number of epochs for better convergence
total_power_input = 0
total_power_output = 0
total_samples = 0

for epoch in range(num_epochs):
    model.train()
    
    for nearend_mic_mag, nearend_speech_mag in train_loader:
        nearend_mic_mag = nearend_mic_mag.to(device).unsqueeze(1)  # Add channel dimension
        nearend_speech_mag = nearend_speech_mag.to(device).unsqueeze(1)  # Shape should be (batch_size, 1)

        optimizer.zero_grad()
        output = model(nearend_mic_mag)

        # Calculate power for the batch
        power_input = calculate_power(nearend_mic_mag)
        power_output = calculate_power(output)

        # Accumulate powers
        total_power_input += power_input.item() * nearend_mic_mag.size(0)  # Scale by batch size
        total_power_output += power_output.item() * nearend_mic_mag.size(0)  # Scale by batch size
        total_samples += nearend_mic_mag.size(0)

        # Backpropagation
        loss = criterion(output, nearend_speech_mag)
        loss.backward()
        optimizer.step()

# After processing all batches for all epochs, compute the average power for the entire training
average_power_input = total_power_input / total_samples
average_power_output = total_power_output / total_samples

# Avoid log(0) by adding a small value to power_output
average_power_output += 1e-10  

# Convert to tensors before calculating ERLE
average_power_input_tensor = torch.tensor(average_power_input, device=device)
average_power_output_tensor = torch.tensor(average_power_output, device=device)

# Calculate combined ERLE for the entire dataset
combined_erle = 10 * torch.log10(average_power_input_tensor / average_power_output_tensor)

# Print the combined ERLE value
print(f'Combined ERLE: {combined_erle.item():.4f} dB')


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Combined ERLE: 45.0247 dB
