In [None]:
import zipfile
import pandas as pd
import torch
from nnAudio import features as spectrograms
import io
import torchaudio
import librosa
import matplotlib.pyplot as plt
import numpy as np
from pydub import AudioSegment
import os

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Clear memory cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()

train_data_path = 'data/raw/'
processed_data_path = 'data/processed_data/'

def unzip_to_buffer(zip_ref, track):
    """Open the zipfolder and save file-like object to memory."""
    waveform_binary = zip_ref.read(track)
    file_like_object = io.BytesIO(waveform_binary)

    return file_like_object

def generate_spectrograms(waveforms):
    """Generate spectrograms from a batch of waveforms.
    
    Args:
        waveforms: numpy array of shape (batch_size, samples) or torch tensor
    Returns:
        batch of spectrograms of shape (batch_size, freq_bins, time_frames)
    """
    # If input is a single waveform, add batch dimension
    if len(waveforms.shape) == 1:
        waveforms = waveforms[None, :]
        
    # Convert to tensor if not already
    if not torch.is_tensor(waveforms):
        waveforms = torch.tensor(waveforms)
    
    waveforms = waveforms.to(device)  # Move batch to device
    spec_converter = spectrograms.STFT(n_fft=2048).to(device)
    spectrograms_batch = spec_converter(waveforms)
    spectrograms_batch = spectrograms_batch.cpu().detach().numpy()
    spectrograms_db = librosa.amplitude_to_db(np.abs(spectrograms_batch), ref=np.max)
    
    return spectrograms_db[:, :, :, 0]  # Return all spectrograms, keeping batch dimension

def save_track_data(track_name, spectrograms, path, category, is_training):
    """Save track data to disk."""
    if is_training:
        save_dir = os.path.join(path, 'train', category)
    else:
        save_dir = os.path.join(path, 'test', category)
    os.makedirs(save_dir, exist_ok=True)
    
    for segment_idx, spectrogram in enumerate(spectrograms):
        audio_file_row = pd.DataFrame([{
            'file_name': f'{track_name}_segment_{segment_idx}',
            'spectrogram': spectrogram
        }])
        audio_file_row.to_pickle(os.path.join(save_dir, f'{track_name}_segment_{segment_idx}.pkl'))

def is_training_track(track_components):
    """Check if track is valid for processing."""
    if track_components[0] == "train":
        return True
    else:
        return False

def create_lossless_waveform(wav_buffer):
    """Create lossless waveform from wav buffer."""
    wav_buffer.seek(0)
    waveform = librosa.load(wav_buffer, sr=None)
    return waveform

def create_lossy_waveform(wav_buffer):
    """Create lossy waveform from wav buffer."""
    wav_buffer.seek(0)
    mp3_buffer = io.BytesIO()
    AudioSegment.from_wav(wav_buffer).export(mp3_buffer, format="mp3", bitrate="320k")
    mp3_buffer.seek(0)
    waveform = librosa.load(mp3_buffer, sr=None)
    return waveform

def split_waveform(waveform, sr, segment_duration=6):
    """Split waveform into segments of specified duration.
    Returns numpy array of shape (num_segments, samples_per_segment)"""
    samples_per_segment = sr * segment_duration
    num_segments = int(np.ceil(len(waveform) / samples_per_segment))
    segments = np.zeros((num_segments, samples_per_segment), dtype=np.float32)
    print('\rnum_segments   | ', num_segments, end='', flush=True)
    for i in range(num_segments):
        start = i * samples_per_segment
        end = min(start + samples_per_segment, len(waveform))
        segment = waveform[start:end]
        
        # Pad last segment if needed
        if len(segment) < samples_per_segment:
            padding = np.zeros(samples_per_segment - len(segment), dtype=np.float32)
            segment = np.concatenate([segment, padding])
            
        segments[i] = segment
    
    return segments

def process_track(zip_ref, track, track_name, path, is_training):
    """Process a single track to generate and save spectrograms."""
    wav_buffer = unzip_to_buffer(zip_ref, track)
    
    lossless_waveform, lossless_sr = create_lossless_waveform(wav_buffer)
    lossy_waveform, lossy_sr = create_lossy_waveform(wav_buffer)
    
    # Split waveforms into 30-second segments
    lossless_segments = split_waveform(lossless_waveform, lossless_sr)
    lossy_segments = split_waveform(lossy_waveform, lossy_sr)

    lossless_spectrograms = generate_spectrograms(lossless_segments)
    lossy_spectrograms = generate_spectrograms(lossy_segments)

    save_track_data(track_name, lossless_spectrograms, path, 'lossless', is_training)
    save_track_data(track_name, lossy_spectrograms, path, 'lossy', is_training)

def is_suitable_file(components):
    if components[-1] == 'mixture.wav':
        return True
    else:
        return False

def extract_musdb_mixture_files(raw_data_path, processed_data_path):
    """Extract and process audio files from MUSDB dataset."""
    counter = 0
    print('extracting files from zip', raw_data_path + 'musdb18hq.zip')
    with zipfile.ZipFile(raw_data_path + 'musdb18hq.zip', 'r') as zip_ref:
        for track in zip_ref.namelist():
            components = track.split('/')
                
            if is_suitable_file(components):
                is_training = is_training_track(components)
                track_name = components[1]
                print('Is training = ', is_training, '|  trackname = ', track_name)
                
                process_track(zip_ref, track, track_name, processed_data_path, is_training)
            
                counter += 1
                print(counter, 'files processed')
                print('----------------------------------------')
    print('All audio files processed. Saving to disk')
    
extract_musdb_mixture_files(train_data_path, processed_data_path)


In [1]:
# Import required libraries
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import os
import pandas as pd
import multiprocessing
train_data_path = 'data/processed_data/train'
test_data_path = 'data/processed_data/test'

# Custom dataset class for spectrograms
class SpectrogramDataset(Dataset):
    def __init__(self, path, target_size=(512, 512)):
        self.path = path
        self.samples = []
        self.target_size = target_size

        # Load all spectrogram files and their labels
        for catergory in ['lossy', 'lossless']:
            catergory_path = os.path.join(path, catergory)
            if not os.path.exists(catergory_path):
                raise RuntimeError(f"Directory not found: {catergory_path}")
            for file in os.listdir(catergory_path):
                if file.endswith('.pkl'):
                    self.samples.append({
                        'path': os.path.join(catergory_path, file),
                        'label': 0 if catergory == 'lossless' else 1
                    })
    
    def __len__(self):
        return len(self.samples)
    
    def resize_spectrogram(self, spec):
        # print('\rresizing spectrogram from ', spec.shape, 'to ', self.target_size, end='', flush=True)
        
        # Add batch and channel dimensions before resizing
        spec = spec.unsqueeze(0).unsqueeze(0)  # Shape becomes (1, 1, H, W)
        
        resize_transform = transforms.Compose([
            transforms.Resize(self.target_size)
        ])
        
        spec = resize_transform(spec)
        spec = spec.squeeze(0).squeeze(0)  # Remove the batch and channel dimensions
        return spec

    
    def __getitem__(self, idx):
        try:
            sample = self.samples[idx]
            df = pd.read_pickle(sample['path'])
            
            spectrogram = df['spectrogram'].iloc[0]
            # print(f"\nLoading sample {idx}")
            # print(f"Raw spectrogram shape: {spectrogram.shape}")
            
            # Normalize spectrogram
            spectrogram = (spectrogram - spectrogram.mean()) / (spectrogram.std() + 1e-6)
            
            # Convert to tensor and pad/crop
            spectrogram = torch.FloatTensor(spectrogram)
            spectrogram = self.resize_spectrogram(spectrogram)
            spectrogram = spectrogram.unsqueeze(0)  # Add channel dimension
            # print(f"Final tensor shape (with channel dimension): {spectrogram.shape}")
            
            return spectrogram, sample['label']
        except Exception as e:
            print(f"Error loading sample {idx}: {str(e)}")
            raise

# Define the CNN model
class AudioClassifier(nn.Module):
    def __init__(self, dropout_rate=0.3):
        super(AudioClassifier, self).__init__()
        
        self.conv_layers = nn.Sequential(
            # First conv block
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout2d(dropout_rate),
            
            # Second conv block
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout2d(dropout_rate),
            
            # Third conv block
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout2d(dropout_rate),
            
            # Fourth conv block
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout2d(dropout_rate),
            
            nn.AdaptiveAvgPool2d((1, 1))
        )
        
        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(128, 2)
        )
        
    def forward(self, x):
        x = self.conv_layers(x)
        x = self.fc_layers(x)
        return x
    
# Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AudioClassifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


# Create datasets and dataloaders
train_dataset = SpectrogramDataset(train_data_path)
test_dataset = SpectrogramDataset(test_data_path)


train_loader = DataLoader(
    train_dataset, 
    batch_size=16,  # If memory-constrained
    shuffle=True,
    # num_workers = 1,  # Get number of CPU cores
    pin_memory=True,  # Faster GPU transfer
)
test_loader = DataLoader(
    test_dataset,
    batch_size=16,  
    shuffle=False,
    # num_workers = multiprocessing.cpu_count(),
    pin_memory=True,  
)

In [2]:
# Training loop
max_epochs = 100  # Set a maximum number of epochs
patience = 3  # Number of epochs to wait for improvement
best_accuracy = 0
epochs_without_improvement = 0
epoch = 0

while epochs_without_improvement < patience and epoch < max_epochs:
    print('epoch', epoch)
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    for i, (spectrograms, labels) in enumerate(train_loader):
        print(f'\rProcessing batch {i+1}/{len(train_loader)}', end='', flush=True)
        
        # Clear memory cache
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
         
        spectrograms, labels = spectrograms.to(device, non_blocking=True), labels.to(device, non_blocking=True) # non_blocking=True is allows the data to be loaded in parallel
        
        optimizer.zero_grad(set_to_none=True) # uses None instead of 0 to reduce memory usage
        outputs = model(spectrograms)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
                
        running_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
                
        # Explicitly clear some variables
        del outputs
        del loss
    
    print('')  # New line after batch processing is complete
    epoch_loss = running_loss / len(train_loader)
    accuracy = 100 * correct / total
    print(f'Epoch [{epoch+1}], Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.2f}%')
    
    # Check if accuracy improved
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1
        
    print(f'Best accuracy so far: {best_accuracy:.2f}%')
    print(f'Epochs without improvement: {epochs_without_improvement}')

    # Create models directory if it doesn't exist
    import os
    if not os.path.exists('models'):
        os.makedirs('models')

    # Save the model checkpoint
    checkpoint_path = f'models/model_checkpoint_epoch_{epoch+1}.pth'
    torch.save({
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': epoch_loss,
        'accuracy': accuracy
    }, checkpoint_path)
    print(f'Checkpoint saved to {checkpoint_path}')
    
    epoch += 1

print(f'\nTraining stopped after {epoch} epochs')
if epochs_without_improvement >= patience:
    print(f'Early stopping triggered: no improvement for {patience} consecutive epochs')
elif epoch >= max_epochs:
    print('Maximum number of epochs reached')


In [None]:
# Load the latest checkpoint
checkpoint_files = [f for f in os.listdir('models') if f.startswith('model_checkpoint_epoch_')]
if checkpoint_files:
    # Get the latest checkpoint file
    latest_checkpoint = max(checkpoint_files, key=lambda x: int(x.split('_')[-1].split('.')[0]))
    checkpoint_path = os.path.join('models', latest_checkpoint)
    
    # Load checkpoint
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch']
    print(f'Loaded checkpoint from {checkpoint_path}')
    print(f'Epoch: {start_epoch}, Loss: {checkpoint["loss"]:.4f}, Accuracy: {checkpoint["accuracy"]:.2f}%')
else:
    print('No checkpoint found. Starting from scratch.')

# Evaluate the model on test data
model.eval()  # Set the model to evaluation mode
test_loss = 0.0
all_predictions = []
all_labels = []

print("\nEvaluating model on test data...")
with torch.no_grad():  # Disable gradient calculation for inference
    for spectrograms, labels in test_loader:
        # Move data to device
        spectrograms, labels = spectrograms.to(device), labels.to(device)
        
        # Forward pass
        outputs = model(spectrograms)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        
        # Get predictions
        _, predicted = torch.max(outputs.data, 1)
        
        # Store predictions and labels
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
        
        # Clean up memory
        del outputs
        del loss

# Convert lists to numpy arrays
all_predictions = np.array(all_predictions)
all_labels = np.array(all_labels)

# Calculate metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

test_accuracy = accuracy_score(all_labels, all_predictions) * 100
test_recall = recall_score(all_labels, all_predictions, average='macro') * 100
test_precision = precision_score(all_labels, all_predictions, average='macro') * 100
test_f1 = f1_score(all_labels, all_predictions, average='macro') * 100
test_loss = test_loss / len(test_loader)

print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.2f}%')
print(f'Test Recall: {test_recall:.2f}%')
print(f'Test Precision: {test_precision:.2f}%')
print(f'Test F1 Score: {test_f1:.2f}%')

# Set model back to training mode
model.train()




In [1]:
# Create a FastAPI endpoint for model inference
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn
import base64
import io
import numpy as np
from PIL import Image
import torch
from neural_networks.cnn import AudioClassifier

app = FastAPI()

# Load the trained model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AudioClassifier()
model.load_state_dict(torch.load('models/audio_classifier.pth'))
model.to(device)
model.eval()

class SpectrogramData(BaseModel):
    spectrogram: str  # Base64 encoded spectrogram image

@app.post("/predict")
async def predict(data: SpectrogramData):
    try:
        # Decode base64 string to image
        spectrogram_bytes = base64.b64decode(data.spectrogram)
        spectrogram_img = Image.open(io.BytesIO(spectrogram_bytes))
        
        # Convert to numpy array and preprocess
        spectrogram_array = np.array(spectrogram_img)
        
        # Convert to tensor and add batch & channel dimensions
        spectrogram_tensor = torch.FloatTensor(spectrogram_array).unsqueeze(0).unsqueeze(0)
        
        # Move to device
        spectrogram_tensor = spectrogram_tensor.to(device)
        
        # Get prediction
        with torch.no_grad():
            output = model(spectrogram_tensor)
            _, predicted = torch.max(output.data, 1)
            prediction = predicted.item()
        
        # Convert prediction to label
        label = "abnormal" if prediction == 1 else "normal"
        
        return {
            "prediction": label,
            "confidence": float(torch.softmax(output, dim=1)[0][int(prediction)].item())
        }
        
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Run the API server
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)


RuntimeError: asyncio.run() cannot be called from a running event loop

In [None]:
# Plot all 22 samples from the dataset
import matplotlib.pyplot as plt

# Calculate rows and columns for subplot grid
n_cols = 5
n_rows = (22 + n_cols - 1) // n_cols  # Ceil division to fit all 22 samples

plt.figure(figsize=(20, 20))

# Create new DataLoaders with batch_size=1 specifically for plotting
plot_train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False)
plot_test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# Plot all samples
sample_count = 0
for dataloader, title in [(plot_train_loader, "Train"), (plot_test_loader, "Test")]:
    for spectrograms, _ in dataloader:
        plt.subplot(n_rows, n_cols, sample_count + 1)
        plt.imshow(spectrograms[0][0].cpu().numpy(), aspect='auto', origin='lower')
        plt.title(f'{title} Sample {sample_count + 1}')
        plt.colorbar()
        sample_count += 1
        if sample_count >= 22:  # Stop after 22 samples
            break
    if sample_count >= 22:
        break

plt.tight_layout()
plt.show()

# Print the total number of samples available
print(f"Total training samples: {len(train_dataset)}")
print(f"Total test samples: {len(test_dataset)}")