1. Import Necessary Libraries


In [4]:
import tensorflow as tf
import torch
print("TensorFlow version:", tf.__version__)
print("PyTorch version:", torch.__version__)


TensorFlow version: 2.15.0
PyTorch version: 2.5.0


In [2]:
import sys
print(sys.path)

['/Users/rrenoir/miniforge3/envs/tf-macos-env/lib/python39.zip', '/Users/rrenoir/miniforge3/envs/tf-macos-env/lib/python3.9', '/Users/rrenoir/miniforge3/envs/tf-macos-env/lib/python3.9/lib-dynload', '', '/Users/rrenoir/miniforge3/envs/tf-macos-env/lib/python3.9/site-packages', '/Users/rrenoir/miniforge3/envs/tf-macos-env/lib/python3.9/site-packages/setuptools/_vendor']


Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import librosa
import numpy as np
import torch.nn.functional as F
import tensorflow as tf

# Verify TensorFlow and PyTorch installations
print("TensorFlow version:", tf.__version__)
print("PyTorch version:", torch.__version__)


TensorFlow version: 2.15.0
PyTorch version: 2.5.0


In [8]:
# Step 1: Data Preprocessing (convert audio to spectrogram)
class AudioDataset(Dataset):
    def __init__(self, dataset_path, max_width=400):  # You can adjust max_width
        self.dataset_path = dataset_path
        self.file_list = [os.path.join(dataset_path, file) for file in os.listdir(dataset_path) if file.endswith('.wav')]
        self.max_width = max_width

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        audio_path = self.file_list[idx]
        audio, sr = librosa.load(audio_path, sr=16000)
        # Convert audio to Mel-spectrogram
        spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128)
        spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
        # Normalize spectrogram
        spectrogram = (spectrogram - spectrogram.min()) / (spectrogram.max() - spectrogram.min())
        
        # Check the width (time dimension) of the spectrogram
        width = spectrogram.shape[1]
        if width < self.max_width:
            # Pad if the spectrogram is smaller than max_width
            padding = self.max_width - width
            spectrogram = F.pad(torch.tensor(spectrogram, dtype=torch.float32), (0, padding), "constant", 0)
        else:
            # Truncate if the spectrogram is larger than max_width
            spectrogram = torch.tensor(spectrogram[:, :self.max_width], dtype=torch.float32)
        
        # Ensure the spectrogram has 4 dimensions: (1, channels, height, width)
        spectrogram = spectrogram.unsqueeze(0)  # Add channel dimension

        return spectrogram

# Define path to your dataset folder
dataset_path = '../dataset'

# Create dataset and dataloader
dataset = AudioDataset(dataset_path, max_width=400)  # Adjust max_width according to your needs
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)


In [9]:
# Step 2: CNN-LSTM Model
class CNN_LSTM(nn.Module):
    def __init__(self, num_classes):
        super(CNN_LSTM, self).__init__()
        # CNN layers
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        # LSTM layer
        self.lstm = nn.LSTM(64, 128, batch_first=True)
        # Fully connected layer
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        # Input shape: (batch_size, channels, height, width)
        batch_size, channels, height, width = x.size()
        
        # CNN feature extraction
        x = self.cnn(x)  # After CNN: (batch_size, 64, height, width)
        
        # Reshape for LSTM (batch_size, width, height * channels)
        x = x.permute(0, 3, 1, 2)  # Change to (batch_size, width, channels, height)
        x = x.reshape(batch_size, x.size(1), -1)  # Flatten to (batch_size, width, features)
        
        # LSTM processing
        lstm_out, _ = self.lstm(x)
        
        # Take the output of the last time step
        final_output = lstm_out[:, -1, :]
        
        # Fully connected layer for classification
        out = self.fc(final_output)
        
        return out

In [10]:
# Step 3: Training Loop
def train_model(model, dataloader, epochs=10):
    criterion = nn.CrossEntropyLoss()  # Use appropriate loss function for classification
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for i, spectrograms in enumerate(dataloader):
            spectrograms = spectrograms.squeeze(1)  # Remove extra channel dim
            labels = torch.randint(0, num_classes, (spectrograms.size(0),))  # Dummy labels, replace with actual

            optimizer.zero_grad()
            outputs = model(spectrograms)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % 10 == 9:  # Print every 10 batches
                print(f"Epoch {epoch+1}, Batch {i+1}, Loss: {running_loss / 10}")
                running_loss = 0.0

# Initialize and train the model
num_classes = 2  # Replace with actual number of classes (e.g., slow vs. fast pouring)
model = CNN_LSTM(num_classes)
train_model(model, dataloader)


ValueError: not enough values to unpack (expected 4, got 3)

Extracting features:   0%|          | 0/5638 [00:00<?, ?it/s]

Feature vectors normalized shape: (5638, 416)


AttributeError: module 'tensorflow' has no attribute 'data'