In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import glob
import librosa
import numpy as np
from tqdm.auto import tqdm
import os
import random
import wave
import matplotlib.pyplot as plt

In [2]:
def plot_wavs_in_folder(folder_path):
    """
    Reads all WAV files in a folder, converts them to numpy arrays, and plots their waveforms.
    """
    # Iterate through all files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.wav'):  # Process only .wav files
            file_path = os.path.join(folder_path, filename)
            try:
                # Open the WAV file
                wf = wave.open(file_path, 'rb')  

                # Get audio parameters
                nframes = wf.getnframes()
                framerate = wf.getframerate()

                # Read the audio data
                audio_data = wf.readframes(nframes)
                audio_array = np.frombuffer(audio_data, dtype=np.int16)
                
                # Normalize the data
                audio_array = audio_array / (2 ** (wf.getsampwidth() * 8 - 1))

                # Time axis for plotting
                time_axis = np.arange(0, len(audio_array)) / framerate

                # Plot the waveform
                plt.figure(figsize=(10, 4))
                plt.plot(time_axis, audio_array)
                plt.xlabel("Time (seconds)")
                plt.ylabel("Amplitude")
                plt.title(f"Waveform: {filename}")
                plt.grid()
                plt.tight_layout()
                plt.show()
            except Exception as e:
                print(f"Error processing {file_path}: {e}")

# Provide the folder path containing the .wav files
# folder_path = '../data/archers/'
# plot_wavs_in_folder(folder_path)

In [3]:
def gather_data(data_root):
    """
    Scans `data_root` directory. Each subfolder is a class label.
    Returns:
      - all_files: list of (wav_path, label_index) pairs
      - classes: list of class names (strings) in sorted order
    """
    classes = sorted(os.listdir(data_root))
    class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
    
    all_files = []
    for cls_name in classes:
        cls_folder = os.path.join(data_root, cls_name)
        # Gather all WAV files in this folder
        wav_files = glob.glob(os.path.join(cls_folder, "*.wav"))
        for wav_file in wav_files:
            all_files.append((wav_file, class_to_idx[cls_name]))
    
    return all_files, classes


data_root = "../../data"  # Path to your data folder
all_files, classes = gather_data(data_root)
num_classes = len(classes)
print("Found classes:", classes)
print("Total examples:", len(all_files))


Found classes: ['_silence', 'archers', 'arrows', 'fireball', 'giant', 'knight', 'mini_pekka', 'minions', 'musketeer']
Total examples: 395


In [4]:
random.shuffle(all_files)  # Shuffle in-place

split_idx = int(0.8 * len(all_files))  # 80% for train
train_files = all_files[:split_idx]
val_files   = all_files[split_idx:]

print(f"Train size: {len(train_files)}")
print(f"Val   size: {len(val_files)}")

Train size: 316
Val   size: 79


In [5]:
class SpeechCommandsDataset(Dataset):
    def __init__(self, file_list, n_mfcc=12, sr=16000, augment=False):
        """
        file_list: list of (wav_path, label_index)
        n_mfcc: number of MFCC coefficients
        sr: sample rate to which audio is (optionally) resampled
        augment: if True, apply random augmentations to training data
        """
        self.file_list = file_list
        self.n_mfcc = n_mfcc
        self.sr = sr
        self.augment = augment

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        wav_path, label = self.file_list[idx]
        
        # Load audio
        waveform, sr = librosa.load(wav_path, sr=self.sr)

        # Trim silence
        if label != classes.index('_silence'):
            waveform, _ = librosa.effects.trim(waveform, top_db=20)


        # If augment is enabled, apply random transformations
        if self.augment and len(waveform) > 2048:
            # Random time-stretch (speed up/down by up to +/-10%)
            if random.random() < 0.5:
                rate = 1.0 + np.random.uniform(-0.1, 0.1)  # e.g., 0.9 to 1.1
                waveform = librosa.effects.time_stretch(waveform, rate=rate)

            # Random pitch shift (up/down by up to +/-2 semitones)
            if random.random() < 0.5:
                
                n_steps = np.random.uniform(-2, 2)
                waveform = librosa.effects.pitch_shift(waveform, sr=sr, n_steps=n_steps)

            # Random time shift
            # For example, shift by up to 10% of the wave length
            if random.random() < 0.5:
                shift_max = int(0.1 * len(waveform))
                shift = np.random.randint(-shift_max, shift_max)
                waveform = np.roll(waveform, shift)

            # Random background noise injection
            if random.random() < 0.5:
                noise_level = np.random.uniform(0.01, 0.02)  # Adjust range as needed
                noise = np.random.randn(len(waveform)) * noise_level
                waveform = waveform + noise

        # Now compute MFCC
        mfcc = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=self.n_mfcc, n_fft=1024)
        mfcc = mfcc.T  # shape: (time_frames, n_mfcc)

        # Convert to tensors
        mfcc_tensor = torch.tensor(mfcc, dtype=torch.float32)
        label_tensor = torch.tensor(label, dtype=torch.long)

        return mfcc_tensor, label_tensor

In [6]:
def collate_fn(batch):
    """
    batch: list of (mfcc_tensor, label_tensor) pairs
    We pad MFCC tensors on the time dimension (dim=0) so they have the same length.
    """
    mfccs = [item[0] for item in batch]
    labels = [item[1] for item in batch]

    # Find max sequence length in this batch
    max_len = max(m.shape[0] for m in mfccs)
    n_mfcc  = mfccs[0].shape[1]  # number of MFCC coefficients

    padded_mfccs = []
    for m in mfccs:
        length = m.shape[0]
        pad_length = max_len - length
        if pad_length > 0:
            pad = torch.zeros(pad_length, n_mfcc)
            m = torch.cat([m, pad], dim=0)
        padded_mfccs.append(m)

    # Stack along batch dimension
    padded_mfccs = torch.stack(padded_mfccs, dim=0)
    labels = torch.stack(labels)

    return padded_mfccs, labels


In [7]:
batch_size = 8  # Small batch size, given few examples per class

train_dataset = SpeechCommandsDataset(train_files, n_mfcc=12, sr=16000, augment=True)
val_dataset   = SpeechCommandsDataset(val_files,   n_mfcc=12, sr=16000, augment=False)

train_loader = DataLoader(train_dataset, 
                          batch_size=batch_size, 
                          shuffle=True, 
                          collate_fn=collate_fn)

val_loader   = DataLoader(val_dataset, 
                          batch_size=batch_size, 
                          shuffle=False,
                          collate_fn=collate_fn)


In [8]:
class CNN(nn.Module):
    def __init__(self, num_classes, in_channels=1):
        super(CNN, self).__init__()
        
        # Example: a small, 3-layer CNN
        # You can increase depth/channels for better performance
        # but be mindful of overfitting if the dataset is small.
        
        self.features = nn.Sequential(
            nn.Conv2d(in_channels, 16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # halve time and mfcc dims

            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # halve time and mfcc dims

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            # Optionally another pooling
            # nn.MaxPool2d(kernel_size=2)
        )
        
        # After the last conv, we do adaptive average pooling so that
        # we get a fixed-size 2D feature map, regardless of input time dimension.
        # Then we flatten and do a fully connected layer to classify.
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.classifier  = nn.Linear(64, num_classes)

    def forward(self, x):
        # x is originally (batch_size, time, n_mfcc)
        # We need to add a channel dimension => (batch_size, 1, time, n_mfcc)
        x = x.unsqueeze(1)
        
        # Pass through convolutional layers
        x = self.features(x)
        
        # Global average pooling -> (batch_size, 64, 1, 1)
        x = self.global_pool(x)
        
        # Flatten -> (batch_size, 64)
        x = x.view(x.size(0), -1)
        
        # Classify
        x = self.classifier(x)
        return x

# Instantiate the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

input_size = 12      # Because we used n_mfcc=12
hidden_size = 48     # Hyperparameter - tune as needed
num_layers = 2       # Hyperparameter - tune as needed
learning_rate = 1e-3
model = CNN(num_classes=num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


  return torch._C._cuda_getDeviceCount() > 0


Using device: cpu


In [9]:
def train_one_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    for mfccs, labels in tqdm(dataloader, desc="Training", leave=False):
        mfccs, labels = mfccs.to(device), labels.to(device)
        
        # Forward pass
        outputs = model(mfccs)
        loss = criterion(outputs, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * mfccs.size(0)
        
        # Compute accuracy
        _, predicted = torch.max(outputs, dim=1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
    
    epoch_loss = running_loss / total
    epoch_acc  = correct / total
    return epoch_loss, epoch_acc


def evaluate(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for mfccs, labels in tqdm(dataloader, desc="Evaluating", leave=False):
            mfccs, labels = mfccs.to(device), labels.to(device)
            
            outputs = model(mfccs)
            loss = criterion(outputs, labels)

            running_loss += loss.item() * mfccs.size(0)
            
            _, predicted = torch.max(outputs, dim=1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    val_loss = running_loss / total
    val_acc  = correct / total
    return val_loss, val_acc


In [26]:
num_epochs = 30
best_loss = float("inf")
best_model_weights = None
for epoch in range(num_epochs):
    print(f"\nEpoch [{epoch+1}/{num_epochs}]")
    
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    
    if val_loss < best_loss:
        best_loss = val_loss
        best_model_weights = model.state_dict()

    print(f"Train Loss: {train_loss:.4f}  |  Train Acc: {train_acc:.4f}")
    print(f"Val   Loss: {val_loss:.4f}  |  Val   Acc: {val_acc:.4f}")



Epoch [1/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.5298  |  Train Acc: 0.8481
Val   Loss: 0.1746  |  Val   Acc: 0.9873

Epoch [2/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.6642  |  Train Acc: 0.7848
Val   Loss: 0.1136  |  Val   Acc: 1.0000

Epoch [3/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.5626  |  Train Acc: 0.8165
Val   Loss: 0.2033  |  Val   Acc: 0.9114

Epoch [4/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.5641  |  Train Acc: 0.8323
Val   Loss: 0.1487  |  Val   Acc: 0.9747

Epoch [5/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.4603  |  Train Acc: 0.8354
Val   Loss: 0.1605  |  Val   Acc: 0.9620

Epoch [6/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.5783  |  Train Acc: 0.8006
Val   Loss: 0.0977  |  Val   Acc: 0.9873

Epoch [7/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.4947  |  Train Acc: 0.8386
Val   Loss: 0.1091  |  Val   Acc: 0.9873

Epoch [8/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.4870  |  Train Acc: 0.8513
Val   Loss: 0.1389  |  Val   Acc: 0.9747

Epoch [9/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.5812  |  Train Acc: 0.8354
Val   Loss: 0.1064  |  Val   Acc: 1.0000

Epoch [10/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.5763  |  Train Acc: 0.8070
Val   Loss: 0.1015  |  Val   Acc: 0.9747

Epoch [11/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.4877  |  Train Acc: 0.8576
Val   Loss: 0.1642  |  Val   Acc: 0.9620

Epoch [12/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.5096  |  Train Acc: 0.8228
Val   Loss: 0.1687  |  Val   Acc: 0.9747

Epoch [13/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.4758  |  Train Acc: 0.8576
Val   Loss: 0.1065  |  Val   Acc: 0.9620

Epoch [14/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.5176  |  Train Acc: 0.8323
Val   Loss: 0.1513  |  Val   Acc: 0.9747

Epoch [15/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.4639  |  Train Acc: 0.8608
Val   Loss: 0.1679  |  Val   Acc: 0.9620

Epoch [16/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.4866  |  Train Acc: 0.8323
Val   Loss: 0.1821  |  Val   Acc: 0.9367

Epoch [17/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.4716  |  Train Acc: 0.8576
Val   Loss: 0.1097  |  Val   Acc: 0.9494

Epoch [18/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.4424  |  Train Acc: 0.8671
Val   Loss: 0.2188  |  Val   Acc: 0.9367

Epoch [19/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.4883  |  Train Acc: 0.8576
Val   Loss: 0.1613  |  Val   Acc: 0.9620

Epoch [20/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.5134  |  Train Acc: 0.8513
Val   Loss: 0.1511  |  Val   Acc: 0.9620

Epoch [21/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.3948  |  Train Acc: 0.8671
Val   Loss: 0.1398  |  Val   Acc: 0.9747

Epoch [22/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.5120  |  Train Acc: 0.8291
Val   Loss: 0.0890  |  Val   Acc: 1.0000

Epoch [23/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.4669  |  Train Acc: 0.8639
Val   Loss: 0.0791  |  Val   Acc: 0.9747

Epoch [24/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.4780  |  Train Acc: 0.8323
Val   Loss: 0.1262  |  Val   Acc: 0.9747

Epoch [25/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.4918  |  Train Acc: 0.8386
Val   Loss: 0.1130  |  Val   Acc: 0.9873

Epoch [26/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.4454  |  Train Acc: 0.8481
Val   Loss: 0.1466  |  Val   Acc: 0.9494

Epoch [27/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.3641  |  Train Acc: 0.8829
Val   Loss: 0.1501  |  Val   Acc: 0.9494

Epoch [28/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.4500  |  Train Acc: 0.8449
Val   Loss: 0.1579  |  Val   Acc: 0.9494

Epoch [29/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.4304  |  Train Acc: 0.8544
Val   Loss: 0.0767  |  Val   Acc: 0.9873

Epoch [30/30]


Training:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Train Loss: 0.5056  |  Train Acc: 0.8418
Val   Loss: 0.1341  |  Val   Acc: 0.9747


In [37]:
best_loss

0.07674016758705242

In [38]:
torch.save(best_model_weights, "cnn_model.pt")


In [10]:
def predict(model, wav_path, classes, device='cuda', sr=16000, n_mfcc=12):
    """
    Predict the class for a single .wav audio file.

    Args:
        model (nn.Module): Trained PyTorch model.
        wav_path (str): Path to the .wav file.
        classes (list): List of class names (strings), where index=label.
        device (str): 'cpu' or 'cuda' device.
        sr (int): Sample rate to which the audio is (optionally) resampled.
        n_mfcc (int): Number of MFCC features to compute.

    Returns:
        predicted_label (str): The predicted class name.
        confidence (float): Softmax confidence for the predicted class.
    """
    model.load_state_dict(torch.load("/home/linden/Desktop/projects/cr_prediction/src/model/cnn_model.pt", map_location=device))

    # Put model in eval mode
    model.eval()

    # Load and preprocess audio
    waveform, sr = librosa.load(wav_path, sr=sr)       # waveform: float32 numpy array
    mfcc = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=n_mfcc, n_fft=1024)  # shape: (n_mfcc, time_frames)
    mfcc = mfcc.T  # shape: (time_frames, n_mfcc)
    
    # Convert to Torch tensor, add batch dimension => (1, time_frames, n_mfcc)
    mfcc_tensor = torch.tensor(mfcc, dtype=torch.float32).unsqueeze(0).to(device)

    with torch.no_grad():
        # Forward pass
        outputs = model(mfcc_tensor)  # shape: (1, num_classes)
        
        # Optionally compute softmax to get probabilities
        probs = torch.softmax(outputs, dim=1)
        
        # Get predicted class index and its confidence
        predicted_idx = torch.argmax(probs, dim=1).item()
        confidence = probs[0, predicted_idx].item()

    # Map index back to class label
    predicted_label = classes[predicted_idx]
    
    return predicted_label, confidence

In [44]:
predict(model=model, wav_path='/home/linden/Desktop/projects/cr_prediction/vad_segments/6.wav', classes=classes, device='cpu')

  model.load_state_dict(torch.load("/home/linden/Desktop/projects/cr_prediction/src/model/cnn_model.pt", map_location=device))


('fireball', 0.8811231851577759)

In [14]:

def sliding_window_inference(
    model,
    audio_array,
    sr,
    classes,
    window_size=1.0,   # seconds
    hop_size=0.5,      # seconds
    n_mfcc=12,
    device="cuda"
):
    """
    Perform chunk-based inference on a long audio array.

    Args:
        model (nn.Module): Trained LSTM model that classifies short segments.
        audio_array (np.ndarray): 1D float array of audio samples.
        sr (int): Sample rate of the audio.
        classes (list[str]): List of class labels (index -> label).
        window_size (float): Duration (seconds) of each chunk.
        hop_size (float): Step size (seconds) between chunks.
        n_mfcc (int): Number of MFCC coefficients to compute per chunk.
        device (str): 'cpu' or 'cuda'.

    Returns:
        A list of tuples (start_time, end_time, predicted_label, confidence).
        - start_time and end_time are float seconds indicating the chunk.
        - predicted_label is the string label predicted.
        - confidence is the softmax probability for that label.
    """
    model.eval()
    
    # Convert window/hop from seconds to samples
    window_length = int(window_size * sr)
    hop_length = int(hop_size * sr)
    
    # We'll slide from start=0 up to len(audio_array) - window_length (inclusive)
    outputs = []
    start = 0
    
    while start + window_length <= len(audio_array):
        end = start + window_length
        chunk = audio_array[start:end]
        
        # Compute MFCC for this chunk
        mfcc = librosa.feature.mfcc(y=chunk, sr=sr, n_mfcc=n_mfcc)  # (n_mfcc, time_frames)
        mfcc = mfcc.T  # (time_frames, n_mfcc)

        # Convert to torch tensor with shape (1, time_frames, n_mfcc)
        mfcc_tensor = torch.tensor(mfcc, dtype=torch.float32).unsqueeze(0).to(device)
        
        with torch.no_grad():
            logits = model(mfcc_tensor)  # shape: (1, num_classes)
            probs = torch.softmax(logits, dim=1).squeeze(0)  # (num_classes,)
            pred_idx = torch.argmax(probs).item()
            confidence = probs[pred_idx].item()
            label = classes[pred_idx]
        
        # Convert sample indices to time
        start_time_sec = start / sr
        end_time_sec = end / sr
        
        if confidence > 0.5:
          outputs.append((start_time_sec, end_time_sec, label, confidence))
        
        # Slide the window by hop_length
        start += hop_length

    return outputs


In [None]:
y, sr = librosa.load("../data/test/2.wav", sr=16000)
sliding_window_inference(model, y, sr, classes)

In [None]:
audio_path = '/home/linden/Desktop/projects/cr_prediction/test.wav'
audio, sr = librosa.load(audio_path, sr=16000)

plt.figure(figsize=(10, 4))
librosa.display.waveshow(audio, sr=sr)
plt.title('Audio Waveform')
plt.xlabel('Time (s)')
plt.ylabel('Amplitude')
plt.show()

In [None]:
audio_path = '/home/linden/Desktop/projects/cr_prediction/output.wav'
audio, sr = librosa.load(audio_path, sr=16000)

plt.figure(figsize=(10, 4))
librosa.display.waveshow(audio, sr=sr)
plt.title('Audio Waveform')
plt.xlabel('Time (s)')
plt.ylabel('Amplitude')
plt.show()