In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import signal
from scipy.io import wavfile

from pydub import AudioSegment
import random
import csv
import torchaudio
from tqdm import tqdm



In [2]:
waveform, sample_rate = torchaudio.load("/data/khood/GitHub/MLAudio/dataset/background/background_0a0a8446-5d0e-4e4d-9f2b-daf3d64ab5e8.wav")
mono_waveform = waveform.mean(dim=0)  # Convert stereo to mono
specgram_transform = torchaudio.transforms.Spectrogram()(mono_waveform)

RuntimeError: Couldn't find appropriate backend to handle uri /data/khood/GitHub/MLAudio/dataset/background/background_0a0a8446-5d0e-4e4d-9f2b-daf3d64ab5e8.wav and format None.

In [None]:
sample_rate2, samples = wavfile.read("/data/khood/GitHub/MLAudio/dataset/background/background_0a0a8446-5d0e-4e4d-9f2b-daf3d64ab5e8.wav")

In [None]:
waveform, sample_rate = torchaudio.load("/data/khood/GitHub/MLAudio/dataset/background/background_0a0a8446-5d0e-4e4d-9f2b-daf3d64ab5e8.wav", normalize=True)
mono_waveform = waveform.mean(dim=0)  # Convert stereo to mono
specgram_transform = torchaudio.transforms.Spectrogram(pad=10)(mono_waveform)

In [None]:
sample_rate

96000

In [None]:
specgram_transform

tensor([[4.8142e-01, 4.9903e-01, 2.9857e-01,  ..., 2.5468e+02, 2.1162e+02,
         1.8403e+01],
        [1.0262e-01, 2.3846e-01, 2.1369e-01,  ..., 8.1516e+01, 5.9499e+01,
         1.8949e+00],
        [1.1044e-02, 1.0026e-01, 9.9514e-02,  ..., 2.9945e+00, 8.1965e-01,
         5.9924e+00],
        ...,
        [2.9327e-05, 1.7324e-08, 1.3643e-08,  ..., 4.2028e-07, 5.6712e-08,
         2.0261e-06],
        [3.1501e-05, 4.1418e-11, 1.3649e-10,  ..., 3.2212e-07, 1.7897e-07,
         2.3794e-06],
        [3.2280e-05, 9.7363e-10, 8.4259e-10,  ..., 9.8445e-08, 5.0479e-09,
         1.7158e-06]])

In [None]:
sample_rate

96000

In [None]:
sample_rate2

96000

In [None]:
waveform[:, 1]

tensor([-0.0079, -0.0003])

In [None]:
samples[0]

array([-19464192,  -5046272], dtype=int32)

In [None]:
#define the device to use
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [None]:
class audioDataloader(Dataset):
    def __init__(
        self,
        index_file: str, 
        header=None,
        transforms=None
    ):
        indexFile = np.array(pd.read_csv(index_file,header=header))
        self.audioFiles = indexFile[:, 0]
        self.audioLabels = indexFile[:, 1]
        self.transforms = transforms
    # frequencies   (Array of sample frequencies):
    #               This represents the array of sample frequencies, i.e., the frequencies at which the spectrogram is calculated.
    #               It corresponds to the y-axis of the spectrogram plot, indicating the different frequency bins.

    # times (Array of segment times):
    #               This represents the array of segment times, i.e., the time points at which each segment of the spectrogram is calculated.
    #               It corresponds to the x-axis of the spectrogram plot, indicating different time points or segments of the signal.

    # spectrogram_data (Spectrogram of x):
    #               This is the actual spectrogram data, representing the magnitude squared of the signal's frequency content at different time segments.
    #               It is a 2D array where the rows correspond to frequency bins (f) and the columns correspond to time segments (t).
    #               The intensity of each element in Sxx represents the magnitude of the frequency component at the corresponding frequency and time.
    def __getitem__(self, index):
        filename = self.audioFiles[index]
        if not os.path.exists(filename):
            raise FileNotFoundError(f"File not found: {filename}")
        
        waveform, sample_rate = torchaudio.load(filename, normalize=True)
        mono_waveform = waveform.mean(dim=0)  # Convert stereo to mono
        specgram_transform = torchaudio.transforms.Spectrogram(pad=10)(mono_waveform)
        # Apply additional transforms if provided
        if self.transforms is not None:
            mono_waveform = self.transforms(mono_waveform)
        
        label_tensor = torch.tensor([self.audioLabels[index]], dtype=torch.float32)
        return (specgram_transform, label_tensor)

    def getFilePath(self, index):
        return self.audioFiles[index]

    def __len__(self):
        return len(self.audioLabels)

In [None]:
data = audioDataloader(index_file="/data/khood/GitHub/MLAudio/dataset/index.csv")

In [None]:
len(data)

60000

In [None]:
data[0]

(tensor([[2.2669e+03, 4.4169e+03, 3.5351e+03,  ..., 2.3675e+01, 5.5956e+01,
          6.0158e+02],
         [3.3614e+02, 1.2306e+03, 9.9287e+02,  ..., 9.7082e+01, 1.4314e+02,
          2.2710e+02],
         [7.2397e+01, 3.5013e+00, 2.9276e+00,  ..., 1.5340e+02, 8.7110e+01,
          2.0501e+00],
         ...,
         [3.9000e-02, 4.2781e-07, 2.3177e-07,  ..., 8.2779e-06, 3.9366e-06,
          2.1210e-02],
         [4.1856e-02, 1.5923e-07, 7.9586e-08,  ..., 2.1311e-06, 5.9349e-07,
          2.2822e-02],
         [4.2859e-02, 7.0431e-07, 4.7279e-08,  ..., 1.8417e-07, 5.7157e-07,
          2.3373e-02]]),
 tensor([0.]))

In [None]:
data[101]

(tensor([[7.8705e-01, 1.5253e+02, 1.2254e-02,  ..., 1.6294e+00, 1.0889e+00,
          1.0590e-01],
         [4.1139e+01, 6.0106e+01, 2.8725e+00,  ..., 1.1671e+00, 3.8079e-01,
          6.2344e-03],
         [3.3439e+02, 8.2917e+01, 4.3786e+01,  ..., 7.5508e-01, 2.5646e-02,
          3.1965e-02],
         ...,
         [2.7232e-02, 1.7836e-05, 1.0976e-05,  ..., 4.1920e-08, 1.7745e-09,
          1.1272e-03],
         [2.7304e-02, 2.1516e-07, 2.5726e-08,  ..., 1.2690e-08, 1.6668e-09,
          1.1888e-03],
         [2.7433e-02, 5.9672e-09, 2.1349e-08,  ..., 7.7819e-11, 2.1615e-11,
          1.2111e-03]]),
 tensor([0.]))

In [None]:
data[0][0].shape

torch.Size([201, 28801])

In [None]:
data[1][0].shape

torch.Size([201, 28801])

In [None]:
data[2][0].shape

torch.Size([201, 28801])

In [None]:
data[3][0].shape

torch.Size([201, 28801])

In [None]:
s = [data[0][0].shape]
f = [data.getFilePath(1)]
for i in tqdm(range(len(data))):
    if data[i][0].shape not in s:
        s.append(data[i][0].shape)
        f.append(data.getFilePath(i))
    break
print(s)
print(f)

  0%|          | 0/60000 [00:00<?, ?it/s]

[torch.Size([201, 28801])]
['/data/khood/GitHub/MLAudio/dataset/background/background_d14f0a00-b86b-49bd-85aa-e5d29649d04d.wav']





In [None]:
batch_size = 100
dataset = torch.utils.data.DataLoader(data, batch_size=batch_size, shuffle=True)

In [None]:
for i, j in enumerate(dataset):
    data2, labels = j
    print(labels.shape)
    break

torch.Size([100, 1])


In [None]:
model = nn.Sequential(
          nn.Linear(5789001, 100),
          nn.ReLU(),
          nn.Linear(100,1),
          nn.Sigmoid()
        )

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [None]:
model = model.to(device)

In [None]:
next(model.parameters()).is_cuda

True

In [None]:
loss = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [None]:
last_loss = 0.
number_of_epoch = 25

In [None]:
losses = []
for epoch in (pbar := tqdm(range(number_of_epoch))):
    for i, audioSample in enumerate(dataset):
        running_loss = 0.
        inputs, labels = audioSample
        inputs = torch.flatten(inputs, start_dim=1).to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        l = loss(outputs, labels)
        l.backward()
        optimizer.step()

        running_loss += l.item()
        
        pbar.set_description(f"current loss {running_loss / batch_size}")
    losses.append(running_loss / batch_size)

current loss 0.46:   0%|          | 0/25 [1:13:42<?, ?it/s]              


KeyboardInterrupt: 