In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import librosa as lr
import os

In [2]:
def quantization_u_law(x):
    """
    quantizing 16 bit audio inputs to 0-255 range using meu law quantization
    assuming the audio input is already normalized in range [-1, 1]
    """
    x = np.array(x)
    u = 255
    mu_law_output = np.sign(x) * (np.log(1 + u*np.abs(x)))/(np.log(1+u))
    quantized = ((mu_law_output + 1)/2 * 255).astype(np.uint8)
    return torch.tensor(quantized)

print(quantization_u_law([-0.3, 0.7, 0.5, 0.03]))

tensor([ 27, 246, 239, 177], dtype=torch.uint8)


In [None]:
def createDataset(out_file):
    processed_files = []
    for file in os.listdir(r"C:\Users\tanbi\door\hall_projects\audio_replicate\models\waveNET\Datasets"):
        audio_file = os.path.join(r"C:\Users\tanbi\door\hall_projects\audio_replicate\models\waveNET\Datasets", file)
        if os.path.isfile(audio_file):
            audio_data, _ = lr.load(audio_file, sr=16000, mono=True)
            # process data (skipping right now)
            processed_files.append(audio_data)
    np.savez(out_file, *processed_files)
# createDataset(r"C:\Users\tanbi\door\hall_projects\audio_replicate\models\waveNET\data\processed_data")


In [None]:
# TEST block
"""
from numpy import load

data = load('../data/proceed_data.npz')
lst = data.files
for item in lst:
    print(item)
    print(data[item])
"""

arr_0
[ 1.1830350e-05 -8.9085406e-06  2.2375611e-06 ...  3.8030726e-04
 -2.5518136e-03 -7.0407242e-03]
arr_1
[-0.00463775  0.0014874  -0.00155717 ...  0.00877299  0.00386699
  0.00637478]
arr_2
[1.0861059e-02 1.3355248e-02 2.1995592e-03 ... 5.5337507e-05 7.6695425e-05
 3.9345337e-05]


In [None]:
# TEST BLOCK
"""
audio_data, _ = lr.load("../Datasets/Speaker27_000.wav", sr=16000, mono=True)
quantized_audio_data = quantization_u_law(audio_data)
"""

tensor(19, dtype=torch.uint8)

In [23]:
class WaveNetDataset(Dataset):
    def __init__(self, 
                 dataset_path,
                 out_path, 
                 input_length, 
                 target_length=1, 
                 sampling_rate=16000, 
                 mono=True):
        
        self.dataset_path = dataset_path
        self.out_path = out_path
        self.sampling_rate = sampling_rate
        self.input_length = input_length
        self.target_length = target_length
        self.mono = mono
        self.sample_loc_util = [0]
        
        if not os.path.isdir(dataset_path):
            raise FileNotFoundError(f"dataset path {dataset_path} not found!!")
        self.createDataset(self.out_path)

        # out_file = f"{self.out_path}.npz" if not self.out_path.endswith(".npz") else self.out_path
        # if not os.path.exists(out_file):
        #     raise FileNotFoundError(f"Processed dataset file not found: {out_file}")
        self.data = np.load(f"{self.out_path}.npz", mmap_mode='r')


    def createDataset(self, out_file):
        processed_files = []
        for file in os.listdir(self.dataset_path):
            audio_file = os.path.join(self.dataset_path, file)
            if os.path.isfile(audio_file):
                audio_data, _ = lr.load(audio_file, sr=self.sampling_rate, mono=self.mono)
                audio_data = quantization_u_law(audio_data)
                # process data (skipping right now)
                # --------------------------------#
                #                                 #
                #              TODO               # 
                #              TODO               #
                #                                 #
                # --------------------------------#
                processed_files.append(audio_data)
                self.sample_loc_util.append(self.sample_loc_util[-1] + (len(audio_data)-self.input_length))
        np.savez(out_file, *processed_files)
    
    def __len__(self):
        return self.sample_loc_util[-1]

    def __getitem__(self, idx):
        """
            args: 
            --- index (idx)

            --- locates the audio segment the index lies in and then the local index that is the index in that audio segment from where a sample 
            --- of inputs and targets will be retrieved 
        """
        upper_bound=0
        data_idx = 0
        for i in range(1, len(self.sample_loc_util)):
            if idx < self.sample_loc_util[i]:
                data_idx=i-1
                break
        local_idx = idx - self.sample_loc_util[data_idx]
        
        audio_segment = self.data[f"arr_{data_idx}"]

        input_seq = audio_segment[local_idx : local_idx + self.input_length]
        target_seq = audio_segment[local_idx+self.input_length : local_idx+self.input_length + self.target_length]
        
        return torch.tensor(input_seq, dtype=torch.float32), torch.tensor(target_seq, dtype=torch.float32)
    
    def quantization_u_law(x):
        """
        quantizing 16 bit audio inputs to 0-255 range using meu law quantization
        assuming the audio input is already normalized in range [-1, 1]
        """
        x = np.array(x)
        u = 255
        mu_law_output = np.sign(x) * (np.log(1 + u*np.abs(x)))/(np.log(1+u))
        quantized = ((mu_law_output + 1)/2 * 255).astype(np.uint8)
        return torch.tensor(quantized)

In [24]:
dataset_path = r"C:\Users\tanbi\door\hall_projects\audio_replicate\models\waveNET\Datasets"
out_path = r"C:\Users\tanbi\door\hall_projects\audio_replicate\models\waveNET\data\processed_data2"
dataset = WaveNetDataset(dataset_path, out_path, input_length=16000, target_length=1)

In [25]:
len(dataset)

2832018

In [27]:
input, target = dataset[0]
input, target

(tensor([127., 127., 127.,  ..., 127., 127., 126.]), tensor([127.]))

In [28]:
dataloader_testing = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
# data valid uptil index "-16002", have to fix it and make it valid for all cases but hopefull should work fine by hard coding and ignoring this case
dataloader_testing.dataset[-16002]

(tensor([148., 142., 143.,  ..., 108., 113., 129.]), tensor([115.]))

In [32]:
print(dataset[4])

(tensor([-4.6272e-06, -2.7477e-06,  4.4635e-06,  ...,  1.8226e-05,
         3.0116e-05,  1.9175e-06]), tensor([-7.7950e-05]))
