s1301a - 10:03 min file

s02-1 - 22 sec file

Window size is the amount of time over which a waveform is sampled, known as time record, expressed in samples. For example, a window size of 128 samples at a sample rate of 48 kHz. Equals a time record of 128 samples x 1/48000 seconds = 0.0027 seconds or 2.7 milliseconds.

Window length is the length of the fixed intervals in which STFT divides the signal. Hop length is the length of the non-intersecting portion of window length. Overlap length is the length of the intersecting portion of the window length.

In [44]:
10*60+3, 603*16000

(603, 9648000)

In [45]:
22, 22*16000

(22, 352000)

In [40]:
# !pip install librosa

In [41]:
import soundfile as sf
import os
import pyfoal
import pypar
import librosa
import torchaudio
from scipy.io import wavfile


In [42]:
# filename = './s1301a.wav'
filename = './s02-1.wav'

### Soundfile

In [47]:
data, sr = sf.read(filename)


In [65]:
data

array([3.05175781e-04, 2.44140625e-04, 9.15527344e-05, ...,
       5.49316406e-04, 4.88281250e-04, 3.96728516e-04])

In [57]:
data.shape, sr

((348026,), 16000)

### Size

In [50]:
os.path.getsize(filename) // 2

348048

### Librosa

In [53]:
data_librosa, sr = librosa.load(filename, sr=16000)

In [54]:
data_librosa.shape

(348026,)

In [64]:
data_librosa

array([3.0517578e-04, 2.4414062e-04, 9.1552734e-05, ..., 5.4931641e-04,
       4.8828125e-04, 3.9672852e-04], dtype=float32)

In [55]:
librosa.get_duration(data_librosa)

 3.9672852e-04] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  """Entry point for launching an IPython kernel.


15.783492063492064

### Torchaudio

In [58]:
data_torch, sr = torchaudio.load(filename)

In [63]:
data_torch

tensor([[3.0518e-04, 2.4414e-04, 9.1553e-05,  ..., 5.4932e-04, 4.8828e-04,
         3.9673e-04]])

In [59]:
data_torch.shape

torch.Size([1, 348026])

### Scipy wavefile

In [60]:
sr, data_scipy = wavfile.read(filename)

In [61]:
data_scipy, data_scipy.shape

(array([10,  8,  3, ..., 18, 16, 13], dtype=int16), (348026,))

In [62]:
window_length_ms = 10

window_length = int((window_length_ms/1000)*sr)
print('Window length in samples ' + str(window_length))


Window length in samples 160


### Audio load

In [9]:
import emphases

In [10]:
audio = emphases.load.audio('../data/cache/Buckeye/wavs/s02-1.wav')

In [13]:
audio.shape, audio.shape[-1] // emphases.HOPSIZE

(torch.Size([1, 348026]), 2175)

### Mel Exp

In [1]:
import emphases
import torch

In [2]:
mel_loader = emphases.load.MelSpectrogram()


  emphases.NUM_MELS


In [3]:
audio = emphases.load.audio('../data/cache/Buckeye/wavs/s03-1.wav')
audio2 = emphases.load.audio('../data/cache/Buckeye/wavs/s11-1.wav')

In [4]:
mel_spectrogram1 = mel_loader.forward(audio)
mel_spectrogram1.shape

torch.Size([1, 80, 1352])

In [5]:
mel_spectrogram2 = mel_loader.forward(audio2)
mel_spectrogram2.shape

torch.Size([1, 80, 1502])

In [6]:
mel_spectrogram1.shape[-1]

1352

In [7]:
mel_spectrogram = [mel_spectrogram1, mel_spectrogram2]

In [8]:
mel_lengths = torch.tensor(
    [mel.shape[-1] for mel in mel_spectrogram], 
    dtype=torch.long)

max_mel_lengths = mel_lengths.max()

In [9]:
mel_lengths.max().item()

1502

### Checks

In [1]:
# import emphases
# import pypar
# import os
# import torch
# import functools

In [2]:
# train_stems = emphases.load.partition('Buckeye')['train']

In [3]:
# with open('../data/cache/Buckeye/annotation/s11-1.prom', 'r') as f:
#     data = f.read()
# # first line is header, skip it
# lines = [x.split('\t') for x in data.split('\n')[1:]]
# proms = torch.tensor([float(x[4]) for x in lines[:-1]])


In [4]:
# cache = emphases.CACHE_DIR / 'Buckeye'
# for stem in train_stems:
#     alignment = pypar.Alignment(
#         cache / 'alignment' / f'{stem}.TextGrid')
#     prominence = emphases.load.load_prominence(cache / 'annotation' / f'{stem}.prom')
# #     assert (len(alignment.word_bounds(emphases.SAMPLE_RATE)) == prominence.shape[0]), f'{stem} array length mismatch b/w input and ground truth'


In [5]:
# input_channels = emphases.NUM_MELS
# output_channels = 1
# hidden_channels = 128

# conv_fn = functools.partial(
#     torch.nn.Conv1d,
#     kernel_size=5,
#     padding='same')
# layers = torch.nn.Sequential(
#             conv_fn(input_channels, hidden_channels),
#             torch.nn.ReLU(),
#             conv_fn(hidden_channels, hidden_channels),
#             torch.nn.ReLU(),
#             conv_fn(hidden_channels, output_channels))
# sample_input = torch.randn(32, 80, 1300)
# layers(sample_input).shape

In [1]:
import emphases
import pypar
import os
import torch
import functools

In [2]:
train_loader, valid_loader = emphases.data.loaders('Buckeye', 'train', 'valid',gpu=None)

In [3]:
train_loader.dataset.lengths, train_loader.dataset.spectrogram_lengths

([769069, 456913, 559939, 408149, 585433, 438593, 654307, 696081, 608939],
 [1502, 892, 1093, 797, 1143, 856, 1278, 1359, 1189])

In [4]:
# dir(train_loader)

In [5]:
len(train_loader.dataset)

9

In [6]:
train_loader.batch_sampler.set_epoch(0// len(train_loader.dataset))

In [10]:
all_batches = []
for batch in train_loader:
    all_batches.append(batch)

892 tensor([892, 856, 892, 856, 892, 856, 892, 856, 892, 856, 892, 856, 892, 856,
        892, 856, 892, 856, 892, 856, 892, 856, 892, 856, 892, 856, 892, 856,
        892, 856, 892, 856, 892, 856, 892, 856, 892, 856, 892, 856, 892, 856,
        892, 856, 892, 856, 892, 856, 892, 856, 892, 856, 892, 856, 892, 856,
        892, 856, 892, 856, 892, 856, 892, 856])


56it [00:00, 276.45it/s]

1502 tensor([1278, 1359, 1143, 1093, 1189, 1502, 1278, 1359, 1143, 1093, 1189, 1502,
        1278, 1359, 1143, 1093, 1189, 1502, 1278, 1359, 1143, 1093, 1189, 1502,
        1278, 1359, 1143, 1093, 1189, 1502, 1278, 1359, 1143, 1093, 1189, 1502,
        1278, 1359, 1143, 1093, 1189, 1502, 1278, 1359, 1143, 1093, 1189, 1502,
        1278, 1359, 1143, 1093, 1189, 1502, 1278, 1359, 1143, 1093, 1189, 1502,
        1278, 1359, 1143, 1093])


64it [00:00, 275.92it/s]
64it [00:00, 213.73it/s]


In [12]:
all_batches

[(tensor([[[-1.1200e-02, -9.1553e-03, -1.0498e-02,  ...,  0.0000e+00,
             0.0000e+00,  0.0000e+00]],
  
          [[ 3.0518e-04,  2.4414e-04,  9.1553e-05,  ...,  0.0000e+00,
             0.0000e+00,  0.0000e+00]],
  
          [[ 3.6194e-02,  3.5675e-02,  3.6926e-02,  ...,  0.0000e+00,
             0.0000e+00,  0.0000e+00]],
  
          ...,
  
          [[ 3.0518e-04,  2.4414e-04,  9.1553e-05,  ...,  0.0000e+00,
             0.0000e+00,  0.0000e+00]],
  
          [[ 3.6194e-02,  3.5675e-02,  3.6926e-02,  ...,  0.0000e+00,
             0.0000e+00,  0.0000e+00]],
  
          [[ 7.7209e-03,  8.3618e-03,  1.0529e-02,  ...,  0.0000e+00,
             0.0000e+00,  0.0000e+00]]]),
  tensor([[[[-2.1887, -2.4864, -2.3280,  ...,  0.0000,  0.0000,  0.0000],
            [-2.0103, -2.3534, -2.2917,  ...,  0.0000,  0.0000,  0.0000],
            [-1.8716, -2.3865, -2.7684,  ...,  0.0000,  0.0000,  0.0000],
            ...,
            [-3.4213, -3.4281, -3.6195,  ...,  0.0000,  0.0000,  0

In [13]:
batch[0].shape

torch.Size([64, 1, 228442])

In [14]:
batch[1].shape

torch.Size([64, 1, 80, 892])

In [15]:
batch[2].shape

torch.Size([64, 1, 49])

In [20]:
len(batch[3])

64