In [20]:
import librosa
from madmom.audio.signal import Signal
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import torch

def spec_extraction(file_name, win_size,
                    mean_path='x_data_mean_total_31.npy',
                    std_path='x_data_std_total_31.npy'):
    """Load an audio file and cut its dB spectrogram into normalized windows.

    The audio is decoded as mono, 8 kHz, float32, transformed with a
    1024-point STFT (hop of 80 samples) and converted to dB relative to the
    spectrogram maximum. The frame axis is zero-padded up to a multiple of
    ``win_size`` and split into non-overlapping windows, which are then
    standardized with the statistics stored on disk.

    Parameters
    ----------
    file_name : str
        Path of the audio file, passed to ``madmom.audio.signal.Signal``.
    win_size : int
        Number of spectrogram frames per window.
    mean_path, std_path : str, optional
        ``.npy`` files holding the mean / standard deviation used for
        normalization. They must broadcast against an array of shape
        ``(num_windows, win_size, num_bins)``. The default file names suggest
        statistics computed for ``win_size=31`` -- TODO confirm.

    Returns
    -------
    x_test : numpy.ndarray
        Normalized windows, shape ``(num_windows, win_size, num_bins, 1)``.
    x_spec : numpy.ndarray
        Padded, un-normalized float32 spectrogram, shape
        ``(num_bins, num_windows * win_size)``.
    """
    # madmom's Signal() is faster than librosa.load() for decoding.
    y = Signal(file_name, sample_rate=8000, dtype=np.float32, num_channels=1)
    S = librosa.core.stft(y, n_fft=1024, hop_length=80, win_length=1024)

    # NOTE(review): power_to_db() is applied to the magnitude |S|, not to the
    # power |S|**2. This is kept as-is because the stored normalization
    # statistics were presumably computed with the same scaling -- confirm
    # before switching to amplitude_to_db().
    x_spec = librosa.core.power_to_db(np.abs(S), ref=np.max).astype(np.float32)
    num_bins, num_frames = x_spec.shape

    # Pad the frame axis up to the next multiple of win_size. The padding
    # uses the spectrogram's own dtype and bin count, so the returned array
    # stays float32 and no bin count is hard-coded.
    # NOTE(review): the pad value 0 equals the maximum of the dB scale
    # (ref=np.max); kept unchanged for compatibility.
    len_pad = (-num_frames) % win_size
    if len_pad:
        padding_feature = np.zeros((num_bins, len_pad), dtype=x_spec.dtype)
        x_spec = np.concatenate((x_spec, padding_feature), axis=1)

    # (num_bins, num_windows * win_size) -> (num_windows, win_size, num_bins):
    # window k holds frames [k * win_size, (k + 1) * win_size).
    x_test = x_spec.T.reshape(-1, win_size, num_bins)

    # Standardize with the stored statistics; 0.0001 guards against a zero
    # standard deviation.
    x_train_mean = np.load(mean_path)
    x_train_std = np.load(std_path)
    x_test = (x_test - x_train_mean) / (x_train_std + 0.0001)

    # Append a trailing singleton (channel) axis.
    return x_test[:, :, :, np.newaxis], x_spec

def spec_extraction_torch(file_name, win_size):
    """Unfinished PyTorch counterpart of ``spec_extraction``.

    At present this only decodes ``file_name`` (mono, 8 kHz, float32),
    computes the dB-scaled STFT magnitude and then discards it: nothing is
    windowed, normalized or converted to a tensor, ``win_size`` is not used,
    and the function returns ``None``.

    TODO: finish the padding / windowing / normalization steps and return
    the result.
    """
    audio = Signal(file_name, sample_rate=8000, dtype=np.float32, num_channels=1)
    magnitude = np.abs(
        librosa.core.stft(audio, n_fft=1024, hop_length=80, win_length=1024)
    )
    spec_db = librosa.core.power_to_db(magnitude, ref=np.max).astype(np.float32)
    frame_count = spec_db.shape[1]  # computed but not used yet

    return None

In [16]:
# Sample track used by the cells below.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
file_name = '/home/svcapp/userdata/flo_new_music/audio_20201228/400/185/400185539.aac'

In [14]:
# Run the NumPy pipeline on the sample track. `file_name` is used instead of
# the undefined `path`, so the cell works after Restart & Run All.
x_test, x_spec = spec_extraction(file_name, win_size=31)

In [15]:
# Shapes of the normalized windows and of the padded spectrogram returned by spec_extraction.
x_test.shape, x_spec.shape

((680, 31, 513, 1), (513, 21080))

In [17]:
# Recompute the raw (un-padded) dB spectrogram of the sample track at module
# level, so the torch experiments below can work on it directly.
y = Signal(file_name, sample_rate=8000, dtype=np.float32, num_channels=1)
S = librosa.core.stft(y, n_fft=1024, hop_length=80, win_length=1024)
x_spec = librosa.core.power_to_db(np.abs(S), ref=np.max).astype(np.float32)
num_frames = x_spec.shape[1]

In [22]:
# Wrap the spectrogram as a tensor. from_numpy() keeps the array's dtype and
# shares its memory, instead of copying and casting through the legacy
# torch.Tensor constructor.
x_tensor = torch.from_numpy(x_spec)

In [39]:
# Right-pad the frame axis to a multiple of the window size, then split it
# into windows. (-n) % win_size is 0 when n is already a multiple, whereas
# win_size - n % win_size would append a whole extra window of padding.
# NOTE(review): the result is laid out (bins, windows, win_size, 1), while
# spec_extraction returns (windows, win_size, bins, 1); a .permute(1, 2, 0)
# before unsqueeze would be needed to match it.
win_size = 31
pad_frames = (-x_tensor.shape[1]) % win_size
torch.nn.functional.pad(x_tensor, (0, pad_frames)).reshape(x_tensor.shape[0], -1, win_size).unsqueeze(-1).shape

torch.Size([513, 680, 31, 1])

In [32]:
# Number of frames left over after the last full 31-frame window of the un-padded spectrogram.
x_spec.shape[1]%31

29