In [38]:
import tensorflow as tf
import numpy as np
import librosa
import os
import pyroomacoustics as pra
from nnAudio import Spectrogram

In [39]:
import torch

def nn_stft(x):
    """Compute a magnitude STFT of ``x`` with nnAudio's ``Spectrogram.STFT`` layer.

    Parameters
    ----------
    x : array-like or torch.Tensor
        Audio samples. They are converted to a float32 tensor on the selected
        device (CUDA when available, otherwise CPU).

    Returns
    -------
    The output of the nnAudio STFT layer created with
    ``output_format="Magnitude"`` and otherwise default settings.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # as_tensor accepts both numpy arrays and existing tensors without the
    # copy-construct warning that torch.tensor() emits for tensor inputs.
    x = torch.as_tensor(x, dtype=torch.float32, device=device)
    # The layer must live on the same device as its input; without .to(device)
    # the call fails on CUDA machines because the layer stays on the CPU.
    spec_layer = Spectrogram.STFT(output_format="Magnitude").to(device)
    return spec_layer(x)

In [40]:
# Recording that is later convolved with the simulated room impulse response.
# NOTE(review): absolute, machine-specific path — will not resolve elsewhere.
widmung_file = "/Users/llewyn/Documents/recordings/TrifonovWidmung3.m4a"
# No sample rate is passed, so librosa's default applies — presumably 22050 Hz,
# which would match the fs used for the simulated room; TODO confirm.
widmung, sr = librosa.load(widmung_file)



In [41]:
# Directory and file name of one saved .npy sample from the STFT dataset.
# NOTE(review): absolute, machine-specific path.
dataset_dir = "/Users/llewyn/Documents/data/stft/original"
sample_file = "MIDI-Unprocessed_08_R3_2008_01-05_ORIG_MID--AUDIO_08_R3_2008_wav--1.wav-0.npy"

# Load the stored array; its shape and dtype are not visible here — TODO confirm.
stft = np.load(os.path.join(dataset_dir, sample_file))

In [42]:
# Directory and file name of a recorded impulse response (WAV file).
# NOTE(review): absolute, machine-specific path.
ir_dir = "/Users/llewyn/Documents/data/irs/hearst/Position 1/Normalized/Stereo Pairs"
sample_ir = "HEARST_P1_BLD+BRU.wav"

# NOTE(review): this rebinds `sr`, which was first set when the recording was loaded.
ir, sr = librosa.load(os.path.join(ir_dir, sample_ir))

In [43]:
# STFT of the impulse response computed two ways: librosa with its default
# parameters, and nn_stft (nnAudio layer with output_format="Magnitude").
ir_stft = librosa.stft(ir)
ir_stft_ = nn_stft(ir)

STFT kernels created, time used = 0.0571 seconds


In [48]:
from scipy.signal import convolve2d, fftconvolve

In [49]:
from IPython.display import Audio 

In [50]:
from pyroomacoustics.directivities import (
    DirectivityPattern,
    DirectionVector,
    CardioidFamily,
)
import random

def get_room_impulse(rt60_range=(1.5, 4), fs=22050):
    """Build a randomly sized shoebox room with one source and one microphone.

    Despite the name, this returns the configured room, not an impulse
    response: call ``room.compute_rir()`` on the result and read ``room.rir``
    to obtain the impulse response.

    Parameters
    ----------
    rt60_range : tuple of (float, float)
        Lower and upper bound, in seconds, of the uniformly sampled target
        reverberation time. Defaults to the range the function always used.
    fs : int
        Sampling frequency passed to the room. Defaults to 22050.

    Returns
    -------
    pra.ShoeBox
        Room with one directional source and one directional microphone added.

    Notes
    -----
    Room dimensions, RT60, microphone position and source position are drawn
    from the global ``random`` module in that order, so seeding ``random``
    before the call makes the room reproducible.
    """
    # Room dimensions in meters, sampled independently per axis.
    length = random.uniform(35, 45)
    width = random.uniform(12, 25)
    height = random.uniform(15, 28)
    room_dim = [length, width, height]

    # Invert Sabine's formula to obtain the absorption and reflection order
    # that the image source model needs for the sampled reverberation time.
    rt60 = random.uniform(*rt60_range)
    e_absorption, max_order = pra.inverse_sabine(rt60, room_dim)

    room = pra.ShoeBox(
        room_dim, fs=fs, materials=pra.Material(e_absorption), max_order=max_order
    )

    # The same hypercardioid pattern is shared by the source and the microphone.
    dir_obj = CardioidFamily(
        orientation=DirectionVector(azimuth=90, colatitude=15, degrees=True),
        pattern_enum=DirectivityPattern.HYPERCARDIOID,
    )

    # Microphone: anywhere inside the room's bounding box.
    mic_position = [
        random.uniform(0, length),
        random.uniform(0, width),
        random.uniform(0, height),
    ]
    # Source: 3-10 m along the two horizontal axes, 0.9-1.3 m along the third.
    source_position = [
        random.uniform(3, 10),
        random.uniform(3, 10),
        random.uniform(.9, 1.3),
    ]

    room.add_source(position=source_position, directivity=dir_obj)
    room.add_microphone(loc=mic_position, directivity=dir_obj)

    return room

In [51]:
# Build one randomized room; get_room_impulse returns the room object itself,
# not an impulse response.
room = get_room_impulse()



In [52]:
# Run the room simulation; the result is read from room.rir afterwards.
room.compute_rir()

In [53]:
# Take the first entry of room.rir (presumably indexed [microphone][source] — confirm).
# NOTE(review): rebinds `ir`, replacing the recorded impulse response loaded earlier.
ir = room.rir[0][0]

In [54]:
# Length of the simulated impulse response.
len(ir)

125709

In [55]:
# Apply the simulated room to the recording by FFT-based convolution with the
# impulse response.
widmung_ = fftconvolve(widmung, ir)

In [112]:
def stft_tf(x, frame_length=2048, frame_step=512):
    """Short-time Fourier transform of a 1-D signal using TensorFlow ops.

    The signal is reflect-padded by ``frame_length - frame_step`` samples on
    each side, cut into frames of ``frame_length`` samples every ``frame_step``
    samples (no end padding), cast to float32, multiplied by a periodic Hann
    window and passed through a real FFT of length ``frame_length``.

    Parameters
    ----------
    x : 1-D array-like or tensor
        Input samples (the padding spec only covers a single axis).
    frame_length : int
        Frame and FFT length in samples.
    frame_step : int
        Hop between consecutive frames in samples.

    Returns
    -------
    The result of ``tf.signal.rfft`` applied to the windowed frames.
    """
    # Per-side padding; equals half of 2 * (frame_length - frame_step).
    overlap = frame_length - frame_step
    padded = tf.pad(x, [[overlap, overlap]], 'REFLECT')

    frames = tf.signal.frame(padded, frame_length, frame_step, pad_end=False)
    window = tf.signal.hann_window(frame_length, periodic=True)
    windowed = tf.cast(frames, 'float32') * window

    return tf.signal.rfft(windowed, fft_length=[frame_length])

In [113]:
# TensorFlow STFT of the excerpt from 10*sr to 12*sr samples of the convolved signal.
stft_tf_sample = stft_tf(widmung_[10 * sr : 12 * sr])

In [115]:
# Same excerpt through librosa with the hop and window lengths that stft_tf
# uses by default, for comparison with stft_tf_sample.
# NOTE(review): stft_tf pads by frame_length - frame_step per side, which may
# not match librosa's framing/axis layout — confirm before comparing element-wise.
stft_librosa_sample = librosa.stft(widmung_[10 * sr: 12 * sr], hop_length=512, win_length=2048)