In [None]:
# Import external tools:
import time 
import random
from matplotlib import pyplot as plt
import librosa
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
from torch.utils.data.sampler import SubsetRandomSampler
from torchsummary import summary
import torchaudio
import soundfile as sf
from scipy import signal
import getpass
import pandas as pd
import numpy as np
import sys
import os
import importlib
from scipy.io import wavfile
from IPython.display import Audio
from datetime import datetime
from sklearn.manifold import TSNE


# Root folders: the project checkout and the directory that holds the data.
projectdir = "/home/ubuntu/joanna/VAE-IR/"
datadir = "/home/ubuntu/Data/"

# Put the project's src/ folder at the front of the module search path so the
# project modules imported below can be found.
sys.path.insert(0, f"{projectdir}src/")

# Project modules. Each one is reloaded right after import, so re-running this
# cell re-executes the module source and picks up edits made since the first import.
import sig2ir_datasetprep as dsprep
importlib.reload(dsprep)
import helpers
importlib.reload(helpers)

Motivation: In VR/XR it can be useful to change the room-acoustic properties of sound recorded in one space so that it sounds as if it came from another space. This problem can be divided into two sub-tasks: the first is to de-reverberate the signal, and the second is to convolve the de-reverberated signal with a new room impulse response to obtain a perceptually different space. However, the desired room impulse response is usually not given explicitly, so part of the problem is to estimate a room impulse response from a reverberant signal recorded in the target acoustic space. 

Task: Given a mono, reverberant signal, estimate the room impulse response. 

Approach: Use deep learning to solve this problem, with an encoder-decoder architecture that takes the waveform as input and produces the room impulse response as output. 

Data: Speech convolved with various impulse responses. 

In [None]:
# Build a table listing every available audio file; it is used as the audio
# pool in the dataset generation.
audiodatadir = datadir + "VCTK/wav48_silence_trimmed/"
filename = "audio_VCTK_datura.csv"

# Collect all .flac files below audiodatadir (including subdirectories).
# os.walk yields entries in arbitrary, filesystem-dependent order, so the list
# is sorted to give the table the same row order on every machine and run;
# otherwise seeded sampling from the pool would not be repeatable.
file_paths = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(audiodatadir)
    for file in files
    if file.endswith('.flac')
)

# Fail loudly instead of overwriting the CSV with an empty table when the
# data directory is missing or misconfigured.
if not file_paths:
    raise FileNotFoundError(f"No .flac files found under {audiodatadir!r}")

# One row per file, tagged with the source database and the microphone
# number, both parsed from the file path.
df_audio = pd.DataFrame({'filepath_sig': file_paths})
df_audio.loc[df_audio["filepath_sig"].str.contains("VCTK"), "database_sig"] = "vctk"

# Rows whose path contains neither "mic1" nor "mic2" keep a missing value in "mic".
df_audio.loc[df_audio["filepath_sig"].str.contains("mic1"), "mic"] = 1
df_audio.loc[df_audio["filepath_sig"].str.contains("mic2"), "mic"] = 2

# NOTE(review): written relative to the current working directory, while the
# dataset cell reads this file through an absolute path under the project
# directory; confirm the notebook runs from a direct subfolder of the project.
df_audio.to_csv("../" + filename)

In [None]:
# Tidy the impulse-response table: rename the path column, tag each row with
# the database it comes from, and save the result back to the same CSV file.
ir_csv = "../irstats_ARNIandBUT_datura.csv"
df_ir = pd.read_csv(ir_csv, index_col=0).rename(columns={'filepath': 'filepath_ir'})

# Tag rows by the substring found in their path; rows matching neither
# substring are left untouched.
for needle, tag in (("Arni", "arni"), ("BUT", "but")):
    hits = df_ir["filepath_ir"].str.contains(needle)
    df_ir.loc[hits, "database_ir"] = tag

df_ir.to_csv(ir_csv)
df_ir.head(10)

In [None]:
# Seed every random number generator available in this notebook so that the
# dataset draw is repeatable: Python's built-in `random` (previously left
# unseeded), NumPy (whose global state pandas also uses for sampling when no
# random_state is passed) and PyTorch, including all CUDA devices if present.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Set up the sources of RIRs and audio for the dataset. The paths are derived
# from projectdir (defined in the first cell) instead of repeating the absolute
# project path; the resulting strings are the same as before.
AUDIO_INFO_FILE = projectdir + "audio_VCTK_datura.csv"
IR_INFO_FILE = projectdir + "irstats_ARNIandBUT_datura.csv"
SAMPLING_RATE=48e3  # note: this literal is a float (48000.0)

df_audiopool=pd.read_csv(AUDIO_INFO_FILE,index_col=0)
df_irs=pd.read_csv(IR_INFO_FILE,index_col=0)
# Keep only the impulse responses tagged "arni", then only the first 10 of them.
df_irs=df_irs[df_irs["database_ir"]=="arni"]
df_irs=df_irs.head(10)

dataset=dsprep.Dataset_SpeechInSpace(df_audiopool,df_irs,N_per_ir=1000)

# Optional (disabled): save a description of the dataset under a timestamp tag.
# # create a tag for dataset info file
# current_datetime = datetime.now()
# nametag = current_datetime.strftime("%d-%m-%Y_%H-%M")
# # save info about dataset
# dataset.dataset_info(nametag)


In [None]:
# Pull a single example out of the dataset and report which impulse response
# it was built from. Each item unpacks into four parts; going by the playback
# cells further down, these are the speech excerpt (sig), the room impulse
# response (ir), the reverberant signal (dp) and a label record holding the
# source file paths.
example_index = 3000
sig, ir, dp, label = dataset[example_index]
print(label["filepath_ir"])

# Optional visual check (disabled): impulse response, speech excerpt and
# reverberant signal plotted side by side.
# fig = plt.figure(figsize=(10, 4))
# plt.subplot(1,3,1)
# plt.plot(ir.T)
# plt.xlim((0,200000))
# plt.ylim((-1,1))
# plt.subplot(1,3,2)
# plt.plot(sig.T)
# plt.xlim((0,200000))
# plt.ylim((-1,1))
# plt.subplot(1,3,3)
# plt.plot(dp.squeeze([0]).T)
# plt.xlim((0,200000))
# plt.ylim((-1,1))
# plt.tight_layout()
# plt.show()






In [None]:
# Playback: the room impulse response of the inspected data point.
# The path of its source file is printed first.
print(label["filepath_ir"])
Audio(data=ir, rate=48000)


In [None]:
# Playback: the excerpt from the speech database used for the inspected data point.
# The path of its source file is printed first.
print(label["filepath_sig"])
Audio(data=sig, rate=48000)


In [None]:
# Playback: the reverberant signal of the inspected data point.
# Dimension 0 is squeezed out of dp before it is handed to the audio player.
Audio(data=dp.squeeze([0]), rate=48000)