# Speaker Distance Estimation 

In [2]:
import os
import yaml
import torch
from torch.utils.data import DataLoader
import pandas as pd
import soundfile as sf

from GenDARA.sde_model.model import SdeTrainer

### Estimate speaker distance using the trained model

In [5]:
# Get test audio files.
# Only .wav entries are kept, and the SAME filtered list feeds both the dataframe
# and the audio loader. Otherwise a stray non-audio entry in the directory would
# give the dataframe more rows than there are predictions, and the later
# `df['pred_distance'] = preds` assignment would fail on a length mismatch.
test_audio_dir = './submission_folder/eval_2_speaker_distance_estimates/test_audio'
test_audio_files = sorted(
    name for name in os.listdir(test_audio_dir) if name.endswith('.wav')
)

# Make dataframe with test audio filenames. Speaker distance estimates are added later.
df = pd.DataFrame(test_audio_files, columns=['convolved_filename'])

# Load every file into memory and stack into one tensor for the dataloader.
# `always_2d=True` makes sf.read return (frames, channels); the transpose gives
# (channels, frames), so each file contributes one row per channel to `test_data`.
# NOTE(review): the row-per-file alignment with `df` therefore assumes mono audio
# of equal length (torch.cat along dim 0 needs matching sample counts) -- confirm.
test_data = []
for file in test_audio_files:
    out, sr = sf.read(os.path.join(test_audio_dir, file), always_2d=True, dtype="float32")
    out = torch.as_tensor(out).T
    test_data.append(out)
test_data = torch.cat(test_data)
print(f'Test data shape [n files, time samples]: {test_data.shape}') # 480 audio, 320000 samples

# shuffle=False and drop_last=False keep every row, in filename order, so the
# predictions line up with the dataframe rows.
test_loader = DataLoader(test_data, batch_size=16, shuffle=False, drop_last=False)

# Load model and config
checkpoint_filepath = './GenDARA/sde_model/baseline.ckpt' # load baseline SDE model: trained only on C4DM RIRs and VCTK speech
with open('./GenDARA/sde_model/config.yml') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)
print(config)
# Hyperparameters from the config file are passed to the trainer when restoring the checkpoint.
trainer = SdeTrainer.load_from_checkpoint(
    checkpoint_filepath,
    lr=config["lr"],
    sr=config["sr"],
    kernels=config['kernels'],
    n_grus=config['n_grus'],
    features_set=config['features_set'],
    att_conf=config['att_conf'],
)
model = trainer.model.cpu()  # move model weights to cpu


Test data shape [n files, time samples]: torch.Size([480, 320000])
{'att_conf': 'onSpec', 'features_set': 'all', 'kernels': 'freq', 'lr': 0.001, 'n_grus': 2, 'sr': 32000}


In [None]:
# Run the model over the test set, batch by batch, and collect the SDE predictions
model.eval()
batch_preds = []
with torch.no_grad():
    for x in test_loader:  # the full pass takes ~ 1 min on cpu
        # The model returns four values; only the first one is kept here.
        pred, _, _, _ = model(x)
        batch_preds.append(pred)

# Join the per-batch outputs along the batch axis, then convert to numpy for pandas
preds = torch.cat(batch_preds, dim=0)
preds = preds.detach().numpy()
df['pred_distance'] = preds
df.head()

Unnamed: 0,convolved_filename,pred_distance
0,audio_000.wav,5.666922
1,audio_001.wav,4.766877
2,audio_002.wav,3.373574
3,audio_003.wav,6.558952
4,audio_004.wav,9.486773


In [None]:
# Optionally write the prediction table to a .csv file for submission.
# NOTE(review): this output directory ('speaker_distance_estimates') differs from the
# 'eval_2_speaker_distance_estimates' folder the test audio is read from -- confirm
# which location the submission expects.
save_csv = False  # set to True to actually write the csv
if save_csv:
    submission_csv = './submission_folder/speaker_distance_estimates/test_speaker_distance_estimates.csv'
    df.to_csv(submission_csv, index=False)  # index=False: do not write the row index column