In [18]:
import torch
import torchaudio
from resemblyzer import VoiceEncoder, preprocess_wav
from thop import profile


In [23]:
# Sample rate the waveform is converted to before it is handed to the encoder.
TARGET_SAMPLE_RATE = 16000

# Load the enrollment utterance. With torchaudio's default channels-first
# layout the tensor is (channels, samples).
waveform, sample_rate = torchaudio.load("data/20250306170609.wav")

# Down-mix to a single channel. A plain squeeze() only produces a 1-D signal
# for mono files; a stereo file would stay (2, samples) and be passed to
# preprocess_wav as if it were one waveform. For mono input the mean over one
# channel leaves the samples unchanged.
waveform = waveform.mean(dim=0, keepdim=True)

# Resample if necessary
if sample_rate != TARGET_SAMPLE_RATE:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)
    waveform = resampler(waveform)

# d-vector output encoder
speaker_encoder = VoiceEncoder(device="cpu")
# Convert to a 1-D numpy array and preprocess for d-vector extraction
waveform_preprocessed = preprocess_wav(waveform.squeeze(0).numpy())
# embed_speaker takes a list of utterances; unsqueeze(0) adds a leading batch
# dimension so the embedding can be fed to a model as a batch of one.
speaker_embedding = torch.from_numpy(speaker_encoder.embed_speaker([waveform_preprocessed])).unsqueeze(0)
speaker_embedding.shape

Loaded the voice encoder model on cpu in 0.01 seconds.


torch.Size([1, 256])

In [24]:
# Dump the raw values of the speaker embedding computed in the previous cell
# for a quick visual inspection.
print(speaker_embedding)

tensor([[0.0000, 0.0091, 0.0929, 0.0000, 0.1755, 0.0000, 0.0151, 0.0747, 0.0000,
         0.0000, 0.0299, 0.0091, 0.0651, 0.0000, 0.0908, 0.0000, 0.0587, 0.1331,
         0.0000, 0.0061, 0.0799, 0.1113, 0.0000, 0.0000, 0.0000, 0.1619, 0.0000,
         0.0189, 0.0014, 0.0000, 0.1776, 0.0296, 0.0000, 0.0000, 0.0393, 0.1140,
         0.0010, 0.0222, 0.0397, 0.0787, 0.0000, 0.0000, 0.0632, 0.1360, 0.0815,
         0.0000, 0.0022, 0.0000, 0.0350, 0.0000, 0.0000, 0.0454, 0.0000, 0.0000,
         0.0000, 0.0033, 0.0491, 0.0000, 0.0534, 0.1003, 0.0000, 0.0000, 0.0008,
         0.2078, 0.0559, 0.0000, 0.0469, 0.0011, 0.0000, 0.0015, 0.0099, 0.0227,
         0.0000, 0.0544, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0619, 0.0000,
         0.0000, 0.1959, 0.0169, 0.0000, 0.0000, 0.0000, 0.0837, 0.0000, 0.0577,
         0.0648, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0274, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0532, 0.0705, 0.0008, 0.0134, 0.0023, 0.0034,
         0.0016, 0.1183, 0.0

In [26]:
# Scratch check: build a tiny 1x2 tensor and show its shape followed by its
# contents, one per line.
one = torch.tensor([[0.5, 2.0]])
print(one.shape, one, sep="\n")

torch.Size([1, 2])
tensor([[0.5000, 2.0000]])


In [None]:
from model import SpeakerBeamSS

batch_size = 1
input_len = 16000  # 1 second of audio @ 16 kHz

# Seeded local generator: the dummy mixture is reproducible across runs
# without touching the global torch RNG state.
rng = torch.Generator().manual_seed(0)
mixture = torch.randn(batch_size, 1, input_len, generator=rng)

model = SpeakerBeamSS()
# Put the model in inference mode so layers such as dropout / batch-norm
# (if the model has any) do not run in training mode on a batch of one.
model.eval()
with torch.no_grad():
    out = model(mixture, speaker_embedding)
    print("Input:", mixture.shape, "Output:", out.shape)
    # thop's profile() returns multiply-accumulate operations (MACs), not
    # FLOPs, so the count is labelled accordingly.
    macs, params = profile(model, inputs=(mixture, speaker_embedding))
    # Approximate FLOPs with the common 2-FLOPs-per-MAC convention; `flops`
    # stays defined for any later cell that refers to it.
    flops = 2 * macs
    print(f"MACs: {macs / 1e9:.2f}G (~{flops / 1e9:.2f} GFLOPs), Params: {params / 1e6:.2f}M")