In [1]:
import librosa, numpy as np

# Load the clip with both channels kept separate. No `sr` is passed, so librosa
# resamples to its default rate (the captured output below shows 22050 Hz).
y, sr = librosa.load("data/die_with_a_smile_trimmed_51s.wav", mono=False)   # shape (2, N)
print("Loaded audio with shape:", y.shape, "and sample rate:", sr)

# A mono file comes back 1-D, in which case y[0] / y[1] would be two single
# samples rather than two channels. Fail loudly instead of computing nonsense.
if y.ndim != 2 or y.shape[0] != 2:
    raise ValueError(f"expected a stereo signal of shape (2, N), got {y.shape}")

L, R = y[0], y[1]

# Two quick measures of how much the channels differ: the Pearson correlation
# between them and the RMS of their sample-wise difference.
corr = np.corrcoef(L, R)[0,1]
lr_diff_rms = np.sqrt(np.mean((L-R)**2))
print("L-R corr:", corr, " | RMS(L-R):", lr_diff_rms)

Loaded audio with shape: (2, 1122784) and sample rate: 22050
L-R corr: 0.7761384699592758  | RMS(L-R): 0.13734084


In [2]:
import soundfile as sf

import os

# Mid/side split of the stereo pair.
M = 0.5*(L+R)   # if the vocal is panned to the centre, most of it ends up here
S = 0.5*(L-R)   # content that differs between the two channels ends up here

# soundfile does not create missing directories, so make sure the output
# folder exists before the first write of the notebook.
os.makedirs("results", exist_ok=True)
sf.write("results/baseline_mid_vocalish.wav", M, sr)
sf.write("results/baseline_side_instrish.wav", S, sr)


In [3]:
from sklearn.decomposition import FastICA

# Treat every stereo sample as one 2-D observation: rows are samples and the
# two columns are the left/right channels.
X = y.T  # (N, 2)

# random_state is fixed so the decomposition is repeatable between runs.
ica = FastICA(
    n_components=2,
    whiten='unit-variance',
    max_iter=1000,
    random_state=0,
)
# Note: this rebinds S, which held the side signal computed in the mid/side cell.
S = ica.fit_transform(X)  # (N, 2)

# Write each estimated component to its own file, scaled to a ~0.98 peak.
for fastica_path, component in zip(
    ("results/fastica_source1.wav", "results/fastica_source2.wav"),
    S.T,
):
    sf.write(fastica_path, component/np.max(np.abs(component)+1e-9)*0.98, sr)


In [4]:
import pyroomacoustics as pra


# 2) STFT of each channel
n_fft = 2048
hop   = 512
win   = "hann"
XL = librosa.stft(L, n_fft=n_fft, hop_length=hop, window=win)   # (F, T)
XR = librosa.stft(R, n_fft=n_fft, hop_length=hop, window=win)   # (F, T)

# 3) Stack into (T, F, C). pyroomacoustics' BSS routines take frames on the
#    first axis and frequency bins on the second, whereas librosa returns
#    (F, T), so each channel is transposed before stacking. Passing (F, T, C)
#    runs without error but estimates one demixing matrix per time frame
#    instead of one per frequency bin.
X = np.stack([XL.T, XR.T], axis=2).astype(np.complex64)         # (T, F, 2)

# 4) Run AuxIVA (assuming 2 sources)
Y = pra.bss.auxiva(X, n_src=2, n_iter=30)                       # (T, F, 2)

# 5) Back to the time domain via iSTFT. Transpose each source back to
#    librosa's (F, T) layout, and pin the length so the separated signals
#    stay sample-aligned with the input channels.
s1 = librosa.istft(Y[...,0].T, hop_length=hop, window=win, length=len(L))
s2 = librosa.istft(Y[...,1].T, hop_length=hop, window=win, length=len(L))

# 6) Normalize and save (compensates for the scale ambiguity of the separation)
def norm(x, target_peak=0.98):
    """Scale ``x`` so that its largest absolute sample is about ``target_peak``.

    Args:
        x: Array-like of samples.
        target_peak: Peak magnitude after scaling. Defaults to 0.98.

    Returns:
        A NumPy array with the same shape as ``x``. An empty input is
        returned as an empty array instead of raising.
    """
    x = np.asarray(x)
    if x.size == 0:
        # np.max raises on a zero-size array; there is nothing to scale anyway.
        return x
    # The small epsilon keeps an all-zero signal from dividing by zero.
    peak = np.max(np.abs(x)) + 1e-9
    return (x/peak)*target_peak
# Peak-normalize each AuxIVA estimate and write it out as its own WAV file.
for auxiva_path, auxiva_signal in (
    ("results/auxiva_source1.wav", s1),
    ("results/auxiva_source2.wav", s2),
):
    sf.write(auxiva_path, norm(auxiva_signal), sr)


In [None]:
# STFT of both channels, stacked as (T, F, C): pyroomacoustics' BSS routines
# take frames on the first axis and frequency bins on the second, while
# librosa returns (F, T), so each channel is transposed before stacking.
ilrma_n_fft = 2048
ilrma_hop = 512
X = np.stack([
    librosa.stft(y[0], n_fft=ilrma_n_fft, hop_length=ilrma_hop).T,
    librosa.stft(y[1], n_fft=ilrma_n_fft, hop_length=ilrma_hop).T
], axis=2).astype(np.complex64)  # (T,F,2)

# ILRMA: `n_components` must be a single integer (the number of NMF bases used
# for every source). The per-source list [8, 24] (vocal=8, instr=24) tried
# originally is not accepted by pra.bss.ilrma, so one shared value is used.
ilrma_n_bases = 16   # between the 8 and 24 originally intended; tune as needed

# The NMF factors are randomly initialised (presumably from NumPy's global
# RNG -- confirm); seed it so that reruns give the same separation.
np.random.seed(0)
Y = pra.bss.ilrma(X, n_src=2, n_components=ilrma_n_bases, n_iter=50, proj_back=True)  # (T,F,2)

# Back to the time domain: transpose each source to librosa's (F, T) layout
# and pin the length so the outputs stay sample-aligned with the input.
n_samples = y.shape[-1]
s1 = librosa.istft(Y[...,0].T, hop_length=ilrma_hop, length=n_samples)
s2 = librosa.istft(Y[...,1].T, hop_length=ilrma_hop, length=n_samples)

# Peak-normalize to 0.98 before writing.
sf.write("results/ilrma_s1.wav", s1/np.max(np.abs(s1)+1e-9)*0.98, sr)
sf.write("results/ilrma_s2.wav", s2/np.max(np.abs(s2)+1e-9)*0.98, sr)
