In [2]:
import matplotlib.pyplot as plt
import torch
import torchaudio
import torchaudio.transforms as T
from performer_pytorch import Performer
from sklearn.decomposition import PCA
from speechbrain.pretrained import EncoderASR, EncoderDecoderASR
from transformers import HubertModel, Wav2Vec2FeatureExtractor, Wav2Vec2Model, Wav2Vec2Processor

# `HubertProcessor` is not exported by transformers releases; keep the name
# importable where it exists, but do not let its absence break the whole cell.
try:
    from transformers import HubertProcessor
except ImportError:
    HubertProcessor = None

OSError: /home/kennykguo/anaconda3/envs/deep-learning/lib/python3.8/site-packages/torchaudio/lib/libtorchaudio.so: undefined symbol: _ZN3c1010Dispatcher17runRecordFunctionERN2at14RecordFunctionESt17reference_wrapperIKNS_14FunctionSchemaEENS_11DispatchKeyE

In [None]:
# Configuration
# Input file and the sample rate (Hz) every waveform is resampled to.
AUDIO_FILE, SAMPLE_RATE = "example.wav", 16_000

# STFT framing: at 16 kHz, 400 samples is a 25 ms window and 160 samples a 10 ms hop.
N_FFT, HOP_LENGTH = 400, 160

# Number of mel filterbank channels and of cepstral coefficients kept.
N_MELS, N_MFCC = 80, 13

In [None]:
def plot_waveform(waveform, sr):
    """Draw the raw audio samples as a line plot.

    Args:
        waveform: Tensor whose ``.numpy().T`` is handed to matplotlib, so every
            column of the transposed array is drawn as its own line.
        sr: Sample rate of the audio. Currently unused; the x-axis is expressed
            in sample indices.
    """
    _, ax = plt.subplots(figsize=(10, 3))
    ax.plot(waveform.numpy().T)
    ax.set_title("Raw Waveform")
    ax.set_xlabel("Samples")
    ax.set_ylabel("Amplitude")
    plt.show()

def plot_spectrogram(spec, title, ylabel):
    """Render a 2-D time-frequency feature map as an image.

    Non-negative inputs (such as power mel spectrograms) are shown in decibels,
    ``10 * log10(spec)``, with values floored at a small epsilon so that empty
    bins do not become ``-inf``. Inputs containing negative values (such as
    MFCCs) are plotted unchanged, because taking a logarithm of them would turn
    every negative cell into NaN.

    Args:
        spec: Tensor whose last two dimensions are (bin, frame). Any leading
            dimensions (e.g. channels) are flattened and only the first map is
            drawn.
        title: Figure title.
        ylabel: Label for the vertical (bin) axis.
    """
    image = spec.detach().cpu()
    if image.dim() > 2:
        # Collapse leading axes and keep the first 2-D map; imshow cannot draw
        # a (channel, bin, frame) stack.
        image = image.reshape(-1, *image.shape[-2:])[0]

    if bool((image < 0).any()):
        # Signed data: a log scale is undefined, show raw values.
        colorbar_format = '%+2.1f'
    else:
        image = 10.0 * image.clamp(min=1e-10).log10()
        colorbar_format = '%+2.0f dB'

    plt.figure(figsize=(10, 4))
    plt.imshow(image.numpy(), aspect='auto', origin='lower')
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel("Frame")
    plt.colorbar(format=colorbar_format)
    plt.tight_layout()
    plt.show()

def visualize_embeddings(embeddings, title):
    """Scatter-plot embedding vectors after projecting them to 2-D with PCA.

    All leading dimensions (batch, time, ...) are flattened so that each vector
    along the last axis becomes one PCA sample. This keeps the input to PCA
    two-dimensional for any batch size, unlike a plain ``squeeze()``, which
    only works when every leading dimension except one has size 1.

    Args:
        embeddings: Tensor shaped ``(..., feature_dim)``.
        title: Prefix of the figure title; " (PCA Reduced)" is appended.
    """
    # detach() allows tensors that still carry autograd history.
    features = embeddings.detach().cpu()
    features = features.reshape(-1, features.shape[-1]).numpy()

    reduced = PCA(n_components=2).fit_transform(features)

    plt.figure(figsize=(8, 6))
    plt.scatter(reduced[:, 0], reduced[:, 1], alpha=0.5)
    plt.title(title + " (PCA Reduced)")
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.show()

In [None]:
# Load and preprocess audio
# torchaudio.load returns a (channels, samples) tensor and the file's sample rate.
waveform, orig_sr = torchaudio.load(AUDIO_FILE)

# Downmix to a single channel: the later cells call squeeze()/squeeze(0) and
# therefore assume exactly one channel. For mono files this is a no-op.
if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)

# Bring the audio to the rate the feature extractors and models are configured for.
resampler = T.Resample(orig_sr, SAMPLE_RATE)
waveform = resampler(waveform)

print("\nRaw audio shape:", waveform.shape)
plot_waveform(waveform, SAMPLE_RATE)

In [4]:
# Feature extraction
# Both transforms share the same STFT / filterbank settings.
mel_settings = {
    'n_fft': N_FFT,
    'hop_length': HOP_LENGTH,
    'n_mels': N_MELS
}

mel_transform = T.MelSpectrogram(sample_rate=SAMPLE_RATE, **mel_settings)
mel_spec = mel_transform(waveform)

mfcc_transform = T.MFCC(
    sample_rate=SAMPLE_RATE,
    n_mfcc=N_MFCC,
    melkwargs=dict(mel_settings)
)
mfcc = mfcc_transform(waveform)

# Report the shape of each feature map and draw it.
for shape_label, features, plot_title, axis_label in (
    ("\nMel spectrogram shape:", mel_spec, "Mel Spectrogram", "Mel Bin"),
    ("\nMFCC shape:", mfcc, "MFCC", "Coefficient Index"),
):
    print(shape_label, features.shape)
    plot_spectrogram(features, plot_title, axis_label)


NameError: name 'T' is not defined

In [None]:


# Print the shape of each feature map, then plot it.
for shape_label, features, plot_title, axis_label in (
    ("\nMel spectrogram shape:", mel_spec, "Mel Spectrogram", "Mel Bin"),
    ("\nMFCC shape:", mfcc, "MFCC", "Coefficient Index"),
):
    print(shape_label, features.shape)
    plot_spectrogram(features, plot_title, axis_label)

# HuBERT Processing
# transformers has no `HubertProcessor`, and this base checkpoint ships only a
# feature-extractor config (no tokenizer), so the Wav2Vec2 feature extractor
# is the right pre-processing class for it.
hubert_processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
hubert_model = HubertModel.from_pretrained("facebook/hubert-base-ls960")

# The extractor expects a 1-D array of samples; average channels so that a
# multi-channel waveform is not mistaken for a batch.
hubert_inputs = hubert_processor(
    waveform.mean(dim=0).numpy(),
    return_tensors="pt",
    sampling_rate=SAMPLE_RATE
).input_values

# Inference only: no gradients needed.
with torch.no_grad():
    hubert_output = hubert_model(hubert_inputs).last_hidden_state

print("\nHuBERT output shape:", hubert_output.shape)
visualize_embeddings(hubert_output, "HuBERT Embeddings")

# Wav2Vec 2.0 Processing
# Processor and model are loaded from the same checkpoint.
w2v_checkpoint = "facebook/wav2vec2-base-960h"
w2v_processor = Wav2Vec2Processor.from_pretrained(w2v_checkpoint)
w2v_model = Wav2Vec2Model.from_pretrained(w2v_checkpoint)

# Turn the waveform into the model's input tensor.
w2v_samples = waveform.squeeze().numpy()
w2v_batch = w2v_processor(w2v_samples, return_tensors="pt", sampling_rate=SAMPLE_RATE)
w2v_inputs = w2v_batch.input_values

# Forward pass without gradient tracking; keep the final hidden states.
with torch.no_grad():
    w2v_result = w2v_model(w2v_inputs)
w2v_output = w2v_result.last_hidden_state

print("\nWav2Vec 2.0 output shape:", w2v_output.shape)
visualize_embeddings(w2v_output, "Wav2Vec 2.0 Embeddings")

# Conformer Processing (using SpeechBrain)
# This checkpoint is an encoder-decoder ASR system, so it is loaded through
# EncoderDecoderASR rather than the CTC-only EncoderASR interface.
conformer_model = EncoderDecoderASR.from_hparams(
    source="speechbrain/asr-conformer-transformerlm-librispeech",
    savedir="pretrained_models/conformer"
)

# encode_batch() takes raw waveforms shaped (batch, time) together with their
# relative lengths and computes its own features and normalisation internally,
# so pre-computed log-mel features must not be passed in.
conformer_input = waveform.mean(dim=0, keepdim=True)
# A single, unpadded utterance has relative length 1.0.
conformer_lengths = torch.ones(conformer_input.shape[0])

with torch.no_grad():
    conformer_output = conformer_model.encode_batch(conformer_input, conformer_lengths)

print("\nConformer output shape:", conformer_output.shape)
visualize_embeddings(conformer_output, "Conformer Embeddings")

# Performer Processing (example implementation)
# Arrange the mel spectrogram as (batch, frames, mel bins); only the first
# channel is used so the batch dimension is always 1.
performer_input = mel_spec[:1].transpose(1, 2)

# The model width must equal the size of the input's last dimension (the
# number of mel bins); a hard-coded width of 64 does not match 80-bin input.
# NOTE: the weights are randomly initialised (nothing is loaded), so the
# resulting embeddings only illustrate the data flow and shapes.
performer = Performer(
    dim=performer_input.shape[-1],
    depth=4,
    heads=8,
    dim_head=64,
    causal=True
)
performer.eval()

with torch.no_grad():
    performer_output = performer(performer_input)

print("\nPerformer output shape:", performer_output.shape)
visualize_embeddings(performer_output, "Performer Embeddings")