In [3]:
import sounddevice as sd
import numpy as np
import whisper
import wavio
import time
import scipy.io.wavfile as wav
from vosk import Model, KaldiRecognizer
import json

AUDIO TO TEXT TRASNCRIPTION

In [5]:
def record_audio(filename="output.wav", duration=5, samplerate=16000):
    """Record audio from mic and save to a file"""
    print(f"🎙️ Recording for {duration} seconds...")
    recording = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype=np.int16)
    sd.wait()  # Wait until recording finishes
    wavio.write(filename, recording, samplerate, sampwidth=2)
    print(f"✅ Saved audio as {filename}")
    return filename

In [8]:
audio_file = record_audio(duration=5)

🎙️ Recording for 5 seconds...
✅ Saved audio as output.wav


In [4]:
def transcribe_audio(filename):
    """Transcribe audio using Whisper"""
    print("📝 Transcribing...")
    result = model.transcribe(filename)
    text = result["text"].strip()
    return text

In [6]:
model = whisper.load_model("base")
audio_file = "output.wav"
print(audio_file)
text = transcribe_audio(audio_file)

output.wav
📝 Transcribing...


In [7]:
text

"It's not a big deal that I am trying to do this for ourselves."

AUDIO EMBEDDINGS

In [19]:
import torch
import torchaudio
from transformers import Wav2Vec2Model, Wav2Vec2Processor
import soundfile as sf

torchaudio.set_audio_backend("soundfile")


def load_audio(path):
    try:
        waveform, sr = torchaudio.load(path)
    except RuntimeError:
        # fallback to soundfile
        data, sr = sf.read(path, dtype="float32")
        waveform = torch.tensor(data).unsqueeze(0) if data.ndim == 1 else torch.tensor(data).T
    return waveform, sr


  torchaudio.set_audio_backend("soundfile")


In [None]:
# Load model + processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
def get_audio_embedding(audio_file):
    waveform, sr = load_audio(audio_file)
    inputs = processor(waveform, sampling_rate=sr, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling over time
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embedding

In [24]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import soundfile as sf
import torch

# load audio
def load_audio(path):
    data, sr = sf.read(path, dtype="float32")
    if data.ndim > 1:  # stereo -> mono
        data = data.mean(axis=1)
    waveform = torch.tensor(data)  # [time]
    return waveform, sr

waveform, sr = load_audio("output.wav")
print(waveform.shape, sr) 


processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

# prepare inputs
inputs = processor(waveform, sampling_rate=sr, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# mean-pool over time for fixed-length embedding
embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

print("✅ Embedding shape:", embedding.shape)
print("First 10 dims:", embedding[:10])


torch.Size([80000]) 16000


Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Embedding shape: (768,)
First 10 dims: [-0.01119173  0.01588007 -0.08928814 -0.07473806  0.05007917 -0.08230261
  0.05800665 -0.0366033   0.15863574 -0.22816195]


ACOUSTIC FEATURES

In [25]:
import librosa
import numpy as np

y, sr = librosa.load("output.wav", sr=None)

# Pitch (tone) using librosa.yin
pitch = librosa.yin(y, fmin=50, fmax=300)
mean_pitch = np.mean(pitch)

# Intensity (RMS Energy)
rms = librosa.feature.rms(y=y)[0]
mean_rms = np.mean(rms)

# MFCCs (timbre)
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
mfcc_means = np.mean(mfccs, axis=1)

print("Mean Pitch (Hz):", mean_pitch)
print("Mean Loudness (RMS):", mean_rms)
print("MFCC shape:", mfccs.shape)


Mean Pitch (Hz): 146.02637684880412
Mean Loudness (RMS): 0.09505295
MFCC shape: (13, 157)


SPEECH EMOTION RECOGNITION

In [33]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, AutoFeatureExtractor
import torch, librosa
import torch.nn.functional as F

try:
    del model
except NameError:
    pass

extractor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er")
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "superb/wav2vec2-base-superb-er", 
    trust_remote_code=True, 
    use_safetensors=True
)

# Load audio
y, sr = librosa.load("output.wav", sr=16000)

# Extract features
inputs = extractor(y, sampling_rate=16000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(**inputs).logits

# Convert logits → probabilities
probs = F.softmax(logits, dim=-1).squeeze()

# Get labels
labels = model.config.id2label

# Print probabilities
for i, p in enumerate(probs):
    print(f"{labels[i]}: {p.item():.4f}")

# Most likely emotion
pred = torch.argmax(probs).item()
print("\nPredicted Emotion:", labels[pred])

Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]


neu: 0.0628
hap: 0.0020
ang: 0.9352
sad: 0.0000

Predicted Emotion: ang


In [31]:
import torch
print(torch.__version__)  # should be >= 2.6


2.5.1+cu121
