In [1]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import torch
from torch import nn

class CantoneseMoodAnalyzer(nn.Module):
    def __init__(self):
        super().__init__()
        # Load Whisper Cantonese model
        self.processor = WhisperProcessor.from_pretrained("whisper-small-cantonese")
        self.whisper = WhisperForConditionalGeneration.from_pretrained("whisper-small-cantonese")

        # Add mood classification head
        self.mood_classifier = nn.Sequential(
            nn.Linear(self.whisper.config.hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 7)  # 7 basic emotions: anger, disgust, fear, happiness, sadness, surprise, neutral
        )

    def forward(self, audio_input):
        # Process audio with Whisper
        features = self.processor(audio_input, return_tensors="pt", sampling_rate=16000)
        whisper_output = self.whisper(**features)

        # Extract hidden states for mood classification
        hidden_states = whisper_output.last_hidden_state.mean(dim=1)

        # Classify mood
        mood_logits = self.mood_classifier(hidden_states)
        return mood_logits

In [2]:
import torch.utils.data as data
import torchaudio

class CantoneseMoodDataset(data.Dataset):
    def __init__(self, audio_paths, mood_labels):
        self.audio_paths = audio_paths
        self.mood_labels = mood_labels

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        # Load audio file
        audio, sr = torchaudio.load(self.audio_paths[idx])

        # Resample if necessary (Whisper expects 16kHz)
        if sr != 16000:
            resampler = torchaudio.transforms.Resample(sr, 16000)
            audio = resampler(audio)

        return audio.squeeze(), self.mood_labels[idx]

In [3]:
def train_mood_analyzer(model, train_loader, num_epochs=10):
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        for batch_idx, (audio, labels) in enumerate(train_loader):
            optimizer.zero_grad()

            # Forward pass
            outputs = model(audio)
            loss = criterion(outputs, labels)

            # Backward pass
            loss.backward()
            optimizer.step()

            if batch_idx % 100 == 0:
                print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item():.4f}")

Testing:

In [4]:
import torch
import torchaudio
from torch.utils.data import DataLoader

def test_single_audio():
    # Initialize model
    model = CantoneseMoodAnalyzer()
    model.eval()  # Set to evaluation mode

    # Load a test audio file
    test_audio_path = "m.mp3"
    audio, sr = torchaudio.load(test_audio_path)

    # Make prediction
    with torch.no_grad():
        mood_logits = model(audio)

    # Convert logits to mood prediction
    mood_labels = ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral']
    predicted_mood = mood_labels[torch.argmax(mood_logits).item()]

    print(f"Predicted mood: {predicted_mood}")



In [5]:
def create_test_dataset():
    # Create small test dataset
    test_audio_paths = [
        "m.mp3"
    ]

    test_labels = [0, 3, 6]  # Example labels: angry, happy, neutral

    return CantoneseMoodDataset(test_audio_paths, test_labels)

In [6]:
def test_model():
    # Initialize model and test dataset
    model = CantoneseMoodAnalyzer()
    test_dataset = create_test_dataset()
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

    # Set model to evaluation mode
    model.eval()

    # Initialize metrics
    correct = 0
    total = 0

    # Mood labels for interpretation
    mood_labels = ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral']

    with torch.no_grad():
        for audio, labels in test_loader:
            # Make prediction
            outputs = model(audio)
            predictions = torch.argmax(outputs, dim=1)

            # Calculate accuracy
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

            # Print prediction for each sample
            print(f"True mood: {mood_labels[labels.item()]}")
            print(f"Predicted mood: {mood_labels[predictions.item()]}")
            print("---")

    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")

In [7]:
def test_audio_processing():
    # Test audio loading and preprocessing
    test_dataset = create_test_dataset()

    # Get first sample
    audio, label = test_dataset[0]

    # Print audio properties
    print(f"Audio shape: {audio.shape}")
    print(f"Audio type: {audio.dtype}")
    print(f"Label: {label}")

    # Check for NaN values
    print(f"Contains NaN: {torch.isnan(audio).any()}")

    # Check audio range
    print(f"Audio min: {audio.min()}")
    print(f"Audio max: {audio.max()}")

In [8]:
import librosa
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Load your specific audio file
y, sr = librosa.load('m.mp3', sr=16000)

MODEL_NAME = "alvanlii/whisper-small-cantonese"

processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)

processed_in = processor(y, sampling_rate=sr, return_tensors="pt")
gout = model.generate(
    input_features=processed_in.input_features,
    output_scores=True,
    return_dict_in_generate=True
)
transcription = processor.batch_decode(gout.sequences, skip_special_tokens=True)[0]
print("Transcription:", transcription)

# Simple mood detection based on the transcription
cantonese_mood_keywords = {
    '開心': 'happy',
    '高興': 'happy',
    '笑': 'happy',
    '傷心': 'sad',
    '嬲': 'angry',
    '怒': 'angry',
    '平靜': 'neutral',
    '驚': 'scared',
    '緊張': 'nervous'
}

# Determine mood from transcription
detected_mood = 'neutral'  # default mood
for keyword, mood in cantonese_mood_keywords.items():
    if keyword in transcription:
        detected_mood = mood
        break

print("Detected mood:", detected_mood)

  y, sr = librosa.load('m.mp3', sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: 'm.mp3'