# MusicGen EMOTIFY Training 

**Critical Fixes Applied:**
1. ✅ Disabled dropout (causes NaN in training mode)
2. ✅ Correct conditioning structure (no unpacking)
3. ✅ Proper batch tuple unpacking
4. ✅ Pure Float32 training
5. ✅ No gradient accumulation
6. ✅ Conservative learning rate with warmup

In [None]:
# ============================================================
# Setup
# ============================================================

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torchaudio
from pathlib import Path
import numpy as np
from audiocraft.models import MusicGen
from audiocraft.modules.conditioners import ConditioningAttributes
import matplotlib.pyplot as plt

# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
# ============================================================
# Configuration - STABLE SETTINGS
# ============================================================

# Paths
# AUDIO_DIR holds the "<track_id>.mp3" files opened by EmotifyDataset;
# EMOTIFY_CSV is the per-track table read by process_emotify_csv().
AUDIO_DIR = "D:/EMOTIFY/audio"
EMOTIFY_CSV = "D:/EMOTIFY/summed_emotions.csv"

# Training - OPTIMIZED
BATCH_SIZE = 4
NUM_EPOCHS = 20                    # More epochs
SEGMENT_DURATION = 10.0            # Clip length; multiplied by SAMPLE_RATE to get samples
LEARNING_RATE = 5e-6               # <- Higher (was 1e-6); peak LR reached after warmup
MIN_LR = 1e-7                      # For cosine schedule (floor of the decay)
WARMUP_STEPS = 50                  # <- Shorter warmup (was 100)
MAX_GRAD_NORM = 1.0                # <- Less aggressive clipping (was 0.5)

# Cosine annealing schedule
def get_lr(step, total_steps, warmup_steps, base_lr, min_lr):
    """Return the learning rate for ``step``: linear warmup, then cosine decay.

    Args:
        step: Zero-based index of the optimizer update.
        total_steps: Number of updates over which the schedule is defined.
        warmup_steps: Number of initial updates that ramp linearly up to
            ``base_lr`` (update ``warmup_steps - 1`` uses exactly ``base_lr``).
        base_lr: Peak learning rate.
        min_lr: Learning rate at, and after, the end of the schedule.

    Returns:
        The learning rate as a plain Python ``float``.
    """
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps

    decay_steps = total_steps - warmup_steps
    if decay_steps <= 0:
        # Warmup already covers the whole run, so there is nothing left to
        # anneal (and no valid denominator for the progress ratio).
        return float(min_lr)

    # Clamp to [0, 1]: the cosine is periodic, so without the clamp the rate
    # would climb back towards base_lr for steps beyond total_steps.
    progress = min(max((step - warmup_steps) / decay_steps, 0.0), 1.0)
    # float(): np.cos yields np.float64, which would otherwise end up inside
    # the optimizer's param groups and any checkpoint saved from them.
    return float(min_lr + 0.5 * (base_lr - min_lr) * (1 + np.cos(np.pi * progress)))


# CRITICAL: no mixed precision, no gradient accumulation.
# NOTE(review): neither flag is read by the code visible in this notebook;
# they record the intended setup rather than switch anything on or off.
USE_AUTOCAST = False
GRADIENT_ACCUMULATION = 1

# Audio
# SAMPLE_RATE is the rate EmotifyDataset resamples every clip to.
# NOTE(review): CODEBOOKS and CARDINALITY are not referenced by the visible
# code (the training loop takes both sizes from logits.shape); presumably they
# mirror the model's token layout - confirm before relying on them.
SAMPLE_RATE = 32000
CODEBOOKS = 4
CARDINALITY = 2048

In [None]:
# ============================================================
# Data Processing
# ============================================================

def process_emotify_csv(csv_path):
    """Turn the EMOTIFY CSV into a table of text prompts, one row per track.

    The CSV must provide the columns 'track id', 'genre' and the nine emotion
    score columns listed below. For every row the two highest-scoring
    emotions with a score above zero are named in the prompt; ties keep the
    column order shown here.

    Returns:
        A DataFrame with the columns 'track_id', 'text', 'genre' and
        'top_emotions' (a list holding zero, one or two emotion names).
    """
    emotion_cols = ['amazement', 'solemnity', 'tenderness', 'nostalgia',
                    'calmness', 'power', 'joyful_activation', 'tension', 'sadness']

    def describe(row):
        """Build the output record for a single CSV row."""
        track_id = row['track id']
        genre = row['genre']

        # Highest score first; sorted() is stable, so equal scores stay in
        # emotion_cols order.
        ranked = sorted(
            ((name, row[name]) for name in emotion_cols),
            key=lambda pair: pair[1],
            reverse=True,
        )
        leading = [name for name, score in ranked[:2] if score > 0]

        if leading:
            text = f"A {genre} music track characterized by {', '.join(leading)}."
        else:
            text = f"A {genre} music track."

        return {
            'track_id': track_id,
            'text': text,
            'genre': genre,
            'top_emotions': leading,
        }

    table = pd.read_csv(csv_path)
    return pd.DataFrame([describe(row) for _, row in table.iterrows()])

# Build the prompt table once, then report its size and preview the first rows.
df_processed = process_emotify_csv(EMOTIFY_CSV)
print(f"\nProcessed {len(df_processed)} tracks")
print("\nSample descriptions:", df_processed.head(), sep="\n")

In [None]:
# ============================================================
# DATA AUGMENTATION
# ============================================================

class EmotifyDataset(Dataset):
    """Serve ``(audio, text)`` training pairs for the processed EMOTIFY table.

    Every item is a mono clip of shape ``(1, segment_samples)`` at
    ``sample_rate`` together with the row's 'text' prompt. The clip is cut at
    a random position (or zero-padded when the track is shorter), scaled so
    its peak is 1, and then optionally augmented.

    Args:
        df: Table with at least the columns 'track_id' and 'text'.
        audio_dir: Directory containing one ``<track_id>.mp3`` per row.
        segment_duration: Clip length; ``segment_duration * sample_rate`` is
            the clip length in samples.
        sample_rate: Rate every track is resampled to.
        augment: When true, apply random gain and occasional Gaussian noise.
    """

    def __init__(self, df, audio_dir, segment_duration, sample_rate, augment=True):
        self.df = df.reset_index(drop=True)
        self.audio_dir = Path(audio_dir)
        self.segment_duration = segment_duration
        self.sample_rate = sample_rate
        self.segment_samples = int(segment_duration * sample_rate)
        self.augment = augment

    def __len__(self):
        return len(self.df)

    def _crop_or_pad(self, audio):
        """Return exactly ``segment_samples`` samples along dim 1.

        Longer inputs get a uniformly random window; shorter (or equal) ones
        are zero-padded on the right.
        """
        total = audio.shape[1]
        if total > self.segment_samples:
            # torch.randint excludes its upper bound, so "+ 1" is required for
            # the final window (the one ending on the last sample) to be drawn.
            start = torch.randint(0, total - self.segment_samples + 1, (1,)).item()
            return audio[:, start:start + self.segment_samples]
        return F.pad(audio, (0, self.segment_samples - total))

    def _augment(self, audio):
        """Apply random gain in [0.8, 1.2] and, 30% of the time, faint noise.

        Expects peak-normalised input. The result is clamped to [-1, 1], so a
        gain above 1 clips only the samples it pushes past full scale.
        """
        gain = torch.empty(1).uniform_(0.8, 1.2).item()
        audio = audio * gain

        if torch.rand(1).item() < 0.3:
            audio = audio + torch.randn_like(audio) * 0.005

        return audio.clamp(-1.0, 1.0)

    def __getitem__(self, idx):
        """Load, condition and return the ``(audio, text)`` pair for row ``idx``."""
        row = self.df.iloc[idx]
        audio_path = self.audio_dir / f"{row['track_id']}.mp3"

        audio, sr = torchaudio.load(audio_path)

        # Convert to mono
        if audio.shape[0] > 1:
            audio = audio.mean(dim=0, keepdim=True)

        # Resample
        if sr != self.sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.sample_rate)
            audio = resampler(audio)

        # Random segment (from a different position on every access)
        audio = self._crop_or_pad(audio)

        # Peak-normalise BEFORE augmenting. Normalising afterwards divides the
        # random gain straight back out, which made that augmentation a no-op.
        audio = audio / (audio.abs().max() + 1e-8)

        # DATA AUGMENTATION
        if self.augment:
            audio = self._augment(audio)

        return (audio, row['text'])

# Wrap the prompt table in a dataset (augmentation left at its default) and
# serve it in shuffled mini-batches loaded in the main process.
dataset = EmotifyDataset(
    df=df_processed,
    audio_dir=AUDIO_DIR,
    segment_duration=SEGMENT_DURATION,
    sample_rate=SAMPLE_RATE,
)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)
print(f"Dataset size: {len(dataset)} samples")
print(f"Batches per epoch: {len(dataloader)}")

In [None]:
# ============================================================
# Model Loading with DROPOUT FIX
# ============================================================

print("Loading MusicGen model...")
model = MusicGen.get_pretrained('facebook/musicgen-small')

# Train in full precision: cast both sub-models to Float32 ahead of the
# explicit device move below.
print("Converting to Float32...")
model.lm = model.lm.float()
model.compression_model = model.compression_model.float()

# CRITICAL FIX: zero the drop probability of every nn.Dropout module inside
# the language model so that train() mode applies no dropout (the notebook
# header reports NaNs with dropout enabled).
print("Disabling dropout layers...")
dropout_count = 0
for module in model.lm.modules():
    if isinstance(module, torch.nn.Dropout):
        module.p = 0.0
        dropout_count += 1
print(f"✓ Disabled {dropout_count} dropout layers")

# Move to device
model.lm = model.lm.to(device)
model.compression_model = model.compression_model.to(device)

# Set modes: the language model is trained, the compression model is only
# used for encoding and stays in eval mode.
model.lm.train()
model.compression_model.eval()

# Count parameters
trainable_params = sum(p.numel() for p in model.lm.parameters() if p.requires_grad)
print(f"\n✓ Model loaded successfully")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Model dtype: {next(model.lm.parameters()).dtype}")
print(f"Model device: {next(model.lm.parameters()).device}")

In [None]:
# ============================================================
# IMPROVED TRAINING LOOP
# ============================================================

# Configuration
# Hyper-parameters for this run, restated so the cell can be re-run by itself.
NUM_EPOCHS = 20
LEARNING_RATE = 5e-6
MIN_LR = 1e-7
WARMUP_STEPS = 50
MAX_GRAD_NORM = 1.0

# Length of the LR schedule: one optimizer update per batch, every epoch.
total_steps = len(dataloader) * NUM_EPOCHS

# AdamW over the language-model weights only.
optimizer = torch.optim.AdamW(
    params=model.lm.parameters(),
    lr=LEARNING_RATE,
    weight_decay=0.1,
    betas=(0.9, 0.95),
)

def get_lr(step):
    """Cosine annealing with warmup.

    Reads the module-level ``LEARNING_RATE``, ``MIN_LR``, ``WARMUP_STEPS`` and
    ``total_steps``. The rate ramps linearly up to ``LEARNING_RATE`` during
    the first ``WARMUP_STEPS`` updates, then follows a half cosine down to
    ``MIN_LR`` at ``total_steps`` and stays there.

    Args:
        step: Zero-based index of the optimizer update.

    Returns:
        The learning rate as a plain Python ``float``.
    """
    if step < WARMUP_STEPS:
        return LEARNING_RATE * (step + 1) / WARMUP_STEPS

    decay_steps = total_steps - WARMUP_STEPS
    if decay_steps <= 0:
        # Warmup already spans the whole run: nothing to anneal, and the
        # progress ratio below would divide by zero.
        return float(MIN_LR)

    # Clamp to [0, 1]: the cosine is periodic, so an unclamped ratio would
    # raise the rate again for steps beyond total_steps.
    progress = min(max((step - WARMUP_STEPS) / decay_steps, 0.0), 1.0)
    # float(): keep np.float64 out of the optimizer's param groups (and out of
    # checkpoints that store the optimizer state).
    return float(MIN_LR + 0.5 * (LEARNING_RATE - MIN_LR) * (1 + np.cos(np.pi * progress)))

# Switch dropout off again right before training: every nn.Dropout module of
# the language model, plus its cfg_dropout module when the model has one.
for module in filter(lambda m: isinstance(m, torch.nn.Dropout), model.lm.modules()):
    module.p = 0.0
if hasattr(model.lm, 'cfg_dropout'):
    model.lm.cfg_dropout.p = 0.0

# Only the language model is put in training mode.
model.compression_model.eval()
model.lm.train()

# Per-update metrics (consumed by the plotting cell) and run-level state.
training_history = {'loss': [], 'grad_norm': [], 'lr': []}
global_step = 0              # number of optimizer updates actually applied
best_loss = float('inf')

print(f"Total steps: {total_steps}")
print(f"Learning rate: {LEARNING_RATE} → {MIN_LR}")
print("="*70)

for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")
    print("-"*40)

    epoch_loss = 0.0
    num_batches = 0          # batches that produced an optimizer update
    skipped_batches = 0      # batches dropped by one of the guards below

    for batch_idx, batch in enumerate(dataloader):
        audio = batch[0].to(device)
        descriptions = batch[1]

        # Encode the waveform into discrete codes; the compression model is
        # frozen, so no graph is needed.
        with torch.no_grad():
            codes, _ = model.compression_model.encode(audio)

        # Conditioning: one text description per clip.
        attrs = [ConditioningAttributes(text={'description': desc}) for desc in descriptions]

        # Forward
        output = model.lm.compute_predictions(codes, attrs, None)
        logits = output.logits
        mask = output.mask

        # Cross-entropy per codebook, restricted to the positions the model
        # marks as valid.
        B, K, T, card = logits.shape
        codebook_losses = []

        for k in range(K):
            logits_k = logits[:, k, :, :].reshape(-1, card)
            targets_k = codes[:, k, :].reshape(-1)
            mask_k = mask[:, k, :].reshape(-1)

            valid_logits = logits_k[mask_k]
            valid_targets = targets_k[mask_k]

            if valid_logits.numel() > 0:
                codebook_losses.append(F.cross_entropy(valid_logits, valid_targets))

        if not codebook_losses:
            # No valid position in any codebook: there is no loss tensor to
            # test or back-propagate, so drop the batch explicitly.
            skipped_batches += 1
            print(f"  Batch {batch_idx}: skipped (no valid positions)")
            continue

        # Average over all K codebooks.
        loss = torch.stack(codebook_losses).sum() / K

        if not torch.isfinite(loss):
            skipped_batches += 1
            print(f"  Batch {batch_idx}: skipped (non-finite loss)")
            continue

        # Backward
        optimizer.zero_grad()
        loss.backward()
        # clip_grad_norm_ returns the total norm measured BEFORE clipping.
        grad_norm = torch.nn.utils.clip_grad_norm_(model.lm.parameters(), MAX_GRAD_NORM)

        if not torch.isfinite(grad_norm) or grad_norm > 100.0:
            # Exploding or invalid gradients: skip the update. The stale
            # gradients are cleared by zero_grad() on the next batch.
            skipped_batches += 1
            print(f"  Batch {batch_idx}: skipped (grad norm {grad_norm.item():.2f})")
            continue

        # Update LR (cosine schedule). float() keeps a NumPy scalar out of the
        # optimizer state that is written into the "best" checkpoint.
        current_lr = float(get_lr(global_step))
        for param_group in optimizer.param_groups:
            param_group['lr'] = current_lr

        optimizer.step()
        global_step += 1

        # Track
        loss_value = loss.item()
        epoch_loss += loss_value
        num_batches += 1
        training_history['loss'].append(loss_value)
        training_history['grad_norm'].append(grad_norm.item())
        training_history['lr'].append(current_lr)

        if batch_idx % 10 == 0:
            print(f"  Batch {batch_idx}: loss={loss_value:.4f}, grad={grad_norm:.2f}, lr={current_lr:.2e}")

    # Epoch summary. An epoch without a single successful update must not
    # report 0.0: that would beat every real loss and lock in a bogus "best".
    avg_loss = epoch_loss / num_batches if num_batches > 0 else float('inf')
    print(f"\nEpoch {epoch+1}: avg_loss={avg_loss:.4f}")
    if skipped_batches:
        print(f"  ({skipped_batches} batch(es) skipped)")

    # Save best model
    if avg_loss < best_loss:
        best_loss = avg_loss
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.lm.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_loss,
        }, 'musicgen_emotify_best.pt')
        print(f"✓ New best model saved! (loss={avg_loss:.4f})")

    # Regular checkpoint (weights only, one file per epoch)
    torch.save({
        'epoch': epoch + 1,
        'model_state_dict': model.lm.state_dict(),
        'loss': avg_loss,
    }, f'musicgen_emotify_epoch{epoch+1}.pt')

print("\n" + "="*70)
print("✓ TRAINING COMPLETE!")
print(f"Best loss: {best_loss:.4f}")
print("="*70)

In [None]:
# ============================================================
# Plot Training Metrics
# ============================================================

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One panel per recorded metric: (axes, training_history key, title, y-label).
panels = [
    (axes[0, 0], 'loss', 'Training Loss', 'Loss'),
    (axes[0, 1], 'grad_norm', 'Gradient Norm', 'Grad Norm'),
    (axes[1, 0], 'lr', 'Learning Rate', 'LR'),
]

for ax, key, title, ylabel in panels:
    ax.plot(training_history[key])
    ax.set_title(title)
    ax.set_xlabel('Step')
    ax.set_ylabel(ylabel)
    ax.grid(True)

# training_history has no 'perplexity' series, so the fourth panel of the
# 2x2 grid has nothing to show; hide it instead of leaving an empty frame.
axes[1, 1].axis('off')

plt.tight_layout()
plt.savefig('training_metrics.png', dpi=150)
plt.show()

print("✓ Metrics plotted and saved")

In [None]:
# ============================================================
# RELOAD TRAINED MODEL FROM CHECKPOINT
# ============================================================

import torch
import torchaudio
from audiocraft.models import MusicGen
from audiocraft.modules.conditioners import ConditioningAttributes

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# 1. Load base model
print("Loading base MusicGen model...")
model = MusicGen.get_pretrained('facebook/musicgen-small')

# 2. Convert to Float32 (the precision the checkpoint was trained in)
model.lm = model.lm.float()
model.compression_model = model.compression_model.float()

# 3. Move to the selected device
model.lm = model.lm.to(device)
model.compression_model = model.compression_model.to(device)

# 4. Load the fine-tuned language-model weights
checkpoint_path = "musicgen_emotify_best.pt"  # <- point at an epoch checkpoint if preferred
print(f"Loading checkpoint: {checkpoint_path}")

# Deserialize on the CPU: the "best" checkpoint also stores the optimizer
# state, which would otherwise be materialized on the GPU for nothing.
# load_state_dict() copies the weights onto the model's own device.
# NOTE(review): torch.load unpickles the file - only open checkpoints you
# created yourself.
checkpoint = torch.load(checkpoint_path, map_location="cpu")
model.lm.load_state_dict(checkpoint['model_state_dict'])

print(f"✓ Loaded checkpoint from epoch {checkpoint.get('epoch', 'unknown')}")
print(f"✓ Loss was: {checkpoint.get('loss', 'unknown')}")

# 5. Set to eval mode
model.lm.eval()
model.compression_model.eval()

print("\n✓ Model ready for generation!")

In [None]:
# ============================================================
# GENERATE 10 SECOND MUSIC (FIXED!)
# ============================================================

from IPython.display import Audio, display

model.lm.eval()

# CRITICAL: set the duration BEFORE generating
model.set_generation_params(
    duration=10.0,      # 10 seconds (was defaulting to 1 second!)
    top_k=250,          # Sampling parameter
    top_p=0.0,          # Disable nucleus sampling
    temperature=1.0,    # Creativity (1.0 = balanced)
    cfg_coef=3.0        # How strongly to follow the prompt
)

# Test different prompts
prompts = [
    "A classical song that evokes great nostalgia and tenderness",
    "An electronic track with high energy and joyful activation",
    "A pop song with high amazement and some tension",
    "A rock song with high power and some sadness"
]

# Use the rate the model actually produces rather than a hard-coded 32000, so
# saved files and playback stay correct if another checkpoint is loaded.
output_rate = model.sample_rate

for i, prompt in enumerate(prompts):
    print(f"\n{'='*60}")
    print(f"Generating: {prompt}")
    print('='*60)

    with torch.no_grad():
        audio = model.generate(
            descriptions=[prompt],
            progress=True
        )

    # One clip per prompt: (batch, channels, samples)
    print(f"Generated shape: {audio.shape}")

    # Save
    filename = f"generated_{i+1}.wav"
    torchaudio.save(filename, audio[0].cpu(), output_rate)
    print(f"✓ Saved: {filename}")

    # Play
    display(Audio(audio[0].cpu().numpy(), rate=output_rate))

In [None]:
# ============================================================
# GENERATE 10 SECOND MUSIC (FIXED!)
# ============================================================
from IPython.display import Audio, display

# Emotion labels available in the training data, for reference when writing
# prompts:
#   amazement, solemnity, tenderness, nostalgia, calmness,
#   power, joyful_activation, tension, sadness

model.lm.eval()

# CRITICAL: set the duration BEFORE generating
model.set_generation_params(
    duration=10.0,      # 10 seconds (was defaulting to 1 second!)
    top_k=250,          # Sampling parameter
    top_p=0.0,          # Disable nucleus sampling
    temperature=1.0,    # Creativity (1.0 = balanced)
    cfg_coef=3.0        # How strongly to follow the prompt
)

# Same genre throughout; each group of three covers two single emotions and
# then their combination.
prompts = [
    "A classical song that evokes great nostalgia",
    "A classical song that evokes great tenderness",
    "A classical song that evokes great nostalgia and tenderness",

    "A classical song that evokes calmness",
    "A classical song that evokes solemnity",
    "A classical song that evokes calmness and solemnity",

    "A classical track with high joyful activation",
    "A classical track with high tension",
    "A classical track with high joyful activation and tension",

    "A classical song with high power",
    "A classical song with high sadness",
    "A classical song with high power and some sadness"
]

# Use the rate the model actually produces rather than a hard-coded 32000.
output_rate = model.sample_rate

for i, prompt in enumerate(prompts):
    print(f"\n{'='*60}")
    print(f"Generating: {prompt}")
    print('='*60)

    with torch.no_grad():
        audio = model.generate(
            descriptions=[prompt],
            progress=True
        )

    # One clip per prompt: (batch, channels, samples)
    print(f"Generated shape: {audio.shape}")

    # Save
    filename = f"generated_classical_emotional_comparison{i+1}.wav"
    torchaudio.save(filename, audio[0].cpu(), output_rate)
    print(f"✓ Saved: {filename}")

    # Play
    display(Audio(audio[0].cpu().numpy(), rate=output_rate))

In [None]:
# ============================================================
# BASE MODEL vs FINE-TUNED MODEL COMPARISON
# ============================================================

import torch
import torchaudio
from audiocraft.models import MusicGen
from IPython.display import Audio, display
import os

device = "cuda" if torch.cuda.is_available() else "cpu"


def _load_musicgen(lm_state_dict=None):
    """Return musicgen-small in Float32 eval mode, ready for 10 s generation.

    Args:
        lm_state_dict: Optional state dict loaded into the language model
            after the pretrained weights; ``None`` keeps the base weights.
    """
    loaded = MusicGen.get_pretrained('facebook/musicgen-small')

    loaded.lm = loaded.lm.float().to(device)
    loaded.compression_model = loaded.compression_model.float().to(device)

    if lm_state_dict is not None:
        loaded.lm.load_state_dict(lm_state_dict)

    loaded.lm.eval()
    loaded.compression_model.eval()

    # Identical sampling settings for both models keep the comparison fair.
    loaded.set_generation_params(duration=10.0, top_k=250, cfg_coef=3.0)
    return loaded


# ============================================================
# 1. Load BASE model (no fine-tuning)
# ============================================================
print("Loading BASE model...")
base_model = _load_musicgen()
print("✓ Base model loaded")

# ============================================================
# 2. Load FINE-TUNED model
# ============================================================
print("\nLoading FINE-TUNED model...")
# Deserialize on the CPU: the checkpoint also carries optimizer state that
# generation never needs on the GPU; load_state_dict() copies the weights
# onto the model's device.
checkpoint = torch.load('musicgen_emotify_best.pt', map_location="cpu")
finetuned_model = _load_musicgen(checkpoint['model_state_dict'])

# Format the loss only when it is numeric: applying ":.4f" to a missing-key
# placeholder string raises ValueError.
saved_loss = checkpoint.get('loss')
loss_text = f"{saved_loss:.4f}" if isinstance(saved_loss, (int, float)) else "N/A"
print(f"✓ Fine-tuned model loaded (loss={loss_text})")

# ============================================================
# 3. Test prompts
# ============================================================
# Twelve classical prompts in four groups of three: two single emotions, then
# the two combined.
prompts = (
    [
        f"A classical song that evokes {feeling}"
        for feeling in (
            "great nostalgia",
            "great tenderness",
            "great nostalgia and tenderness",
            "calmness",
            "solemnity",
            "calmness and solemnity",
        )
    ]
    + [
        f"A classical track with high {feeling}"
        for feeling in (
            "joyful activation",
            "tension",
            "joyful activation and tension",
        )
    ]
    + [
        f"A classical song with high {feeling}"
        for feeling in (
            "power",
            "sadness",
            "power and some sadness",
        )
    ]
)

# One output directory per model; re-running the cell is harmless.
for out_dir in ("comparison_base", "comparison_finetuned"):
    os.makedirs(out_dir, exist_ok=True)

# ============================================================
# 4. Generate from BOTH models
# ============================================================
def _generate_and_save(gen_model, prompt, path):
    """Generate one clip for ``prompt`` with ``gen_model`` and write it to ``path``.

    The file is written at the model's own sample rate. Returns the generated
    audio tensor so the caller can also play it back.
    """
    with torch.no_grad():
        wav = gen_model.generate(descriptions=[prompt], progress=True)
    torchaudio.save(path, wav[0].cpu(), gen_model.sample_rate)
    return wav


for i, prompt in enumerate(prompts):
    print(f"\n{'='*70}")
    print(f"[{i+1}/{len(prompts)}] {prompt}")
    print('='*70)

    # Clean filename: spaces become underscores, commas are dropped, and the
    # result is capped at 50 characters.
    safe_name = prompt.replace(" ", "_").replace(",", "")[:50]

    # --- BASE MODEL ---
    print("\n🔵 BASE MODEL generating...")
    base_filename = f"comparison_base/{i+1:02d}_{safe_name}.wav"
    base_audio = _generate_and_save(base_model, prompt, base_filename)
    print(f"✓ Saved: {base_filename}")

    print("BASE MODEL output:")
    display(Audio(base_audio[0].cpu().numpy(), rate=base_model.sample_rate))

    # --- FINE-TUNED MODEL ---
    print("\n🟢 FINE-TUNED MODEL generating...")
    finetuned_filename = f"comparison_finetuned/{i+1:02d}_{safe_name}.wav"
    finetuned_audio = _generate_and_save(finetuned_model, prompt, finetuned_filename)
    print(f"✓ Saved: {finetuned_filename}")

    print("FINE-TUNED MODEL output:")
    display(Audio(finetuned_audio[0].cpu().numpy(), rate=finetuned_model.sample_rate))
