## 1. Install Dependencies and Import Libraries

In [None]:
!pip install -q stable-audio-tools torch torchaudio einops numpy

import torch
import torchaudio
from pathlib import Path
from typing import Optional, Dict, Any
import time
from einops import rearrange
from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond

# Report the installed PyTorch build and whether a CUDA device is visible.
print(
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)
if torch.cuda.is_available():
    # Device index 0 is the GPU the rest of the notebook queries as well.
    print(
        f"GPU: {torch.cuda.get_device_name(0)}",
        f"CUDA Version: {torch.version.cuda}",
        sep="\n",
    )

## 2. Set Up GPU and Model Configuration

In [None]:
# Pick the compute device: the GPU when PyTorch can see one, the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

if device == "cuda":
    # Release cached allocator blocks, then limit this process to a 0.9
    # fraction of the GPU's memory before the model is loaded.
    torch.cuda.empty_cache()
    torch.cuda.set_per_process_memory_fraction(0.9)
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")

# Every generated WAV file is written below this directory.
output_dir = Path("/kaggle/working/generated_audio")
output_dir.mkdir(exist_ok=True, parents=True)
print(f"Output directory: {output_dir}")

## 3. Initialize the Stable Audio Model

In [None]:
# Fetch the pretrained model together with its config, move it to the
# selected device, and time the whole step.
print("Loading Stable Audio model... (this may take a moment)")
start_time = time.time()

model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
model = model.to(device)

# Audio parameters read by the generation and saving cells.
sample_rate = model_config["sample_rate"]
sample_size = model_config["sample_size"]

elapsed = time.time() - start_time
print(
    f"✓ Model loaded in {elapsed:.1f}s",
    f"  Sample rate: {sample_rate} Hz",
    f"  Sample size: {sample_size}",
    f"  Device: {device}",
    sep="\n",
)

## 4. Generate Sound Effects with GPU Acceleration

In [None]:
# Largest seed this notebook forwards to the sampler (signed 32-bit maximum).
_MAX_SEED = 2**31 - 1


def _resolve_seed(seed: Optional[int]) -> int:
    """Turn the caller's seed request into a concrete seed for the sampler.

    ``None`` and ``-1`` both mean "pick one for me": a fresh random seed in
    ``[0, _MAX_SEED]`` is drawn here so the library's own random-seed path is
    never taken. Seeds above ``_MAX_SEED`` are folded into range with a modulo,
    which keeps them deterministic and keeps distinct seeds mostly distinct.
    Any other value is returned unchanged.
    """
    if seed is None or seed == -1:
        import secrets  # local import keeps this cell self-contained

        return secrets.randbelow(_MAX_SEED + 1)
    if seed > _MAX_SEED:
        return seed % (_MAX_SEED + 1)
    return seed


def generate_audio(
    prompt: str,
    duration: int = 30,
    guidance_scale: float = 7.0,
    seed: Optional[int] = None,
    steps: int = 100,
) -> torch.Tensor:
    """Generate a peak-normalised audio clip from a text prompt.

    Uses the module-level ``model``, ``sample_size``, ``sample_rate`` and
    ``device`` set up by the earlier cells, so it runs on whichever device was
    selected there (GPU or CPU).

    Args:
        prompt: Text description of the sound to generate.
        duration: Requested clip length in seconds. It is passed to the model
            as ``seconds_total`` and is also used to trim the result.
        guidance_scale: Classifier-free guidance scale (``cfg_scale``).
        seed: Seed for reproducible output. ``None`` or ``-1`` draws a random
            seed; values above ``2**31 - 1`` are wrapped into range.
        steps: Number of diffusion sampling steps.

    Returns:
        A float32 CPU tensor whose last axis is time, scaled so its peak
        magnitude is 1.0 (unless the clip is effectively silent) and holding
        at most ``duration * sample_rate`` samples along that axis.
    """
    conditioning = [{
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": duration,
    }]

    output = generate_diffusion_cond(
        model,
        steps=steps,
        cfg_scale=guidance_scale,
        conditioning=conditioning,
        sample_size=sample_size,
        sigma_min=0.3,
        sigma_max=500,
        sampler_type="dpmpp-3m-sde",
        device=device,
        seed=_resolve_seed(seed),
    )

    # Fold the leading axis into the last one so the result is 2-D
    # (presumably channels x samples -- confirm against the library docs).
    output = rearrange(output, "b d n -> d (b n)")
    output = output.to(torch.float32).cpu()

    # The sampler is always asked for `sample_size` samples regardless of
    # `duration`, so cut the clip down to the length the caller asked for.
    # A non-positive or over-long target leaves the clip untouched.
    target_samples = int(duration * sample_rate)
    if 0 < target_samples < output.shape[-1]:
        output = output[..., :target_samples]

    # Peak-normalise, skipping near-silent clips to avoid amplifying noise
    # (or dividing by zero).
    peak = output.abs().max().item()
    if peak > 1e-6:
        output = output / peak

    # Final guard so the int16 conversion downstream can never overflow.
    return torch.clamp(output, -1.0, 1.0)


# Test with your prompt.
# The settings are named once so the banner printed below always matches the
# arguments actually passed to generate_audio.
prompt = "distant artillery shelling multiple times"
duration = 30
guidance_scale = 7.0

print(f"Generating: '{prompt}'")
print(f"Duration: {duration}s | Guidance Scale: {guidance_scale}")
print("-" * 50)

# Time only the generation call itself.
start_time = time.time()
audio = generate_audio(prompt, duration=duration, guidance_scale=guidance_scale)
elapsed = time.time() - start_time

print(f"✓ Generated in {elapsed:.1f}s")
print(f"  Shape: {audio.shape}")
print(f"  Min: {audio.min():.6f} | Max: {audio.max():.6f}")

## 5. Save and Verify Generated Audio

In [None]:
# Scale the float waveform to 16-bit PCM sample values.
output_audio = audio.float()
output_scaled = (output_audio.numpy() * 32767.0).astype("int16")

# The tensor handed to torchaudio.save must be 2-D, so a 1-D waveform gets a
# leading axis of length one.
if output_scaled.ndim == 1:
    output_scaled = output_scaled[None, :]

output_path = output_dir / "artillery_shelling.wav"
torchaudio.save(str(output_path), torch.from_numpy(output_scaled), sample_rate)

# Summarise what was written: location, size on disk, and clip length.
print(
    f"✓ Saved to: {output_path}",
    f"  File size: {output_path.stat().st_size / 1024 / 1024:.1f}MB",
    f"  Duration: {output_scaled.shape[1] / sample_rate:.1f}s",
    sep="\n",
)

# Leave the player as the cell's last expression so the notebook renders it.
from IPython.display import Audio
Audio(str(output_path))

## 6. Batch Process Multiple Prompts

In [None]:
import re

# Each entry is one generation job: the text prompt and the clip length in seconds.
prompts = [
    {"text": "distant artillery shelling multiple times", "duration": 30},
    {"text": "heavy rain thunderstorm with wind", "duration": 30},
    {"text": "eerie dark ambient drone sound", "duration": 30},
]

print(f"Batch processing {len(prompts)} prompts...")
print("=" * 60)

results = []        # one record per successfully saved file
failed = []         # prompts whose generation or save raised
used_stems = set()  # filename stems already taken in this batch

for i, item in enumerate(prompts, 1):
    prompt = item["text"]
    duration = item["duration"]

    print(f"\n[{i}/{len(prompts)}] Generating: '{prompt}'")
    print(f"  Duration: {duration}s")

    try:
        start = time.time()
        audio = generate_audio(prompt, duration=duration, guidance_scale=7.0)
        elapsed = time.time() - start

        # Scale the float waveform to 16-bit PCM sample values.
        output_audio = audio.float()
        output_scaled = (output_audio.numpy() * 32767.0).astype("int16")

        # Build a filesystem-safe name: every character that is not a word
        # character or hyphen (spaces, slashes, colons, ...) becomes "_".
        # Fall back to an index-based name for an empty prompt, and add the
        # index when two prompts would otherwise share a file.
        stem = re.sub(r"[^\w\-]", "_", prompt)[:40] or f"audio_{i}"
        if stem in used_stems:
            stem = f"{stem}_{i}"
        used_stems.add(stem)
        safe_filename = stem + ".wav"
        output_path = output_dir / safe_filename

        # torchaudio.save needs a 2-D tensor; promote a 1-D waveform.
        if output_scaled.ndim == 1:
            output_scaled = output_scaled.reshape(1, -1)
        torchaudio.save(str(output_path), torch.from_numpy(output_scaled), sample_rate)

        print(f"  ✓ Generated in {elapsed:.1f}s")
        print(f"  ✓ Saved to: {output_path.name}")

        results.append({"prompt": prompt, "file": output_path.name, "time": elapsed})

    except Exception as e:
        # Best-effort batch: report the failure and continue with the next prompt.
        print(f"  ✗ Error: {e}")
        failed.append(prompt)

    finally:
        # Free cached GPU memory after every attempt, including failed ones,
        # so one failure does not leave less memory for the remaining prompts.
        if device == "cuda":
            torch.cuda.empty_cache()

print("\n" + "=" * 60)
print(f"Batch complete! Generated {len(results)} audio files")
if failed:
    print(f"Failed: {len(failed)} prompt(s)")
if results:
    total_time = sum(r["time"] for r in results)
    print(f"Total time: {total_time:.1f}s")
    print(f"Average: {total_time / len(results):.1f}s per prompt")
print(f"\nOutput directory: {output_dir}")