# 🎵 AI Sound Studio: The Imagination Machine

**Welcome to the Google AI Campus!**

Today we are experimenting with a **VAE (Variational Autoencoder)**. This AI has a "Brain" that can listen to sounds, compress them into a thought (Latent Space), and speak them back.

### 🎛️ You have two controls:
1.  **The Input:** You can feed it a real sound, or tell it to "imagine" a new one from scratch.
2.  **The Pitch (f0):** You control the pitch of the AI's voice.
    * **Positive Numbers:** Normal pitch (High/Low).
    * **Negative Numbers:** ⚠️ **DANGER ZONE.** The AI was never taught negative pitch. If you go here, it might "hallucinate" strange sounds!

In [1]:
# @title 🛠️ Teacher Setup (Run this first!)
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import librosa
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import Audio, display, clear_output
import glob
import shutil

print("‚è≥ Initializing AI Studio...")

# 1. Download Student Audio Samples (From the provided Zip link)
ZIP_ID = "1NGwhaEbZ4LTaQ3qS6ph9MdJLeUu6hNDo"
AUDIO_DIR = '/content/student_samples'

if not os.path.exists(AUDIO_DIR):
    print("‚¨áÔ∏è Downloading Student Audio Samples...")
    !gdown $ZIP_ID -O student_samples.zip
    print("üì¶ Unzipping files...")
    !unzip -q student_samples.zip -d $AUDIO_DIR
    print("‚úÖ Audio Ready.")
else:
    print("‚úÖ Audio files already loaded.")

# 2. Download Model Weights
WEIGHTS_PATH = 'vae_model_state_dict.pth'
if not os.path.exists(WEIGHTS_PATH):
    print("‚¨áÔ∏è Downloading AI Brain Weights...")
    !gdown 1-1fxTb3yl22ZWsSxmnWY6n8WTKpYTeAS -O $WEIGHTS_PATH
else:
    print("‚úÖ Weights file found.")

# 3. Define VAE Architecture
device = 'cuda' if torch.cuda.is_available() else 'cpu'


class VAE(nn.Module):
    """Fully connected variational autoencoder with an f0-conditioned decoder.

    The encoder maps a flattened input of size ``x_dim`` to the mean and
    log-variance of a ``z_dim``-dimensional latent. The decoder receives the
    latent concatenated with a 4-dimensional embedding of a scalar f0 value
    and maps it back to ``x_dim`` non-negative outputs.

    Args:
        x_dim: Size of the flattened input (and of the reconstruction).
        h_dim1: Width of the outer hidden layer (encoder first, decoder last).
        h_dim2: Width of the inner hidden layer.
        z_dim: Size of the latent vector.
    """

    def __init__(self, x_dim, h_dim1, h_dim2, z_dim):
        super().__init__()
        # Remembered so forward() can flatten to the size this instance was
        # built for. A plain attribute, so it is not part of the state dict.
        self.x_dim = x_dim
        # Encoder
        self.fc1 = nn.Linear(x_dim, h_dim1)
        self.fc2 = nn.Linear(h_dim1, h_dim2)
        self.fc31 = nn.Linear(h_dim2, z_dim)  # Mean
        self.fc32 = nn.Linear(h_dim2, z_dim)  # Log variance
        # Decoder: input is the latent plus the 4-d f0 embedding from fcf0.
        self.fc4 = nn.Linear(z_dim+4, h_dim2)
        self.fcf0 = nn.Linear(1, 4)
        self.fc5 = nn.Linear(h_dim2, h_dim1)
        self.fc6 = nn.Linear(h_dim1, x_dim)

    def encoder(self, x):
        """Return ``(mu, log_var)`` for a batch of flattened inputs."""
        h = F.relu(self.fc1(x))
        h = F.relu(self.fc2(h))
        return self.fc31(h), self.fc32(h)

    def sampling(self, mu, log_var):
        """Draw ``mu + eps * std`` with ``eps ~ N(0, I)`` (reparameterisation)."""
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return eps.mul(std).add_(mu)

    def decoder(self, z, f0):
        """Decode latent ``z`` conditioned on ``f0``.

        ``f0`` must have a trailing dimension of 1 (it feeds ``fcf0``); its
        embedding is concatenated to ``z`` along the last axis. The final
        ReLU keeps every output value >= 0.
        """
        z = torch.cat((z, self.fcf0(f0)), -1)
        h = F.relu(self.fc4(z))
        h = F.relu(self.fc5(h))
        return F.relu(self.fc6(h))

    def forward(self, x):
        """Encode, sample and decode.

        Args:
            x: A pair ``(x_in, f0)``. ``x_in`` is reshaped to
                ``(-1, x_dim)`` before encoding.

        Returns:
            ``(reconstruction, mu, log_var)``.
        """
        x_in, f0 = x
        # Flatten to this model's own input size rather than a fixed 513*460.
        mu, log_var = self.encoder(x_in.view(-1, self.x_dim))
        z = self.sampling(mu, log_var)
        return self.decoder(z, f0), mu, log_var

# 4. Instantiate Models
print("üß† Building Brains...")

# Both robots share one architecture; only the weights differ.
model_dims = dict(x_dim=513*460, h_dim1=512, h_dim2=256, z_dim=64)

# Trained model. If loading fails the error is only printed, so vae_pro
# then keeps its random initial weights.
# NOTE(review): torch.load unpickles the downloaded file; consider passing
# weights_only=True if the installed torch version supports it.
vae_pro = VAE(**model_dims).to(device)
try:
    state_dict = torch.load(WEIGHTS_PATH, map_location=device)
    vae_pro.load_state_dict(state_dict)
    print("‚úÖ Trained Robot Loaded.")
except Exception as e:
    print(f"‚ùå Error loading weights: {e}")

# Untrained model: same shape, never loaded with weights.
vae_novice = VAE(**model_dims).to(device)
print("‚úÖ Untrained Robot Initialized.")

# 5. Helpers
# Shared audio/STFT settings used by the helper functions below.
HOP_LENGTH = 512      # hop_length passed to librosa.stft / librosa.griffinlim
N_FFT = 1024          # n_fft passed to librosa.stft / librosa.griffinlim
TARGET_FRAMES = 460   # number of spectrogram columns every clip is forced to
SR = 48000            # sample rate requested from librosa.load

def process_audio(file_path, f0_val):
    """Load a clip and build the tensors the VAE consumes.

    The waveform is loaded at SR, then cropped or zero-padded to exactly
    (TARGET_FRAMES - 1) * HOP_LENGTH samples. Its magnitude STFT is forced
    to TARGET_FRAMES columns and divided by its own peak value.

    Returns:
        A 4-tuple ``(y, inp, f0_tensor, scale)``:
        y          -- the cropped/padded waveform (NumPy array)
        inp        -- normalised magnitude spectrogram as a float tensor on
                      ``device``, flattened to shape (1, -1)
        f0_tensor  -- ``f0_val`` as a (1, 1) float tensor on ``device``
        scale      -- spectrogram peak plus 1e-8, the divisor used above
    """
    waveform, _ = librosa.load(file_path, sr=SR)

    # Fix the clip length: drop anything past the target, zero-pad the rest.
    n_samples = (TARGET_FRAMES - 1) * HOP_LENGTH
    waveform = waveform[:n_samples]
    shortfall = n_samples - len(waveform)
    if shortfall > 0:
        waveform = np.pad(waveform, (0, shortfall))

    # Magnitude spectrogram, forced to exactly TARGET_FRAMES columns.
    magnitude = np.abs(
        librosa.stft(waveform, n_fft=N_FFT, hop_length=HOP_LENGTH)
    )
    magnitude = magnitude[:, :TARGET_FRAMES]
    missing_frames = TARGET_FRAMES - magnitude.shape[1]
    if missing_frames > 0:
        magnitude = np.pad(magnitude, ((0, 0), (0, missing_frames)))

    # Peak-normalise; the epsilon avoids dividing by zero on silent clips.
    scale = np.max(magnitude) + 1e-8
    normalised = magnitude / scale

    inp = torch.tensor(normalised).float().to(device).reshape(1, -1)
    f0_tensor = torch.tensor([[float(f0_val)]]).float().to(device)

    return waveform, inp, f0_tensor, scale

def reconstruct_audio(model, inp, f0_tensor, scale):
    """Run ``inp`` through the full VAE and return the result as a waveform.

    The model is called on the pair ``(inp, f0_tensor)`` with gradients
    disabled; only the reconstruction (first of its three outputs) is used.
    ``scale`` is forwarded to ``decode_to_wave``.
    """
    with torch.no_grad():
        reconstruction, _mu, _log_var = model((inp, f0_tensor))
    waveform = decode_to_wave(reconstruction, scale)
    return waveform

def generate_from_scratch(model, f0_val, scale=100.0):
    """Decode a random latent vector into a waveform.

    Args:
        model: A VAE instance; its ``decoder`` is called directly, the
            encoder is not used.
        f0_val: Pitch value handed to the decoder as a (1, 1) float tensor.
        scale: Factor forwarded to ``decode_to_wave``.

    Returns:
        The waveform produced by ``decode_to_wave``.
    """
    # fc31 is the encoder's mean head, so its output width is the model's
    # latent size. Reading it here avoids a hard-coded 64 that would break
    # for a model built with a different z_dim.
    latent_dim = model.fc31.out_features
    with torch.no_grad():
        z = torch.randn(1, latent_dim).to(device)
        f0_tensor = torch.tensor([[float(f0_val)]]).float().to(device)
        recon = model.decoder(z, f0_tensor)
    return decode_to_wave(recon, scale)

def decode_to_wave(tensor_out, scale):
    """Turn a flattened magnitude spectrogram tensor into a waveform.

    Args:
        tensor_out: Tensor holding (N_FFT // 2 + 1) * TARGET_FRAMES values,
            e.g. the decoder output for a single item.
        scale: Factor the spectrogram is multiplied by before inversion.

    Returns:
        The waveform estimated by ``librosa.griffinlim``.
    """
    # Number of frequency rows follows from the FFT size (513 for 1024).
    n_bins = N_FFT // 2 + 1
    # detach() lets this work even when the tensor still tracks gradients;
    # reshape() also accepts non-contiguous input, unlike view().
    spec = tensor_out.detach().reshape(n_bins, TARGET_FRAMES).cpu().numpy()
    spec = spec * scale
    y_inv = librosa.griffinlim(spec, n_fft=N_FFT, hop_length=HOP_LENGTH)
    return y_inv

# Printed last, once every statement in this setup cell has executed.
print("‚ú® Ready for Class!")

‚è≥ Initializing AI Studio...
‚¨áÔ∏è Downloading Student Audio Samples...
Downloading...
From: https://drive.google.com/uc?id=1NGwhaEbZ4LTaQ3qS6ph9MdJLeUu6hNDo
To: /content/student_samples.zip
100% 6.55M/6.55M [00:00<00:00, 71.5MB/s]
üì¶ Unzipping files...
‚úÖ Audio Ready.
‚¨áÔ∏è Downloading AI Brain Weights...
Downloading...
From (original): https://drive.google.com/uc?id=1-1fxTb3yl22ZWsSxmnWY6n8WTKpYTeAS
From (redirected): https://drive.google.com/uc?id=1-1fxTb3yl22ZWsSxmnWY6n8WTKpYTeAS&confirm=t&uuid=c554b3f1-7f3a-4b9c-b97f-e4f4111beae9
To: /content/vae_model_state_dict.pth
100% 969M/969M [00:11<00:00, 80.9MB/s]
üß† Building Brains...
‚úÖ Trained Robot Loaded.
‚úÖ Untrained Robot Initialized.
‚ú® Ready for Class!


In [3]:
# @title 🎛️ AI Dashboard

# --- Setup Files ---
AUDIO_DIR = '/content/student_samples'

# Collect every .wav and .mp3 below AUDIO_DIR (any depth), sorted by path.
files = sorted(
    path
    for extension in ('wav', 'mp3')
    for path in glob.glob(f"{AUDIO_DIR}/**/*.{extension}", recursive=True)
)

# Fallback: with no samples available, write two seconds of uniform noise
# so the dashboard still has one entry to offer.
if len(files) == 0:
    print("‚ö†Ô∏è No files found in zip? Creating dummy.")
    os.makedirs('demo', exist_ok=True)
    dummy = 'demo/synth.wav'
    import soundfile as sf
    sf.write(dummy, np.random.uniform(-0.5,0.5, 48000*2), 48000)
    files = [dummy]

# Dropdown labels are paths relative to AUDIO_DIR (Folder/File), so files
# with the same name in different folders stay distinguishable.
names = [os.path.relpath(path, AUDIO_DIR) for path in files]
file_map = {label: path for label, path in zip(names, files)}

# --- UI Widgets ---
# Mode selector: rebuild an existing file, or decode a random latent.
mode_toggle = widgets.ToggleButtons(
    options=['Reconstruct Sound', 'Generate from Scratch'],
    description='Mode:',
    button_style=''
)

# File picker; entries are the relative-path labels in `names`.
file_dd = widgets.Dropdown(options=names, description='üìÇ File:', disabled=False, layout=widgets.Layout(width='500px'))

# Pitch slider: -200 to 800 in steps of 10, starting at 200. The range
# deliberately includes negative values (see the warning in run_model).
f0_slider = widgets.FloatSlider(
    value=200.0, min=-200.0, max=800.0, step=10.0,
    description='Pitch (f0):',
    continuous_update=False,
    orientation='horizontal',
    layout=widgets.Layout(width='500px')
)

# Action buttons and the output area that all handlers render into.
btn_orig = widgets.Button(description='‚ñ∂Ô∏è Play Original', button_style='info')
btn_untrained = widgets.Button(description='üé≤ Ask Untrained Robot', button_style='warning')
btn_trained = widgets.Button(description='üß† Ask Trained Robot', button_style='success')
out = widgets.Output()

# --- Interaction Logic ---
def on_mode_change(change):
    """Enable or disable the file controls to match the selected mode.

    In 'Generate from Scratch' mode no input file is used, so the file
    dropdown and the 'Play Original' button are disabled; any other mode
    re-enables both.
    """
    generating = change['new'] == 'Generate from Scratch'
    file_dd.disabled = generating
    btn_orig.disabled = generating

# Call the handler whenever the toggle's value changes.
mode_toggle.observe(on_mode_change, names='value')

def show_plot(y, title):
    """Plot the dB-scaled magnitude spectrogram of waveform ``y``.

    Args:
        y: Waveform to analyse.
        title: Text placed above the plot.
    """
    # Imported explicitly: a bare `import librosa` does not load the
    # display submodule in every librosa release.
    import librosa.display

    fig, ax = plt.subplots(figsize=(8, 2))
    # Use the same STFT settings as the audio helpers so the time and
    # frequency axes match what the model actually processed.
    magnitude = np.abs(librosa.stft(y, n_fft=N_FFT, hop_length=HOP_LENGTH))
    D = librosa.amplitude_to_db(magnitude, ref=np.max)
    librosa.display.specshow(D, sr=SR, hop_length=HOP_LENGTH, x_axis='time', y_axis='hz', ax=ax, cmap='magma')
    ax.set_title(title)
    plt.show()

def on_orig(b):
    """Button handler: play and plot the currently selected original clip.

    Clears the output area first. Does nothing further in
    'Generate from Scratch' mode, and asks for a selection when the
    dropdown has no value.
    """
    with out:
        clear_output()

        if mode_toggle.value == 'Generate from Scratch':
            return

        selection = file_dd.value
        if not selection:
            print("Please select a file first.")
            return

        print(f"‚ñ∂Ô∏è Playing Original: {selection}...")
        # Only the waveform is needed here; the f0 argument (0) does not
        # influence it.
        waveform = process_audio(file_map[selection], 0)[0]
        display(Audio(waveform, rate=48000))
        show_plot(waveform, "Original Sound")

def run_model(model, model_name):
    """Run ``model`` in the selected mode, then play and plot its output.

    Reads the pitch from the slider and prints a warning when it is
    negative. In 'Reconstruct Sound' mode the chosen file is passed through
    the whole model; in any other mode a sound is generated from a random
    latent with a fixed scale of 100.0. ``model_name`` is used only in the
    printed messages and the plot title.
    """
    with out:
        clear_output()
        pitch = f0_slider.value

        if pitch < 0:
            print("‚ö†Ô∏è WARNING: Negative Pitch! Entering Hallucination Zone... üëª")

        reconstructing = mode_toggle.value == 'Reconstruct Sound'

        # Reconstruction needs an input file; stop early without one.
        if reconstructing and not file_dd.value:
            print("Please select a file.")
            return

        if reconstructing:
            print(f"{model_name} is listening to the file and rebuilding it at Pitch={pitch}...")
            source_path = file_map[file_dd.value]
            _, inp, f0, scale = process_audio(source_path, pitch)
            y_out = reconstruct_audio(model, inp, f0, scale)
        else:
            print(f"{model_name} is imagining a new sound from scratch at Pitch={pitch}...")
            y_out = generate_from_scratch(model, pitch, scale=100.0)

        display(Audio(y_out, rate=48000))
        show_plot(y_out, f"{model_name} Output")

# Link buttons
def _ask_untrained(_button):
    """Click handler for the untrained-model button."""
    run_model(vae_novice, "Untrained Robot")


def _ask_trained(_button):
    """Click handler for the trained-model button."""
    run_model(vae_pro, "Trained Robot")


btn_orig.on_click(on_orig)
btn_untrained.on_click(_ask_untrained)
btn_trained.on_click(_ask_trained)

# --- Layout ---
# One column: heading, mode toggle, settings, the row of action buttons,
# then the shared output area.
panel_children = [
    widgets.HTML("<h3>üéõÔ∏è Control Panel</h3>"),
    mode_toggle,
    widgets.HTML("<br><b>1. Settings:</b>"),
    file_dd,
    f0_slider,
    widgets.HTML("<br><b>2. Actions:</b>"),
    widgets.HBox([btn_orig, btn_untrained, btn_trained]),
    out,
]
ui = widgets.VBox(panel_children)
display(ui)

VBox(children=(HTML(value='<h3>üéõÔ∏è Control Panel</h3>'), ToggleButtons(description='Mode:', options=('Reconstru‚Ä¶