# EGN6217 P2/P3 Gradio Demo: Speech-to-Image Generation

In [None]:
!pip install transformers diffusers datasets jiwer gradio

Collecting gradio
  Downloading gradio-5.27.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.9.0 (from gradio)
  Downloading gradio_client-1.9.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (

In [None]:
import os
import re
import ast
import hashlib
import warnings
import random
import requests
import torch
import torch.nn.functional as F
import torchaudio
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import GradScaler, autocast
from torchvision import transforms
from datasets import load_dataset
from tqdm import tqdm
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, CLIPTokenizer
from diffusers import StableDiffusionPipeline
from PIL import Image
from io import BytesIO
import jiwer
import gradio as gr
from IPython.display import clear_output, Audio, display
from google.colab import drive
import textwrap

In [None]:
# attach Google Drive so the models and test dataset saved under MyDrive can be loaded
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Wav2Vec2 speech-to-text setup
# the processor is fetched by Hub id; the CTC model weights are read from Drive
WAV2VEC2_MODEL_DIR = "/content/drive/MyDrive/Colab Data/egn6217-p3/wav2vec2_model"

processor = Wav2Vec2Processor.from_pretrained(
    "facebook/wav2vec2-base-960h",
    do_normalize=True,
)
wav2vec2_model = Wav2Vec2ForCTC.from_pretrained(WAV2VEC2_MODEL_DIR)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [None]:
# Stable Diffusion v1.5 text-to-image setup

# silence every Python warning raised from this point on
warnings.filterwarnings("ignore")

DIFFUSION_PIPE_DIR = "/content/drive/MyDrive/Colab Data/egn6217-p3/diffusion_pipe"

# the CLIP tokenizer is part of the Stable Diffusion v1.5 architecture; it is
# loaded separately here so that collate_fn can tokenize batches of descriptions
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

# pipeline is read from Drive in float32 (torch.float16 is the alternative noted by the author)
diffusion_pipe = StableDiffusionPipeline.from_pretrained(
    DIFFUSION_PIPE_DIR,
    torch_dtype=torch.float32,
)

# loss function: mean squared error
loss_fn = F.mse_loss

# Dataset and DataLoader

class FaceCaption15MDataset(Dataset):
    """Image descriptions paired with images that are loaded from disk on demand.

    Only the image file paths are kept on the instance; each image is read with
    ``torch.load`` at the moment its index is requested.
    """

    def __init__(self, xs, ts_paths):
        """Keep the descriptions (``xs``) and the parallel list of image paths (``ts_paths``)."""
        self.xs, self.ts_paths = xs, ts_paths

    def __len__(self):
        """Return the number of samples, i.e. the number of descriptions."""
        return len(self.xs)

    def __getitem__(self, idx):
        """Return ``(description, image)`` for sample ``idx``, loading the image from its path."""
        description = self.xs[idx]
        image = torch.load(self.ts_paths[idx])
        return description, image

def collate_fn(batch):
    """Collate ``(description, image)`` samples into one tokenized, stacked batch.

    Descriptions are tokenized per batch with the module-level ``tokenizer``
    (padded to the longest item, truncated at 77 tokens, returned as PyTorch
    tensors); the images are stacked along a new leading dimension.

    Returns:
        ``(tokenized_descriptions, stacked_images)``
    """
    descriptions, images = zip(*batch)
    token_batch = tokenizer(
        descriptions,
        padding="longest",
        truncation=True,
        max_length=77,
        return_tensors="pt",
    )
    return token_batch, torch.stack(images, dim=0)

# NOTE(review): weights_only=False unpickles arbitrary Python objects -- only load files you created yourself
TEST_DATASET_PATH = "/content/drive/MyDrive/Colab Data/egn6217-p3/test_dataset.pth"
test_dataset = torch.load(TEST_DATASET_PATH, weights_only=False)

# more workers and pinned memory give faster loading at the cost of more memory
test_loader = DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=8,
    pin_memory=True,
    collate_fn=collate_fn,
)

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
def make_test_gradio(processor, wav2vec2_model, diffusion_pipe):
    """Build the Gradio callback that turns a recording into a transcript and an image.

    The models are captured in a closure so the callback handed to Gradio only
    needs the audio argument.

    Args:
        processor: Wav2Vec2 processor used to featurize audio and decode token ids.
        wav2vec2_model: CTC speech-recognition model, already placed on its device.
        diffusion_pipe: text-to-image pipeline, already placed on its device.

    Returns:
        A function ``test_gradio(audio) -> (transcript, image)``.
    """
    target_sr = 16000  # sampling rate declared to the processor

    def test_gradio(audio):
        """Transcribe ``audio`` and generate an image from the transcript.

        Args:
            audio: ``(sample_rate, samples)`` tuple as delivered by
                ``gr.Audio(type="numpy")``, or ``None`` when nothing was recorded.

        Returns:
            ``(transcript, image)``; ``("", None)`` when there is no audio to process.
        """
        # Gradio passes None when the user submits without recording anything
        if audio is None:
            return "", None

        # speech-to-text

        # preprocess audio
        sr, samples = audio
        wave = np.asarray(samples).astype(np.float32).squeeze()
        if wave.ndim == 2:
            # (samples, channels): downmix to mono, otherwise the processor would
            # interpret the 2-D array as a batch of very short clips
            wave = wave.mean(axis=1)
        wave = np.atleast_1d(wave)
        if wave.size == 0:
            return "", None
        if sr != target_sr:
            # the recording is not guaranteed to arrive at 16 kHz; resample instead
            # of mislabelling it, which would distort the transcription
            wave = librosa.resample(wave, orig_sr=sr, target_sr=target_sr)

        features = processor(
            wave,
            padding=True,
            sampling_rate=target_sr,
            return_attention_mask=True,
            return_tensors="pt",
        )
        # send inputs to wherever the model actually lives rather than guessing from CUDA availability
        device = wav2vec2_model.device
        attention_mask = features["attention_mask"].to(device)
        input_values = features["input_values"].to(device)

        # forward propagation (inference only, so no autograd graph is needed)
        wav2vec2_model.eval()
        with torch.no_grad():
            outputs = wav2vec2_model(input_values, attention_mask=attention_mask)

        # decode predictions
        pred_ids = torch.argmax(outputs.logits, dim=-1)
        pred_transcript = processor.batch_decode(pred_ids, skip_special_tokens=True, group_tokens=True)[0]

        # text-to-image
        img = diffusion_pipe(prompt=pred_transcript, guidance_scale=15).images[0]

        return pred_transcript, img

    return test_gradio

# place both models on the GPU once (CPU fallback), then build the Gradio callback around them

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
wav2vec2_model = wav2vec2_model.to(device)
diffusion_pipe = diffusion_pipe.to(device)

test_gradio = make_test_gradio(
    processor=processor,
    wav2vec2_model=wav2vec2_model,
    diffusion_pipe=diffusion_pipe,
)

In [None]:
# input component: microphone recording, handed to the callback as (sample_rate, numpy array)
audio_input = gr.Audio(
    sources=["microphone"],
    type="numpy",
    label="Record your audio",
    waveform_options=gr.WaveformOptions(sample_rate=16000),
)

# output components, in the order test_gradio returns its values
transcript_output = gr.Textbox(label="Transcription")
image_output = gr.Image(label="Generated Image")

interface = gr.Interface(
    fn=test_gradio,
    inputs=audio_input,
    outputs=[transcript_output, image_output],
    title="Speech-to-Image Generation",
    description="Record audio to generate both a transcript and an AI-generated image from the transcript.",
)

# launch the interface
interface.launch(inline=False, inbrowser=True, debug=True)