<a href="https://colab.research.google.com/github/karaage0703/stable-diffusion-colab-tools/blob/main/006_whisper_voice_recognition_stable_diffusion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Whisper Voice Recognition Stable Diffusion

Reference:
- https://gist.github.com/tam17aki/8bfa2a42dab0061ee2641aed32dd1d30
- https://zenn.dev/karaage0703/articles/d47bbb085fcb83

## Record audio from microphone

In [None]:
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode

# JavaScript injected into the Colab output frame. It defines a global
# `record(ms)` that captures microphone audio for `ms` milliseconds and
# resolves to the recording encoded as a base64 data URL.
RECORD = """
const sleep = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  // Read the data URL from the reader itself rather than from the event object.
  reader.onloadend = () => resolve(reader.result)
  reader.readAsDataURL(blob)
})
// An async function (instead of an async Promise executor) lets a failed
// microphone request reject the returned promise rather than leaving it pending.
var record = async time => {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  try {
    const recorder = new MediaRecorder(stream)
    const chunks = []
    recorder.ondataavailable = e => chunks.push(e.data)
    // Register the stop handler before starting so the event cannot be missed.
    const stopped = new Promise(resolve => { recorder.onstop = resolve })
    recorder.start()
    await sleep(time)
    recorder.stop()
    await stopped
    return await b2text(new Blob(chunks))
  } finally {
    // Release the microphone once recording has finished (or failed).
    stream.getTracks().forEach(track => track.stop())
  }
}
"""

def record(sec, filename='audio.wav'):
  """Record `sec` seconds from the browser microphone and save them to `filename`.

  Injects the RECORD JavaScript into the cell output, calls its `record`
  function with the duration in milliseconds, and writes the base64-decoded
  payload of the returned data URL to disk.

  NOTE(review): the decoded bytes are written unchanged, so the file holds
  whatever container the browser produced regardless of the extension — confirm.
  """
  display(Javascript(RECORD))
  duration_ms = sec * 1000
  data_url = output.eval_js('record(%d)' % duration_ms)
  # The base64 payload is the second comma-separated field of the data URL.
  encoded_audio = data_url.split(',')[1]
  audio_bytes = b64decode(encoded_audio)
  with open(filename, 'wb+') as audio_file:
    audio_file.write(audio_bytes)

In [None]:
# Capture a short clip from the microphone; later cells read `audiofile`.
audiofile = "input.wav"
second = 3  # recording length in seconds
print(f"Speak to your microphone {second} sec...")
record(second, audiofile)  # returns once the recording has been written to disk
print("Done!")

In [None]:
# Ignore "UserWarning: PySoundFile failed. Trying audioread instead."
import librosa
import librosa.display
# Load the recording, resampled to 16 kHz.
speech, rate = librosa.load(audiofile, sr=16000)
# `librosa.display.waveplot` was removed in librosa 0.10 in favour of `waveshow`;
# prefer the new name and fall back to the old one on older installs.
plot_waveform = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot
plot_waveform(speech, sr=rate)

## Voice Recognition Test

Install Whisper

In [None]:
# Install Whisper straight from the upstream GitHub repository.
# NOTE(review): no tag or commit is pinned, so each run installs the current default branch.
!pip install -qq git+https://github.com/openai/whisper.git

Test Whisper

In [None]:
import whisper
# Load the 'small' checkpoint once; later cells reuse `whisper_model`.
whisper_model = whisper.load_model('small')
# Alternative checkpoint:
#whisper_model = whisper.load_model('base')

# Transcribe the clip recorded earlier (the path stored in `audiofile`).
# language='ja' with task='translate' treats the speech as Japanese and
# asks Whisper for an English translation.
result = whisper_model.transcribe(audiofile, verbose=True, language='ja', task='translate')
print(result['text'])

## Stable Diffusion Test

### Hugging Face Login

In [None]:
# diffusers is pinned to 0.6.0; transformers and tqdm are left unpinned.
# NOTE(review): consider pinning transformers as well so the pair stays compatible — confirm.
!pip -qq install diffusers==0.6.0
!pip -qq install transformers
!pip install -qq tqdm

from huggingface_hub import notebook_login
# Prompts for a Hugging Face access token; the pipeline setup cell loads the
# model with use_auth_token=True.
notebook_login()

### Setup

In [None]:
import torch
from diffusers import StableDiffusionPipeline
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt

# The pipeline is loaded in half precision and moved to the GPU, so fail early
# with a clear message when no CUDA device is available.
if not torch.cuda.is_available():
    raise RuntimeError(
        "No CUDA device found. In Colab choose Runtime > Change runtime type > GPU."
    )

device = "cuda"
model_id = "CompVis/stable-diffusion-v1-4"

# revision="fp16" with torch_dtype=torch.float16 loads the half-precision
# weights; use_auth_token=True uses the locally stored Hugging Face token.
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    revision="fp16",
    torch_dtype=torch.float16,
    use_auth_token=True,
).to(device)

### Generate Image

In [None]:
# Use the Whisper output as the image prompt.
prompt = result['text']
seed_number = 42  # a negative value makes infer() draw a random seed
num_inference_steps  = 20
guidance_scale_value = 7.5
# infer() sizes its latent tensor as (height_image // 8, width_image // 8).
width_image = 512
height_image = 512

def infer(prompt, seed_number, num_inference_steps, guidance_scale_value, width_image, height_image):
    """Generate one image for `prompt` with the module-level Stable Diffusion `pipe`.

    Args:
        prompt: Text prompt passed to the pipeline.
        seed_number: Seed for the initial latent noise; a negative value
            draws a fresh random seed instead.
        num_inference_steps: Number of inference steps passed to the pipeline.
        guidance_scale_value: Guidance scale passed to the pipeline.
        width_image: Requested image width.
        height_image: Requested image height.

    Returns:
        The first image of the pipeline output.
    """
    generator = torch.Generator(device=device)

    # A negative seed means "pick one for me": Generator.seed() draws a
    # non-deterministic seed. Otherwise honour the caller's seed.
    seed = generator.seed() if seed_number < 0 else seed_number
    generator = generator.manual_seed(seed)

    # Build the starting noise explicitly so the result is determined by the seed.
    # NOTE(review): sizes that are not multiples of 8 are floored here;
    # presumably the pipeline rejects such sizes — confirm.
    image_latent = torch.randn(
        (1, pipe.unet.in_channels, height_image // 8, width_image // 8),
        generator=generator,
        device=device,
    )

    with torch.autocast('cuda'):
        image = pipe(
            [prompt],
            width=width_image,
            height=height_image,
            guidance_scale=guidance_scale_value,
            num_inference_steps=num_inference_steps,
            latents=image_latent,
        ).images[0]

    return image

def draw_image(image):
    """Show `image` on a new 10x10 figure with the axes hidden."""
    _, axes = plt.subplots(figsize=(10, 10))
    axes.imshow(image)
    axes.axis('off')
    plt.show()

# Generate an image from the settings defined at the top of this cell and show it.
image = infer(
    prompt=prompt,
    seed_number=seed_number,
    num_inference_steps=num_inference_steps,
    guidance_scale_value=guidance_scale_value,
    width_image=width_image,
    height_image=height_image,
)

draw_image(image=image)

## Whisper Voice Recognition Stable Diffusion

Voice Recognition

In [None]:
# Record a fresh clip from the microphone.
audiofile = "input.wav"
second = 3  # recording length in seconds
print(f"Speak to your microphone {second} sec...")
record(second, audiofile)
print("Done!")

# Transcribe the file that was just written (the path in `audiofile`, not a
# separately hard-coded name). language='ja' with task='translate' treats the
# speech as Japanese and asks Whisper for an English translation.
result = whisper_model.transcribe(audiofile, verbose=True, language='ja', task='translate')
print(result['text'])

In [None]:
#@title **Whisper Voice Recognition Stable Diffusion**
#@markdown Enter parameters (note: a negative seed such as -1 picks a random seed)

# Use the latest Whisper output as the image prompt.
prompt = result['text']
seed_number = 42 #@param
num_inference_steps  = 20 #@param {type:"slider", min:1, max:200, step:1}
guidance_scale_value = 7.5 #@param {type:"slider", min:1, max:20, step:0.1}
# The size sliders start at 64 so that, with step 8, every position is a
# multiple of 8 and the default of 512 lies on the slider grid.
width_image = 512 #@param {type:"slider", min:64, max:640, step:8}
height_image = 512 #@param {type:"slider", min:64, max:640, step:8}

image = infer(prompt, seed_number, num_inference_steps, guidance_scale_value, width_image, height_image)

draw_image(image)