<a href="https://colab.research.google.com/github/kwb425/class-2024-fall/blob/main/class-2024-fall_1206-0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gradio
!pip install SpeechRecognition
!pip install gtts
from PIL import Image
from gtts import gTTS
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import gradio as gr
import soundfile as sf
import speech_recognition as sr

In [None]:
# Fill mask
# Build a Hugging Face pipeline from the "bert-base-uncased" checkpoint. No
# task is passed here, so the pipeline type is whatever `pipeline` resolves
# for this model -- presumably fill-mask, given how fill_mask() below reads
# 'score' / 'token_str' / 'sequence' from the results (TODO confirm).
fill_mask_model = pipeline(model="bert-base-uncased")

def fill_mask(text):
    """Return the model's candidate fillings for the mask token(s) in *text*.

    Args:
        text: Sentence passed unchanged to ``fill_mask_model``; it is expected
            to contain at least one mask token (e.g. ``[MASK]``).

    Returns:
        For a single mask: a list of dicts with the keys ``score``,
        ``token_str`` and ``sequence``. When the pipeline returns one
        candidate list per mask (several masks in *text*), a list of such
        lists in the same order.

    Any exception raised by the pipeline itself propagates unchanged.
    """
    wanted_keys = ("score", "token_str", "sequence")

    def _summarize(candidate):
        # Keep only the fields shown in the UI, in a stable order.
        return {key: candidate[key] for key in wanted_keys}

    results = fill_mask_model(text)

    # With more than one mask the pipeline yields a list of candidate lists;
    # indexing those with string keys would raise TypeError, so recurse one
    # level instead.
    if results and isinstance(results[0], list):
        return [
            [_summarize(candidate) for candidate in per_mask]
            for per_mask in results
        ]
    return [_summarize(candidate) for candidate in results]

# Wire fill_mask() into a Gradio UI (text input -> JSON output) and start it.
interface = gr.Interface(fn=fill_mask, inputs="text", outputs="json")
interface.launch()

# Example inputs to paste into the text box (each has one [MASK] token):
# The quick brown fox jumps over the deep [MASK].
# In the center of the city, you can find busy [MASK].
# The solar system consists of eight planets, including Earth and [MASK].

In [None]:
## Image explanation (image captioning)
# Build a Hugging Face pipeline from the "ydshieh/vit-gpt2-coco-en"
# checkpoint. No task is passed; generate_caption() below reads
# 'generated_text' from its first result.
caption_generator = pipeline(model="ydshieh/vit-gpt2-coco-en")

def generate_caption(img):
    """Generate a text caption for an image.

    Args:
        img: Image data accepted by ``PIL.Image.fromarray`` (assumed to be the
            NumPy array supplied by the Gradio image input), or ``None`` when
            no image was provided.

    Returns:
        The ``generated_text`` of the first result from ``caption_generator``,
        or an empty string when *img* is ``None``.
    """
    # Submitting the form without an image passes None; fromarray() would
    # fail on it, so return an empty caption instead.
    if img is None:
        return ""

    pil_img = Image.fromarray(img)
    captions = caption_generator(pil_img)
    return captions[0]['generated_text']

# Wire generate_caption() into a Gradio UI (image input -> text output) and
# start it.
interface = gr.Interface(fn=generate_caption, inputs="image", outputs="text")
interface.launch()

In [None]:
## Speech-to-Text (STT)
def stt(audio):
    """Transcribe recorded audio to text.

    Args:
        audio: ``(sample_rate, samples)`` pair as unpacked below, or ``None``
            when nothing was recorded.

    Returns:
        The text returned by ``Recognizer.recognize_google``, or an empty
        string when *audio* is ``None``.

    Errors raised while writing, reading or recognizing the audio propagate
    unchanged.
    """
    import os
    import tempfile

    # Submitting without a recording passes None; unpacking it would raise
    # TypeError.
    if audio is None:
        return ""

    s_rate, audio_data = audio
    recognizer = sr.Recognizer()

    # Use a per-call temporary directory instead of a fixed file name so
    # overlapping requests cannot overwrite each other's audio, and so the
    # WAV file is removed afterwards.
    with tempfile.TemporaryDirectory() as work_dir:
        wav_path = os.path.join(work_dir, "stt_input.wav")
        sf.write(wav_path, audio_data, s_rate)
        with sr.AudioFile(wav_path) as source:
            audio_recorded = recognizer.record(source)

    # The recorded audio is fully in memory; the file is no longer needed.
    return recognizer.recognize_google(audio_recorded)

# Wire stt() into a Gradio UI (microphone input -> text output) and start it.
interface = gr.Interface(fn=stt, inputs="microphone", outputs="text")
interface.launch()


In [None]:
## Text-to-Speech (TTS)
def tts(text):
    """Synthesize *text* to speech and return the path of the audio file.

    Args:
        text: Text to speak.

    Returns:
        Path of the saved audio file, or ``None`` when *text* is empty or
        contains only whitespace (there is nothing to synthesize).
    """
    # gTTS rejects empty input with an assertion error; report "no audio"
    # instead of crashing the request.
    if not text or not text.strip():
        return None

    # gTTS produces MP3 data, so the file gets an .mp3 extension rather than
    # a misleading .wav one.
    output_path = "tts_output.mp3"
    tts_model = gTTS(text)
    tts_model.save(output_path)
    return output_path

# Wire tts() into a Gradio UI (text input -> audio output) and start it.
interface = gr.Interface(fn=tts, inputs="text", outputs="audio")
interface.launch()


In [None]:
## Voice Chatbot
# Load the tokenizer and the causal language model from the same
# "microsoft/DialoGPT-medium" checkpoint; chatbot() below uses both.
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
chat_model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

def stt(audio):
    """Transcribe recorded audio to text.

    Args:
        audio: ``(sample_rate, samples)`` pair as unpacked below, or ``None``
            when nothing was recorded.

    Returns:
        The text returned by ``Recognizer.recognize_google``, or an empty
        string when *audio* is ``None``.

    Errors raised while writing, reading or recognizing the audio propagate
    unchanged.
    """
    import os
    import tempfile

    # Submitting without a recording passes None; unpacking it would raise
    # TypeError.
    if audio is None:
        return ""

    s_rate, audio_data = audio
    recognizer = sr.Recognizer()

    # Use a per-call temporary directory instead of a fixed file name so
    # overlapping requests cannot overwrite each other's audio, and so the
    # WAV file is removed afterwards.
    with tempfile.TemporaryDirectory() as work_dir:
        wav_path = os.path.join(work_dir, "stt_input.wav")
        sf.write(wav_path, audio_data, s_rate)
        with sr.AudioFile(wav_path) as source:
            audio_recorded = recognizer.record(source)

    # The recorded audio is fully in memory; the file is no longer needed.
    return recognizer.recognize_google(audio_recorded)

def tts(text):
    """Synthesize *text* to speech and return the path of the audio file.

    Args:
        text: Text to speak.

    Returns:
        Path of the saved audio file, or ``None`` when *text* is empty or
        contains only whitespace (there is nothing to synthesize).
    """
    # gTTS rejects empty input with an assertion error; a blank reply should
    # yield "no audio" instead of crashing the request.
    if not text or not text.strip():
        return None

    # gTTS produces MP3 data, so the file gets an .mp3 extension rather than
    # a misleading .wav one.
    output_path = "tts_output.mp3"
    tts_model = gTTS(text)
    tts_model.save(output_path)
    return output_path

# No chat history
def chatbot(text):
    """Generate a single-turn reply to *text* (no conversation history).

    The prompt is *text* followed by the tokenizer's end-of-sequence token;
    only the tokens generated after the prompt are decoded and returned.
    """
    prompt = text + tokenizer.eos_token
    prompt_ids = tokenizer.encode(prompt, return_tensors="pt")
    prompt_length = prompt_ids.shape[-1]

    generated = chat_model.generate(
        prompt_ids,
        max_length=1000,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Drop the echoed prompt so only the newly generated reply remains.
    reply_ids = generated[:, prompt_length:][0]
    reply = tokenizer.decode(reply_ids, skip_special_tokens=True)
    return reply

def voice_chatbot(audio):
    """Run one voice chat turn: speech -> text -> reply -> speech.

    Args:
        audio: Recording forwarded to ``stt``, or ``None`` when nothing was
            recorded.

    Returns:
        A ``(user_text, chatbot_text, speech_file)`` tuple matching the three
        outputs of the interface. When *audio* is ``None`` the tuple is
        ``("", "", None)``.
    """
    # Submitting without a recording passes None; skip the whole pipeline
    # rather than failing inside stt().
    if audio is None:
        return "", "", None

    user_text = stt(audio)
    chatbot_text = chatbot(user_text)
    speech_file = tts(chatbot_text)
    return user_text, chatbot_text, speech_file

# Wire voice_chatbot() into a Gradio UI: microphone input, and one output
# component per element of the tuple it returns (in the same order).
interface = gr.Interface(
    fn=voice_chatbot,
    inputs="microphone",
    outputs=[
        gr.Textbox(label="User Text"),
        gr.Textbox(label="Chatbot Response"),
        gr.Audio(label="Chatbot Audio")
    ]
)

# Start the app.
interface.launch()