In [1]:
!pip install gradio ultralytics git+https://github.com/suno-ai/bark.git

Collecting git+https://github.com/suno-ai/bark.git
  Cloning https://github.com/suno-ai/bark.git to /tmp/pip-req-build-nfhkqjn1
  Running command git clone --filter=blob:none --quiet https://github.com/suno-ai/bark.git /tmp/pip-req-build-nfhkqjn1
  Resolved https://github.com/suno-ai/bark.git to commit f4f32d4cd480dfec1c245d258174bc9bde3c2148
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting gradio
  Downloading gradio-5.9.1-py3-none-any.whl.metadata (16 kB)
Collecting ultralytics
  Downloading ultralytics-8.3.54-py3-none-any.whl.metadata (35 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
C

***Import Necessary Libraries***

In [2]:
import cv2
import torch
import numpy as np
from ultralytics import YOLO
import gradio as gr
from bark import SAMPLE_RATE, generate_audio, preload_models
from scipy.io.wavfile import write as write_wav
import os
import time
import random
import json

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


***Function to load medical phrases from JSON***

In [3]:
def load_medical_phrases(file_path):
    """
    Load the word -> phrases mapping from a JSON file.

    Args:
        file_path: Path to a JSON file whose top-level value is an object
            (decoded as a ``dict``).

    Returns:
        The decoded ``dict``. An empty ``dict`` is returned, after printing a
        diagnostic, when the file cannot be opened or decoded, or when its
        top-level value is not a JSON object.
    """
    try:
        # Explicit encoding: the platform default is not guaranteed to be UTF-8.
        with open(file_path, 'r', encoding='utf-8') as file:
            phrases = json.load(file)
    except (OSError, ValueError, TypeError) as e:
        # OSError: missing/unreadable file. ValueError: covers
        # json.JSONDecodeError and UnicodeDecodeError. TypeError: bad path type.
        print(f"Error loading medical phrases: {e}")
        return {}

    # Callers use the result as a mapping (.keys(), .get()); reject anything else
    # here instead of failing later with an unrelated AttributeError.
    if not isinstance(phrases, dict):
        print(
            "Error loading medical phrases: expected a JSON object, "
            f"got {type(phrases).__name__}"
        )
        return {}

    return phrases

***Function to generate speech using Bark***

In [4]:
def generate_speech(text, audio_dir="generated_audio"):
    """
    Synthesize ``text`` with Bark and save the result as a WAV file.

    Args:
        text: Phrase to speak. Empty or whitespace-only text is skipped.
        audio_dir: Directory the WAV file is written to; created if missing.

    Returns:
        Path of the written ``.wav`` file, or ``None`` when there is nothing
        to speak.
    """
    # Nothing worth sending to the TTS model.
    if not text or not str(text).strip():
        return None

    audio_array = generate_audio(
        text,
        history_prompt="v2/en_speaker_6",
        text_temp=0.5,
        waveform_temp=0.6
    )

    os.makedirs(audio_dir, exist_ok=True)
    # Nanosecond stamp: with whole seconds, two requests inside the same second
    # would get the same file name and the later one would overwrite the earlier.
    audio_path = os.path.join(audio_dir, f"speech_{time.time_ns()}.wav")
    write_wav(audio_path, SAMPLE_RATE, audio_array)
    return audio_path

***ASL Medical Recognizer Class***

In [5]:
class ASLMedicalRecognizer:
    """
    Detect ASL letters with a YOLO model and map them to medical words/phrases.

    Detected letters accumulate in ``prediction_buffer``; the buffered letters,
    joined and lowercased, are used as a prefix to look up candidate words in
    the phrases mapping.
    """

    def __init__(self, model_path, confidence_threshold=0.5, buffer_size=5,
                 phrases_path="/content/phrases.json"):
        """
        Initialize the ASL Medical Word Recognizer with contextual TTS

        Args:
            model_path: Path to the YOLO weights file.
            confidence_threshold: Minimum confidence for a detection to count.
            buffer_size: The letter buffer keeps at most ``buffer_size * 3``
                of the most recent letters.
            phrases_path: JSON file mapping each medical word to its phrases.
        """
        self.model = YOLO(model_path)
        self.confidence_threshold = confidence_threshold
        # Class index -> letter. Assumes the model's class ids follow this
        # order -- TODO confirm against the trained weights.
        self.labels = [
            'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
            'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
        ]

        self.medical_phrases = load_medical_phrases(phrases_path)
        self.medical_words = set(self.medical_phrases.keys())
        self.prediction_buffer = []
        self.buffer_size = buffer_size

        # Initialize Bark TTS
        print("Loading Bark TTS models...")
        preload_models()
        print("Bark TTS models loaded successfully!")

    @staticmethod
    def _swap_red_blue(image):
        """Return a contiguous copy of a 3-channel array with channels 0 and 2 swapped."""
        if isinstance(image, np.ndarray) and image.ndim == 3 and image.shape[2] == 3:
            return np.ascontiguousarray(image[:, :, ::-1])
        # Anything else (None, grayscale, RGBA, ...) is passed through untouched.
        return image

    def detect(self, frame):
        """
        Perform detection on a single frame

        Args:
            frame: Image handed straight to ``YOLO.predict`` (Ultralytics
                interprets numpy arrays as BGR).

        Returns:
            ``(annotated_frame, detected_letters)``: the frame with detections
            drawn on it, and the list of letters whose confidence exceeds the
            threshold.
        """
        results = self.model.predict(
            source=frame,
            conf=self.confidence_threshold,
            # First GPU when one is present; otherwise run on the CPU instead
            # of failing on a CPU-only runtime.
            device=0 if torch.cuda.is_available() else "cpu"
        )

        detected_letters = []
        for result in results:
            boxes = result.boxes
            if boxes is None:
                continue
            for box in boxes:
                cls = int(box.cls[0])
                conf = float(box.conf[0])
                # Ignore class ids this recognizer has no letter for rather
                # than raising IndexError.
                if conf > self.confidence_threshold and 0 <= cls < len(self.labels):
                    detected_letters.append(self.labels[cls])

        annotated_frame = results[0].plot() if results else frame
        return annotated_frame, detected_letters

    def find_potential_medical_words(self, letters):
        """
        Find potential medical words from a sequence of letters

        Args:
            letters: Sequence of detected letters; joined and lowercased to
                form the search prefix.

        Returns:
            Up to five known words starting with that prefix (case-insensitive),
            in alphabetical order. Empty list when ``letters`` is empty.
        """
        prefix = ''.join(letters).lower()

        if not prefix:
            return []

        # medical_words is a set, so sort before slicing: otherwise which five
        # candidates survive would vary from run to run.
        potential_matches = sorted(
            (word for word in self.medical_words if word.lower().startswith(prefix)),
            key=lambda word: (word.lower(), word)
        )

        return potential_matches[:5]

    def get_contextual_phrase(self, word):
        """
        Get a contextual phrase for the selected medical word

        Returns:
            A randomly chosen phrase for ``word``; the phrase itself when the
            mapping holds a single string; ``word`` when it is unknown or has
            no phrases.
        """
        phrases = self.medical_phrases.get(word)
        if isinstance(phrases, str):
            # A bare string must not be sampled character by character.
            return phrases
        if phrases:
            return random.choice(phrases)
        return word

    def process_captured_image(self, image):
        """
        Process a captured image and return results

        Args:
            image: RGB image array as delivered by the Gradio image component.

        Returns:
            ``(annotated_frame, detected_letters, potential_words, word_phrases)``.
            The last three are empty when no letter was detected.
        """
        # Gradio supplies RGB while Ultralytics reads numpy input as BGR and
        # plots in BGR: convert on the way in and back on the way out.
        annotated_frame, detected_letters = self.detect(self._swap_red_blue(image))
        annotated_frame = self._swap_red_blue(annotated_frame)

        if detected_letters:
            self.prediction_buffer.extend(detected_letters)

            # Keep only the most recent letters so the prefix cannot grow forever.
            max_letters = self.buffer_size * 3
            if len(self.prediction_buffer) > max_letters:
                self.prediction_buffer = self.prediction_buffer[-max_letters:]

            potential_words = self.find_potential_medical_words(self.prediction_buffer)

            # Get phrases for all potential words
            word_phrases = {word: self.get_contextual_phrase(word) for word in potential_words}

            return annotated_frame, detected_letters, potential_words, word_phrases

        return annotated_frame, [], [], {}

    def reset_buffers(self):
        """
        Reset prediction buffer
        """
        self.prediction_buffer.clear()

***Function to create the Gradio interface***

In [6]:
def create_asl_interface(model_path):
    """
    Create Gradio interface for ASL medical communication

    Args:
        model_path: Path to the YOLO weights handed to ``ASLMedicalRecognizer``.

    Returns:
        The assembled ``gr.Blocks`` app (not launched).
    """
    recognizer = ASLMedicalRecognizer(model_path)

    def build_word_dropdown(words=None):
        """Word dropdown update; the first word, if any, is preselected."""
        words = list(words or [])
        return gr.Dropdown(
            choices=words,
            value=words[0] if words else None,
            interactive=True,
            label="Select the detected word"
        )

    def build_phrase_dropdown(phrases=None):
        """Phrase dropdown update; the first phrase, if any, is preselected."""
        if isinstance(phrases, str):
            # A single phrase stored as a bare string is one choice, not a
            # sequence of characters.
            phrases = [phrases]
        phrases = list(phrases or [])
        return gr.Dropdown(
            choices=phrases,
            value=phrases[0] if phrases else None,
            interactive=True,
            label="Select the phrase to speak"
        )

    def update_phrases(selected_word):
        """
        Update phrase dropdown based on the selected word
        """
        if not selected_word:
            return build_phrase_dropdown()
        return build_phrase_dropdown(recognizer.medical_phrases.get(selected_word, []))

    def webcam_predict(image):
        """
        Process webcam input and update interface components

        Returns one value per output component:
        (image, detection text, word dropdown, phrase dropdown).
        """
        if image is None:
            # Exactly four values: the click handler is wired to four outputs,
            # and returning a different number makes Gradio raise.
            return None, "No image captured", build_word_dropdown(), build_phrase_dropdown()

        annotated_frame, detected_letters, potential_words, _ = (
            recognizer.process_captured_image(image)
        )
        detection_text = f"Detected Letters: {', '.join(detected_letters)}"

        # The first candidate word is preselected, so show its phrases right
        # away instead of leaving the phrase list empty.
        preselected_word = potential_words[0] if potential_words else None
        return (
            annotated_frame,
            detection_text,
            build_word_dropdown(potential_words),
            update_phrases(preselected_word),
        )

    def speak_selected_phrase(selected_phrase):
        """
        Generate speech for the selected phrase
        """
        if selected_phrase:
            audio_path = generate_speech(selected_phrase)
            return selected_phrase, audio_path
        return "", None

    def reset_recognition():
        """
        Reset all components
        """
        recognizer.reset_buffers()
        return (
            "Buffers reset. Ready for new recognition.",
            None,
            build_word_dropdown(),
            build_phrase_dropdown(),
        )

    with gr.Blocks() as demo:
        gr.Markdown("# ASL Medical Communication Assistant")
        gr.Markdown("Capture ASL signs to communicate medical needs and requests")

        with gr.Row():
            webcam = gr.Image(sources=["webcam"])

            with gr.Column():
                detection_output = gr.Textbox(label="Detection Results")
                word_dropdown = gr.Dropdown(choices=[], label="Select the detected word", interactive=True)
                phrase_dropdown = gr.Dropdown(choices=[], label="Select the phrase to speak", interactive=True)
                audio_output = gr.Audio(label="Speech Output")

        with gr.Row():
            capture_btn = gr.Button("Capture Sign")
            update_btn = gr.Button("Update Phrases")
            speak_btn = gr.Button("Speak Selected Phrase", interactive=True)
            reset_btn = gr.Button("Reset")

        final_phrase = gr.Textbox(label="Selected Phrase")

        capture_btn.click(
            webcam_predict,
            inputs=webcam,
            outputs=[webcam, detection_output, word_dropdown, phrase_dropdown]
        )

        # Keep the phrase list in step with the chosen word; the button below
        # remains as a manual refresh.
        word_dropdown.change(
            update_phrases,
            inputs=word_dropdown,
            outputs=phrase_dropdown
        )

        update_btn.click(
            update_phrases,
            inputs=word_dropdown,
            outputs=phrase_dropdown
        )

        speak_btn.click(
            speak_selected_phrase,
            inputs=phrase_dropdown,
            outputs=[final_phrase, audio_output]
        )

        reset_btn.click(
            reset_recognition,
            outputs=[detection_output, audio_output, word_dropdown, phrase_dropdown]
        )

    return demo

In [8]:
def main(model_path='/content/ASL-medium-20-epoch.pt'):
    """
    Build the ASL interface and launch it.

    Args:
        model_path: Path to the YOLO weights file. Defaults to the location
            used in the Colab runtime this notebook was written for.
    """
    demo = create_asl_interface(model_path)
    # debug=True keeps the cell running so errors and logs stay visible.
    demo.launch(debug=True)

if __name__ == '__main__':
    main()

Loading Bark TTS models...
Bark TTS models loaded successfully!
Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://2bbf9d00df49522cf5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)



0: 384x640 1 A, 71.2ms
Speed: 2.3ms preprocess, 71.2ms inference, 78.7ms postprocess per image at shape (1, 3, 384, 640)


  with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast():
100%|██████████| 305/305 [00:12<00:00, 23.56it/s]
100%|██████████| 16/16 [00:38<00:00,  2.38s/it]


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://2bbf9d00df49522cf5.gradio.live
