diff --git a/README.md b/README.md
index bc1f473..c5a0115 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,7 @@ You can also ask Bob about available tools:
 Detailed examples of how to interact with Bob are given in these notebooks:
 * [Basic usage](https://github.com/haesleinhuepf/bia-bob/blob/main/demo/basic_demo.ipynb)
+* [Speech recognition](https://github.com/haesleinhuepf/bia-bob/blob/main/demo/speech_recognition.ipynb)
 * [Complete Bio-image Analysis Workflow](https://github.com/haesleinhuepf/bia-bob/blob/main/demo/complete_workflow.ipynb)
 * [Accessing variables](https://github.com/haesleinhuepf/bia-bob/blob/main/demo/globals.ipynb)
 * [Image Filtering](https://github.com/haesleinhuepf/bia-bob/blob/main/demo/image_filtering.ipynb)
diff --git a/demo/speech_recognition.ipynb b/demo/speech_recognition.ipynb
new file mode 100644
index 0000000..1353d4e
--- /dev/null
+++ b/demo/speech_recognition.ipynb
@@ -0,0 +1,189 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a0e02d10-d5cb-4dc1-afb5-744209b1bc1e",
+   "metadata": {},
+   "source": [
+    "# Speech recognition\n",
+    "\n",
+    "Bob can also listen to spoken commands and execute them. This is powered by the [SpeechRecognition](https://github.com/Uberi/speech_recognition#readme) Python library."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "6683e7c1-cfc6-4ad6-9155-75f268ab7303",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from bia_bob import bob\n",
+    "\n",
+    "from skimage.io import imread"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "3da08d01-fabe-410f-baa5-3eb80e66849a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "image = imread('blobs.tif')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "bda0f4dd-8104-44dd-a937-ba7867a6301a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "bob.initialize(globals())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f9fbb587-d413-47da-be8a-45ab96c0f99f",
+   "metadata": {},
+   "source": [
+    "After executing the next cell, say into your microphone something like \"Segment the image and show the results.\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "63fd4ae0-0709-4ee1-8bf4-b09d5752dc3d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Listening...\n",
+      "You said: segment the Image and show the results\n"
+     ]
+    },
+    {
+     "data": {
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
shape(254, 256)
dtypeint32
size254.0 kB
min0
max64
\n", + "\n", + "
" + ], + "text/plain": [ + "StackViewNDArray([[0, 0, 0, ..., 4, 4, 4],\n", + " [0, 0, 0, ..., 4, 4, 4],\n", + " [0, 0, 0, ..., 4, 4, 4],\n", + " ...,\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0]])" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "The image has been segmented and the result is shown as segmented_image.jpg." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "bob.listen()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d9daecfe-246c-4ec7-9fa1-f620d2f83414", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Listening...\n", + "You said: how many objects are there in the segmented image\n" + ] + }, + { + "data": { + "text/markdown": [ + "There are 64 objects in the segmented image." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "bob.listen()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90373f38-a66f-4c31-92dd-45179afff022", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/setup.cfg b/setup.cfg index a95a731..749816e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -39,6 +39,8 @@ install_requires = napari-skimage-regionprops pandas seaborn + SpeechRecognition + PyAudio python_requires = >=3.8 include_package_data = True diff --git a/src/bia_bob/__init__.py b/src/bia_bob/__init__.py index 7b2773a..d1011a0 100644 --- a/src/bia_bob/__init__.py +++ b/src/bia_bob/__init__.py @@ -4,7 +4,9 @@ ) from ._machinery import bob, init_assistant, add_function_tool +from ._speech_recognition import _listen bob.initialize = init_assistant bob.add_function_tool = add_function_tool bob.__version__ = __version__ +bob.listen = _listen diff --git a/src/bia_bob/_speech_recognition.py b/src/bia_bob/_speech_recognition.py new file mode 100644 index 0000000..422f8f7 --- /dev/null +++ b/src/bia_bob/_speech_recognition.py @@ -0,0 +1,38 @@ +def _listen(): + """ + Activate the microphone and listen to the user. + The passed command is then executed. + """ + result = _listen_to_microphone() + if result: + print("You said:", result) + + from ._machinery import bob + bob(result) + + +def _listen_to_microphone(): + """Recognizes speech from microphone and return it as string""" + import speech_recognition as sr + + # Initialize the recognizer + recognizer = sr.Recognizer() + + with sr.Microphone() as source: + # Reducing the noise + recognizer.adjust_for_ambient_noise(source) + print("Listening...") + audio = recognizer.listen(source) + + try: + # Recognize the content + text = recognizer.recognize_google(audio) + return text + except sr.UnknownValueError: + print("Could not understand audio.") + return None + except sr.RequestError as e: + print("Error calling the API; {0}".format(e)) + return None + +