In [1]:
!pip install faster-whisper ffmpeg-python webrtcvad edge-tts asyncio

Collecting faster-whisper
  Downloading faster_whisper-1.0.3-py3-none-any.whl.metadata (15 kB)
Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting webrtcvad
  Downloading webrtcvad-2.0.10.tar.gz (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting edge-tts
  Downloading edge_tts-6.1.12-py3-none-any.whl.metadata (4.0 kB)
Collecting asyncio
  Downloading asyncio-3.4.3-py3-none-any.whl.metadata (1.7 kB)
Collecting av<13,>=11.0 (from faster-whisper)
  Downloading av-12.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting ctranslate2<5,>=4.0 (from faster-whisper)
  Downloading ctranslate2-4.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting onnxruntime<2,>=1.14 (from faster-whisper)
  Downloading onnxruntime-1.19.0-cp3

In [3]:
# Install system prerequisites, then Ollama itself.
# NOTE(review): pciutils provides `lspci`; presumably the Ollama installer uses it
# for GPU detection — confirm before removing.
!sudo apt-get install -y pciutils
# The official install script is piped straight into a shell; review it before
# running this cell on a machine you care about.
!curl -fsSL https://ollama.com/install.sh | sh # download ollama api
from IPython.display import clear_output

# Start the Ollama API server as a background process, launched from a helper thread

import os
import threading
import subprocess
import requests
import json

def ollama():
    """Spawn ``ollama serve`` as a child process of the notebook kernel.

    The bind address and allowed origins are exported through ``os.environ``
    before the spawn, so the server process (and anything else started from
    this kernel afterwards) inherits them. ``Popen`` does not block, so the
    function returns as soon as the process has been launched; the process
    handle is not kept.
    """
    os.environ.update(
        {
            'OLLAMA_HOST': '0.0.0.0:11434',
            'OLLAMA_ORIGINS': '*',
        }
    )
    serve_command = ["ollama", "serve"]
    subprocess.Popen(serve_command)

ollama_thread = threading.Thread(target=ollama)
ollama_thread.start()
# The thread only spawns the server process, so it finishes almost immediately.
ollama_thread.join()

# `ollama serve` needs a moment before it accepts connections. Poll the port so
# that later cells (`ollama pull`, the generator) do not race the server
# start-up when the notebook is executed with "Run All".
import socket
import time

for _attempt in range(30):
    try:
        with socket.create_connection(("127.0.0.1", 11434), timeout=1):
            break
    except OSError:
        # Not listening yet; wait and retry.
        time.sleep(1)
else:
    print("Warning: Ollama did not start listening on port 11434 within 30 seconds.")

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
pciutils is already the newest version (1:3.7.0-6).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 CLI
############################################################################################# 100.0%
>>> Making ollama accessible in the PATH in /usr/local/bin
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [4]:

from IPython.display import clear_output
!ollama pull llama3.1:8b
clear_output()

!pip install -U lightrag[ollama]



In [5]:
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
import os
import ffmpeg
import webrtcvad
import wave
from faster_whisper import WhisperModel
from lightrag.core.generator import Generator
from lightrag.core.component import Component
from lightrag.core.model_client import ModelClient
from lightrag.components.model_client import OllamaClient, GroqAPIClient
import time
import asyncio
import random
import edge_tts
from edge_tts import VoicesManager

In [8]:
# JavaScript source that is injected into the notebook output frame.
# It defines a global `record(time)` function that captures microphone audio
# with MediaRecorder for `time` milliseconds and resolves to the recording
# encoded as a base64 data URL (via FileReader.readAsDataURL).
RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async ()=>{
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""

def record(sec=5):
  """Record `sec` seconds from the browser microphone and save the result.

  The RECORD JavaScript is injected into the output frame, its `record`
  function is invoked with the duration in milliseconds, and the base64
  payload of the returned data URL is decoded and written to disk.

  Returns the path of the written file ('audio.wav').
  NOTE(review): the bytes are stored exactly as MediaRecorder produced them;
  presumably that is a WebM/Ogg container rather than real WAV data — confirm.
  """
  display(Javascript(RECORD))
  duration_ms = sec * 1000
  data_url = output.eval_js('record(%d)' % duration_ms)
  # A data URL looks like "data:<mime>;base64,<payload>"; keep the payload.
  encoded_payload = data_url.split(',')[1]
  audio_path = 'audio.wav'
  with open(audio_path, 'wb') as audio_file:
    audio_file.write(b64decode(encoded_payload))
  return audio_path

def preprocess_audio(input_file, output_file):
    """Resample audio to 16 kHz mono, drop non-speech frames, and transcribe it.

    Args:
        input_file: Path of the source audio (any format ffmpeg can read).
        output_file: Path the resampled 16 kHz mono file is written to.

    Returns:
        The transcript text returned by `vad_filter` (which in turn comes
        from `transcript`) for the speech-only audio.
    """
    stream = ffmpeg.input(input_file)
    stream = ffmpeg.output(stream, output_file, ar=16000, ac=1)
    # Overwrite instead of letting ffmpeg stop on its interactive
    # "file exists, overwrite?" prompt when a previous run left the file behind.
    ffmpeg.run(stream, overwrite_output=True)
    # Filter the file that was just written (not a hard-coded name) and hand
    # the transcript back to the caller.
    return vad_filter(output_file, 'vad_filtered_audio.wav')

def vad_filter(input_file, output_file, vad_threshold=0.5):
    """Copy only the frames webrtcvad classifies as speech, then transcribe.

    Args:
        input_file: Path of a PCM WAV file to filter.
        output_file: Path the speech-only WAV file is written to.
        vad_threshold: Unused; kept so existing callers keep working.

    Returns:
        The text returned by `transcript` for `output_file`.
    """
    vad = webrtcvad.Vad()
    # Aggressiveness ranges from 0 (least aggressive about filtering out
    # non-speech) to 3 (most aggressive).
    vad.set_mode(2)

    frame_ms = 10  # webrtcvad only accepts 10, 20 or 30 ms frames

    with wave.open(input_file, 'rb') as wf:
        sample_rate = wf.getframerate()
        frame_size = sample_rate * frame_ms // 1000  # samples per frame (160 at 16 kHz)
        frame_bytes = frame_size * wf.getnchannels() * wf.getsampwidth()

        with wave.open(output_file, 'wb') as out:
            out.setnchannels(wf.getnchannels())
            out.setsampwidth(wf.getsampwidth())
            out.setframerate(sample_rate)

            frame = wf.readframes(frame_size)
            # Stop at the first incomplete frame: the trailing remainder is
            # shorter than a valid frame and webrtcvad would reject it.
            while len(frame) == frame_bytes:
                if vad.is_speech(frame, sample_rate):
                    out.writeframes(frame)
                frame = wf.readframes(frame_size)
    return transcript(output_file)

def transcript(audio_file):
    """Transcribe `audio_file` with the "medium" faster-whisper model.

    A WhisperModel is built on every call (CPU, int8 compute type). The text
    of all segments is joined with single spaces, surrounding whitespace is
    stripped, and the result is printed and returned.
    """
    whisper = WhisperModel("medium", device="cpu", compute_type="int8")
    segments, _info = whisper.transcribe(audio_file, beam_size=5)

    # Join the segment texts and trim the outer whitespace in one go.
    prompt = " ".join(segment.text for segment in segments).strip()
    print(prompt)
    return prompt

def llama(prompt):
    """Ask the llama3.1:8b model behind OllamaClient to answer `prompt`.

    The prompt is suffixed with an instruction to answer in two sentences,
    sent through a lightrag Generator, and the answer is rendered in the
    notebook as Markdown.

    Args:
        prompt: The user's question as text.

    Returns:
        The `data` attribute of the generator output.
    """
    # Imported locally so the function does not depend on the notebook's
    # implicit `display` and can render the answer as real Markdown.
    from IPython.display import Markdown, display

    qa_template = r"""
    You are a helpful assistant.

    User: {{input_str}}
    You:"""

    class SimpleQA(Component):
        """Minimal component that forwards one string to a Generator."""

        def __init__(self, model_client: ModelClient, model_kwargs: dict):
            super().__init__()
            self.generator = Generator(
                model_client=model_client,
                model_kwargs=model_kwargs,
                template=qa_template,
            )

        def call(self, input: str):
            # The value is substituted for {{input_str}} in qa_template.
            return self.generator.call({"input_str": str(input)})

        async def acall(self, input: str):
            return await self.generator.acall({"input_str": str(input)})

    model = {
        "model_client": OllamaClient(),
        "model_kwargs": {"model": "llama3.1:8b"}
    }
    qa = SimpleQA(**model)
    # Named `result` rather than `output` so the google.colab `output` module
    # imported at the top of the notebook is not shadowed inside this function.
    result = qa(f"{prompt} Respond strictly in two sentences only.")
    response = result.data
    # display() of a bare string shows its quoted repr with literal asterisks;
    # wrapping it in Markdown renders the bold label as intended.
    display(Markdown(f"**Answer:** {response}"))
    return response

async def tts(text):
    """Synthesize `text` once with a male and once with a female English voice.

    For each gender a voice is picked at random from the matching edge-tts
    voices, and the speech (rate -10%, pitch -10Hz) is saved to
    "male_response.mp3" and "female_response.mp3" respectively.
    """
    manager = await VoicesManager.create()

    # Output file -> candidate voices, in the order the files are produced.
    candidates = {
        "male_response.mp3": manager.find(Gender="Male", Language="en"),
        "female_response.mp3": manager.find(Gender="Female", Language="en"),
    }

    for filename, pool in candidates.items():
        voice_name = random.choice(pool)["Name"]
        speaker = edge_tts.Communicate(text, voice_name, rate="-10%", pitch="-10Hz")
        await speaker.save(filename)

def main():
    """Run one voice-question round: get audio, transcribe it, ask the model.

    The user chooses between recording from the microphone ('record') and
    supplying an existing file ('audio_path'). The audio goes through
    `preprocess_audio`, whose result is passed to `llama`.

    Returns:
        The model's response, or None when the choice is invalid or the
        given file does not exist.
    """
    # Prompt the user to choose recording or file path input
    choice = input("Do you want to record from the microphone or enter an audio file path? (Enter 'record' or 'audio_path'): ").strip().lower()

    if choice == 'record':
        print("Recording from microphone...")
        audio_path = record()  # Record for 5 seconds (default)
        print(f"Audio recorded and saved to {audio_path}")
    elif choice == 'audio_path':
        audio_path = input("Please enter the path to the audio file: ").strip()
        if not os.path.exists(audio_path):
            print("The file does not exist. Please check the path and try again.")
            return
        print(f"Using the provided audio file: {audio_path}")
    else:
        # Name the options exactly as they are accepted above.
        print("Invalid choice. Please enter 'record' or 'audio_path'.")
        return

    intermediate_files = ['processed_audio.wav', 'vad_filtered_audio.wav']
    try:
        # Preprocess the audio file
        prompt = preprocess_audio(audio_path, intermediate_files[0])
        response = llama(prompt)
        print(response)
    finally:
        # Clean up even when a step above fails, and tolerate files that were
        # never created.
        for path in intermediate_files:
            if os.path.exists(path):
                os.remove(path)
    return response

# Run the main function
response = main()

#Text To Speech Conversion
await tts(response)

Do you want to record from the microphone or enter an audio file path? (Enter 'record' or 'audio_path'): record
Recording from microphone...


<IPython.core.display.Javascript object>

Audio recorded and saved to audio.wav


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/2.26k [00:00<?, ?B/s]

vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/1.53G [00:00<?, ?B/s]

say about medicinal plants.


"**Answer:** I'm here to assist you with any questions or tasks you may have. What can I help you with today?"

I'm here to assist you with any questions or tasks you may have. What can I help you with today?
