In [1]:
import sys
from pathlib import Path

# Parent of the current working directory.
# NOTE(review): presumably the project root that contains the `fluent_flow` package - confirm.
root_dir = Path.cwd().parent

# Add it to the import path only once, so re-running this cell does not append duplicates.
if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))


1: Record Audio

In [2]:
import sounddevice as sd
import soundfile as sf
import numpy as np

from fluent_flow import logger

In [None]:
def record_audio(duration=5, fs=44100):
    """Capture a single-channel recording after a three-second countdown.

    :param duration: Length of the recording in seconds
    :param fs: Sample rate in Hz
    :return: The recorded samples flattened to one dimension
    """
    countdown_ms = 3000
    logger.info("Recording will start in 3 seconds...")
    sd.sleep(countdown_ms)
    logger.info("Recording started...")

    frame_count = int(duration * fs)
    samples = sd.rec(frame_count, samplerate=fs, channels=1)
    # sd.rec returns immediately; block here until the capture has finished.
    sd.wait()

    logger.info("Recording finished.")
    return samples.flatten()

def save_audio(data, filename="output.wav", fs=44100):
    """Write audio samples to disk and log the destination.

    :param data: Audio samples to write
    :param filename: Target file path
    :param fs: Sample rate in Hz stored with the file
    """
    sf.write(file=filename, data=data, samplerate=fs)
    logger.info(f"Audio saved to {filename}")

In [None]:
# Record 10 seconds at record_audio's default sample rate, then store the samples
# using save_audio's defaults (filename "output.wav", fs=44100).
audio_data = record_audio(duration=10)  # Adjust duration as needed
save_audio(audio_data)

2: Speech to Text

In [None]:
import sys
from pathlib import Path
import json
import wave

# Same import-path bootstrap as the first cell, repeated so this section can run on its own.
# NOTE(review): presumably the parent directory is the project root holding `fluent_flow` - confirm.
root_dir = Path.cwd().parent

# Guarded so that repeated execution does not add the directory twice.
if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))

from fluent_flow import logger

# Make sure you have Vosk installed: pip install vosk
from vosk import Model, KaldiRecognizer

In [None]:


def speech_to_text(audio_file, model_path):
    """
    Convert speech to text using Vosk.

    :param audio_file: Path to the audio file; it must be a mono, 16-bit,
                       uncompressed PCM WAV file
    :param model_path: Path to the Vosk model
    :return: Transcribed text (an empty string when nothing was recognized),
             or None if the model path is missing, the audio format is
             unsupported, or recognition raised an error
    """
    logger.info(f"Starting speech-to-text conversion for {audio_file}")

    # Check if model path exists
    if not Path(model_path).exists():
        logger.error(f"Model path does not exist: {model_path}")
        return None

    try:
        # Load Vosk model
        model = Model(model_path)

        # The context manager guarantees the WAV handle is closed on every
        # exit path, including the early return below and any exception.
        with wave.open(str(audio_file), "rb") as wf:
            # Check if the audio format is compatible
            if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
                logger.error("Audio file must be WAV format mono PCM.")
                return None

            # Create recognizer at the file's own sample rate
            rec = KaldiRecognizer(model, wf.getframerate())
            rec.SetWords(True)

            # Feed the audio in chunks; a truthy AcceptWaveform marks a finished segment
            results = []
            while True:
                data = wf.readframes(4000)
                if len(data) == 0:
                    break
                if rec.AcceptWaveform(data):
                    results.append(json.loads(rec.Result()))

            # Collect whatever is still buffered in the recognizer
            results.append(json.loads(rec.FinalResult()))

        # Skip segments with missing or empty text so silence does not add
        # stray spaces (which would make an empty transcript look truthy).
        text = " ".join(r["text"] for r in results if r.get("text"))

        logger.info("Speech-to-text conversion completed")
        return text

    except Exception as e:
        logger.error(f"Error in speech-to-text conversion: {str(e)}")
        return None

In [None]:

# Transcribe the step-1 recording with the local Vosk model.
audio_file = "output.wav"  # This should be the file saved in step 1
model_path = "vosk-model-small-de-015"  # Replace with the path to your Vosk model

transcribed_text = speech_to_text(audio_file, model_path)

# An empty or None result counts as a failed transcription.
if not transcribed_text:
    logger.error("Failed to transcribe audio")
else:
    logger.info(f"Transcribed text: {transcribed_text}")

### Alternative

In [None]:
import openai
from fluent_flow import logger

def speech_to_text_with_whisper(audio_file_path, model="whisper-1"):
    """
    Convert speech to text using OpenAI Whisper.

    Uses the module-level client of the `openai` package (openai>=1.0), which
    reads its credentials from the environment.

    :param audio_file_path: Path to the audio file
    :param model: Whisper model to use (default is "whisper-1")
    :return: Transcribed text, or None if the request or file access failed
    """
    logger.info(f"Starting speech-to-text conversion for {audio_file_path}")

    try:
        # Open the audio file in binary mode
        with open(audio_file_path, "rb") as audio_file:
            # The transcription endpoint lives under the lowercase `audio`
            # resource; `openai.Audio.transcriptions` does not exist.
            response = openai.audio.transcriptions.create(
                model=model,
                file=audio_file,
                response_format="text",  # Options: "text", "json", "srt", "verbose_json", "vtt"
            )

        # With response_format="text" the SDK returns the transcript as a plain
        # string, so subscripting it with ['text'] would raise. Fall back to the
        # `.text` attribute in case a structured response object is returned.
        transcribed_text = response if isinstance(response, str) else response.text
        transcribed_text = transcribed_text.strip()

        logger.info("Speech-to-text conversion completed")
        return transcribed_text

    except Exception as e:
        logger.error(f"Error in speech-to-text conversion: {str(e)}")
        return None

In [None]:

# Example usage: transcribe the recorded file through the Whisper helper.
audio_file = "output.wav" # Path to your WAV audio file
transcribed_text = speech_to_text_with_whisper(audio_file)

# An empty or None result counts as a failed transcription.
if not transcribed_text:
    logger.error("Failed to transcribe audio")
else:
    logger.info(f"Transcribed text: {transcribed_text}")

3: Process Text

### openai

In [None]:
from dotenv import load_dotenv
import os

from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Load environment variables
load_dotenv()

In [None]:
def initialize_language_model():
    """Build the LLMChain that pairs the OpenAI completion model with the tutor prompt.

    The API key is read from the OPENAI_API_KEY environment variable.

    :return: The configured LLMChain, or None when the API key is not set
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if api_key:
        chain = LLMChain(
            llm=OpenAI(api_key=api_key, model="gpt-3.5-turbo-instruct"),
            prompt=generate_prompt(),
        )
        logger.info("LLMChain initialized with OpenAI.")
        return chain

    # Without a key there is nothing to build; report and signal failure.
    logger.error("API key is not set in the environment variables.")
    return None

def generate_prompt():
    """Create the prompt template for the German-tutor conversation.

    The (German) template tells the model to act as a German teacher who
    briefly corrects the user's input, explains the correction, and keeps the
    conversation going. It exposes two placeholders: ``chat_history`` and
    ``input_text``.

    :return: A PromptTemplate built from the template text and both placeholders
    """
    tutor_template = """
    Du bist ein hilfreicher Deutschlehrer. Deine Aufgabe ist es, die Eingaben des Benutzers auf Deutsch kurz und prägnant zu korrigieren und eine einfache Erklärung für die Korrektur zu geben. Halte die Konversation lebendig, indem du relevante Kommentare oder Vorschläge machst.

    Hier ist ein Beispiel, wie du antworten solltest:
    Mensch: Ich habe heute ein neues Buch gekauft.
    KI: Korrektur: "Ich habe heute ein neues Buch gekauft." (I bought a new book today.) 
    Erklärung: Der Satz ist korrekt! Bücher sind eine großartige Möglichkeit, neue Ideen zu entdecken. Welches Buch hast du gekauft?

    Aktuelles Gespräch:
    {chat_history}
    Mensch: {input_text}
    KI: Korrektur und Kommentar:
    """
    placeholders = ["chat_history", "input_text"]
    return PromptTemplate(input_variables=placeholders, template=tutor_template)

def process_text(llm_chain, input_text, chat_history):
    """Run one user utterance through the LLM chain and return the cleaned reply.

    :param llm_chain: Chain object exposing ``prompt.format(...)`` and ``run(...)``
    :param input_text: The user's latest utterance
    :param chat_history: Previous exchanges, inserted into the prompt
    :return: The chain's response with surrounding whitespace removed,
             or None if anything in the pipeline raised
    """
    logger.info("Processing text with LLMChain...")
    try:
        # Values for the two placeholders of the prompt template.
        input_variables = dict(chat_history=chat_history, input_text=input_text)

        # Debugging aid: show exactly what is fed to the chain.
        logger.info(f"Input Variables: {input_variables}")

        # Render the prompt separately, purely so it can be logged.
        prompt = llm_chain.prompt.format(**input_variables)
        logger.info(f"Generated Prompt: {prompt}")

        raw_reply = llm_chain.run(**input_variables)
        logger.info("Text processing completed")
        return raw_reply.strip()
    except Exception as e:
        logger.error(f"Error in text processing: {str(e)}")
        return None
    
def update_chat_history(chat_history, new_human_text, new_ai_text, max_turns=3):
    """Append one exchange to the chat history and keep only the most recent ones.

    The history is a newline-separated string of ``Human: ...`` / ``AI: ...``
    entries. An exchange starts at a line beginning with ``Human:`` and runs up
    to the next such line, so AI replies that span several lines stay attached
    to their exchange.

    :param chat_history: History returned by a previous call ("" for a new chat)
    :param new_human_text: The user's latest utterance
    :param new_ai_text: The model's reply to that utterance
    :param max_turns: Number of most recent exchanges to keep; values <= 0
                      yield an empty history
    :return: The trimmed history, without leading or trailing whitespace
    """
    previous = chat_history.strip()
    latest = f"Human: {new_human_text}\nAI: {new_ai_text}"
    # The returned history has no trailing newline, so the separator must be
    # added here; otherwise the new turn is glued onto the last AI line.
    combined = f"{previous}\n{latest}" if previous else latest

    # Group lines into exchanges instead of counting raw lines, so truncation
    # never cuts through the middle of a multi-line reply.
    exchanges = []
    for line in combined.strip().split('\n'):
        if line.startswith("Human:") or not exchanges:
            exchanges.append([line])
        else:
            exchanges[-1].append(line)

    # A slice of [-0:] would keep everything, so handle non-positive limits explicitly.
    kept = exchanges[-max_turns:] if max_turns > 0 else []
    return '\n'.join('\n'.join(exchange) for exchange in kept)


In [None]:
# Build the chain once; the cells below reuse `llm_chain` and `chat_history`.
llm_chain = initialize_language_model()

if llm_chain:
    transcribed_text = "Moin! Wie geht es dir?"
    chat_history = ""

    processed_text = process_text(llm_chain, transcribed_text, chat_history)
    if not processed_text:
        logger.error("Failed to process text")
    else:
        logger.info(f"Processed text: {processed_text}")
        # Remember this exchange so the next prompt includes it.
        chat_history = update_chat_history(chat_history, transcribed_text, processed_text)

In [None]:
# Second turn: relies on `llm_chain` and `chat_history` created by the previous cell.
transcribed_text = "Auch gut, danke! Ja ich lebe ich Hamburg seit ne weile und 'Moin' ist mir beigrbracht"
processed_text = process_text(llm_chain, transcribed_text, chat_history)
if not processed_text:
    logger.error("Failed to process text")
else:
    logger.info(f"Processed text: {processed_text}")
    chat_history = update_chat_history(chat_history, transcribed_text, processed_text)

4: Text to Speech
5: Play Audio

In [None]:
from gtts import gTTS
from pydub import AudioSegment
import os

def text_to_speech(text, language='de', mp3_filename='output.mp3', wav_filename='output.wav'):
    """Convert text to speech (German by default) and save it as a WAV file.

    The speech is first written to ``mp3_filename`` by gTTS, then converted to
    WAV; the intermediate MP3 is always removed afterwards. Errors are logged,
    not raised.

    Note: the default ``wav_filename`` is the same path save_audio writes to by
    default, so pass an explicit name to avoid overwriting a recording.

    :param text: Text to synthesize
    :param language: gTTS language code
    :param mp3_filename: Path of the temporary MP3 file
    :param wav_filename: Path of the resulting WAV file
    :return: None
    """
    try:
        # Convert text to speech and save as MP3
        tts = gTTS(text=text, lang=language)
        tts.save(mp3_filename)
        logger.info(f"Text-to-speech conversion completed and saved to {mp3_filename}")

        # Convert MP3 to WAV
        audio = AudioSegment.from_mp3(mp3_filename)
        audio.export(wav_filename, format='wav')
        logger.info(f"Audio converted to WAV and saved to {wav_filename}")

    except Exception as e:
        logger.error(f"Error in text-to-speech conversion: {str(e)}")

    finally:
        # Clean up the intermediate MP3 even when conversion or export failed,
        # so a failed run does not leave a stray file behind.
        if os.path.exists(mp3_filename):
            try:
                os.remove(mp3_filename)
            except OSError as e:
                logger.error(f"Error in text-to-speech conversion: {str(e)}")

In [None]:
from pydub import AudioSegment
from pydub.playback import play

def play_audio(filename='output.wav'):
    """Play a WAV file through pydub's playback helper.

    Errors (missing file, no playback backend, ...) are logged, not raised.

    :param filename: Path of the WAV file to play
    :return: None
    """
    try:
        audio = AudioSegment.from_wav(filename)
        # Log before playback: play() blocks until the clip has finished, so
        # logging afterwards would announce playback only once it is over.
        logger.info(f"Playing audio file {filename}")
        play(audio)
    except Exception as e:
        logger.error(f"Error playing audio file: {str(e)}")

In [None]:
# Third turn: correct the utterance, then speak the correction aloud.
transcribed_text = "Also hamburg, bremen. Ich war auch inn Sylt"
processed_text = process_text(llm_chain, transcribed_text, chat_history)
if not processed_text:
    logger.error("Failed to process text")
else:
    logger.info(f"Processed text: {processed_text}")
    chat_history = update_chat_history(chat_history, transcribed_text, processed_text)

    # Synthesize the reply into its own WAV file so the recording in output.wav is kept.
    text_to_speech(processed_text, wav_filename='response.wav')

    # Play back what was just synthesized.
    play_audio('response.wav')

### Conversation

In [None]:
import os
import wave
import json
from vosk import Model, KaldiRecognizer
import openai
from fluent_flow import logger
from langchain import OpenAI, LLMChain, PromptTemplate
import sounddevice as sd
import numpy as np
import queue
import sys

# Function to record audio
def record_audio(filename, duration=10, samplerate=16000, blocksize=1024):
    """Record mono 16-bit PCM audio from the input device straight into a WAV file.

    Note: running this cell replaces the earlier notebook-level
    ``record_audio(duration, fs)``, which has a different signature.

    :param filename: Path of the WAV file to write
    :param duration: Approximate recording length in seconds
    :param samplerate: Sample rate in Hz
    :param blocksize: Number of frames delivered per callback; also the unit
                      used to compute how many blocks make up ``duration``
    :return: None
    """
    q = queue.Queue()

    def callback(indata, frames, time, status):
        # Runs on the audio thread: report stream problems and hand the block over.
        if status:
            print(status, file=sys.stderr)
        q.put(indata.copy())

    # Number of fixed-size blocks that together cover the requested duration.
    n_blocks = int(samplerate / blocksize * duration)

    # dtype='int16' matches the 2-byte sample width declared in the WAV header
    # (the stream default is float32, whose raw bytes would corrupt the file).
    # An explicit blocksize makes every callback deliver exactly `blocksize`
    # frames, which the block count above relies on.
    with sd.InputStream(samplerate=samplerate, channels=1, dtype='int16',
                        blocksize=blocksize, callback=callback):
        print(f"Recording for {duration} seconds...")
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(samplerate)
            for _ in range(n_blocks):
                wf.writeframes(q.get().tobytes())

def start_conversation(vosk_model_path, openai_model="whisper-1"):
    """
    Run a spoken conversation loop: record, transcribe, correct, speak.

    Each round records five seconds of audio, transcribes it with Vosk, sends
    the text to the language model, then synthesizes and plays the reply. The
    loop ends when the transcription contains "exit". Rounds whose
    transcription or processing fails are skipped and a new recording starts.

    :param vosk_model_path: Path to the Vosk model for speech recognition
    :param openai_model: Currently unused - transcription is done with Vosk via
                         speech_to_text. Kept so existing callers keep working.
    """
    # Initialize the language model
    llm_chain = initialize_language_model()
    if llm_chain is None:
        logger.error("Failed to initialize language model.")
        return

    chat_history = ""
    audio_file_path = "user_input.wav"
    # Separate file for the spoken reply so it never clobbers a recording.
    response_wav_path = "response.wav"

    while True:
        # Record audio input from the user
        record_audio(audio_file_path, duration=5)

        # Convert speech to text
        transcribed_text = speech_to_text(audio_file_path, vosk_model_path)
        if not transcribed_text:
            logger.error("Failed to transcribe audio.")
            continue

        # Process the text with the language model
        ai_response = process_text(llm_chain, transcribed_text, chat_history)
        if not ai_response:
            logger.error("Failed to process text.")
            continue

        # Update chat history
        chat_history = update_chat_history(chat_history, transcribed_text, ai_response)

        # text_to_speech only logs failures, so remove the previous reply first;
        # otherwise a failed synthesis would replay stale audio.
        if os.path.exists(response_wav_path):
            os.remove(response_wav_path)

        # Convert AI response to speech and actually play it back to the user
        text_to_speech(ai_response, wav_filename=response_wav_path)
        play_audio(response_wav_path)

        # Log the conversation
        logger.info(f"Human: {transcribed_text}")
        logger.info(f"AI: {ai_response}")

        # Break condition (optional)
        if "exit" in transcribed_text.lower():
            logger.info("Exiting conversation.")
            break

In [None]:
# Start the interactive loop with the Vosk model directory given as a relative path
# (the same model directory name used in the speech-to-text cell above).
# The loop runs until a transcription contains "exit".
vosk_model_path = "vosk-model-small-de-015"
start_conversation(vosk_model_path)