In [1]:
import sys
from pathlib import Path

root_dir = Path.cwd().parent

if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))


1: Record Audio

In [2]:
import sounddevice as sd
import soundfile as sf
import numpy as np

from src.fluent_flow import logger

In [3]:
def record_audio(duration=5, fs=44100):
    """Record audio for a specified duration."""
    logger.info("Recording will start in 3 seconds...")
    sd.sleep(3000)
    logger.info("Recording started...")
    
    recording = sd.rec(int(duration * fs), samplerate=fs, channels=1)
    sd.wait()  # Wait until recording is finished
    
    logger.info("Recording finished.")
    return recording.flatten()

def save_audio(data, filename="output.wav", fs=44100):
    sf.write(filename, data, fs)
    logger.info(f"Audio saved to {filename}")

In [4]:
audio_data = record_audio(duration=10)  # Adjust duration as needed
save_audio(audio_data)

[2024-08-08 10:42:38,353: INFO: 3822744899: Recording will start in 3 seconds...]
[2024-08-08 10:42:41,356: INFO: 3822744899: Recording started...]
[2024-08-08 10:42:51,684: INFO: 3822744899: Recording finished.]
[2024-08-08 10:42:51,692: INFO: 3822744899: Audio saved to output.wav]


2: Speech to Text

In [5]:
import sys
from pathlib import Path
import json
import wave

root_dir = Path.cwd().parent

if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))

from src.fluent_flow import logger

# Make sure you have Vosk installed: pip install vosk
from vosk import Model, KaldiRecognizer

In [6]:


def speech_to_text(audio_file, model_path):
    """
    Convert speech to text using Vosk.
    
    :param audio_file: Path to the audio file
    :param model_path: Path to the Vosk model
    :return: Transcribed text
    """
    logger.info(f"Starting speech-to-text conversion for {audio_file}")
    
    # Check if model path exists
    if not Path(model_path).exists():
        logger.error(f"Model path does not exist: {model_path}")
        return None

    try:
        # Load Vosk model
        model = Model(model_path)
        
        # Open the audio file
        wf = wave.open(str(audio_file), "rb")
        
        # Check if the audio format is compatible
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            logger.error("Audio file must be WAV format mono PCM.")
            return None
        
        # Create recognizer
        rec = KaldiRecognizer(model, wf.getframerate())
        rec.SetWords(True)
        
        # Process audio file
        results = []
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                part_result = json.loads(rec.Result())
                results.append(part_result)
        
        part_result = json.loads(rec.FinalResult())
        results.append(part_result)
        
        # Extract text from results
        text = " ".join([r['text'] for r in results if 'text' in r])
        
        logger.info("Speech-to-text conversion completed")
        return text

    except Exception as e:
        logger.error(f"Error in speech-to-text conversion: {str(e)}")
        return None

In [7]:

audio_file = "output.wav"  # This should be the file saved in step 1
model_path = "vosk-model-small-de-015"  # Replace with the path to your Vosk model

transcribed_text = speech_to_text(audio_file, model_path)

if transcribed_text:
    logger.info(f"Transcribed text: {transcribed_text}")
else:
    logger.error("Failed to transcribe audio")

[2024-08-08 10:44:31,667: INFO: 358404966: Starting speech-to-text conversion for output.wav]


LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from vosk-model-small-de-015/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:282) Loading HCL and G from vosk-model-small-de-015/graph/HCLr.fst vosk-model-small-de-015/graph/Gr.fst
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo vosk-model-small-de-015/graph/phones/word_boundary.int


[2024-08-08 10:44:33,803: INFO: 358404966: Speech-to-text conversion completed]
[2024-08-08 10:44:33,922: INFO: 2640530983: Transcribed text: er mehr geliebte den bus aber an diesem morgen zögerte sich in nein zu klettern als morgen nochmal zum haben]


3: Process Text

In [15]:
import sys
from pathlib import Path
import torch

root_dir = Path.cwd().parent

if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))

from src.fluent_flow import logger

from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM,  AutoModelForCausalLM

In [25]:
def initialize_language_model():
    """Initialize the lightweight language model and memory."""
    # Initialize the Hugging Face model
    model_name = "EleutherAI/gpt-neo-1.3B"  # or "EleutherAI/gpt-j-6B" for a larger model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Use GPU if available
    device = 0 if torch.cuda.is_available() else -1
    
    model_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device=device,
        do_sample=True,
        temperature=0.7,
    )

    # Define the prompt template
    template = """
    Du bist ein hilfreicher Deutschlehrer. Deine Aufgabe ist es, ein Gespräch auf Deutsch zu führen, 
    Fehler im Input des Benutzers zu korrigieren und kurze Erklärungen für die Korrekturen zu geben.
    Antworte immer auf Deutsch, aber füge englische Übersetzungen in Klammern für wichtige Phrasen oder Korrekturen hinzu.

    Aktuelles Gespräch:
    {chat_history}

    Mensch: {human_input}
    KI: Lass uns unser Gespräch auf Deutsch fortsetzen und eventuelle Fehler korrigieren:
    """
    
    prompt = PromptTemplate(template=template, input_variables=["chat_history", "human_input"])
    
    # Create the language model chain
    return LLMChain(llm=HuggingFacePipeline(pipeline=model_pipeline), prompt=prompt)

def process_text(llm_chain, input_text, chat_history):
    """Process the input text using the language model chain."""
    logger.info("Processing text with language model...")
    try:
        response = llm_chain.predict(human_input=input_text, chat_history=chat_history, max_new_tokens=256)
        logger.info("Text processing completed")
        return response.strip()
    except Exception as e:
        logger.error(f"Error in text processing: {str(e)}")
        return None

In [29]:
def initialize_language_model():
    """Initialize the lightweight language model and memory."""
    # Initialize the Hugging Face model
    model_name = "EleutherAI/gpt-neo-1.3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Use GPU if available
    device = 0 if torch.cuda.is_available() else -1
    
    model_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device=device,
        do_sample=True,
        temperature=0.2,
    )

    return model_pipeline

def process_text(model_pipeline, input_text, chat_history):
    """Process the input text using the language model pipeline."""
    logger.info("Processing text with language model...")
    
    prompt = f"""
    Du bist ein hilfreicher Deutschlehrer. Deine Aufgabe ist es, ein Gespräch auf Deutsch zu führen, 
    Fehler im Input des Benutzers zu korrigieren und kurze Erklärungen für die Korrekturen zu geben.
    Antworte immer auf Deutsch, aber füge englische Übersetzungen in Klammern für wichtige Phrasen oder Korrekturen hinzu.

    Aktuelles Gespräch:
    {chat_history}

    Mensch: {input_text}
    KI: Lass uns unser Gespräch auf Deutsch fortsetzen und eventuelle Fehler korrigieren:
    """
    
    try:
        response = model_pipeline(
            prompt,
            max_new_tokens=256,  # Number of tokens to generate
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id
        )
        text_response = response[0]['generated_text'].replace(prompt, "").strip()
        logger.info("Text processing completed")
        return text_response
    except Exception as e:
        logger.error(f"Error in text processing: {str(e)}")
        return None

In [30]:
llm_chain = initialize_language_model()
    
# Example transcribed text from step 2
transcribed_text = "Ich bin mude und ich mochte schlafen gehen"

# Initialize chat history
chat_history = ""

# Process the text
processed_text = process_text(llm_chain, transcribed_text, chat_history)

if processed_text:
    logger.info(f"Processed text: {processed_text}")
    # Update chat history
    chat_history += f"Human: {transcribed_text}\nAI: {processed_text}\n"
else:
    logger.error("Failed to process text")



[2024-08-08 11:10:18,093: INFO: 1451791682: Processing text with language model...]
[2024-08-08 11:12:10,972: INFO: 1451791682: Text processing completed]
[2024-08-08 11:12:10,973: ERROR: 1152501599: Failed to process text]


In [24]:
###### Test layer

from transformers import AutoTokenizer

# Initialize the tokenizer
model_name = "EleutherAI/gpt-neo-1.3B"  # or any other model you're using
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Your input text
input_text = """
Du bist ein hilfreicher Deutschlehrer. Deine Aufgabe ist es, ein Gespräch auf Deutsch zu führen, 
Fehler im Input des Benutzers zu korrigieren und kurze Erklärungen für die Korrekturen zu geben.
Antworte immer auf Deutsch, aber füge englische Übersetzungen in Klammern für wichtige Phrasen oder Korrekturen hinzu.

Aktuelles Gespräch:
Mensch: Ich bin müde und ich möchte schlafen gehen
KI: Lass uns unser Gespräch auf Deutsch fortsetzen und eventuelle Fehler korrigieren:
"""

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt")

# Get the input_ids
input_ids = inputs["input_ids"]

# Check the size of input_ids
input_length = input_ids.size(1)  # The second dimension is the sequence length
print(f"Input length (number of tokens): {input_length}")

Input length (number of tokens): 199




4: Text to Speech
5: Play Audio