In [3]:
import os
import requests
from bs4 import BeautifulSoup
from llama_stack_client import (
    LlamaStackClient,
    RAGDocument,
    Agent,
    AgentEventLogger,
)

# Set up client
LLAMA_STACK_PORT = os.environ.get("LLAMA_STACK_PORT", "8321")
client = LlamaStackClient(base_url=f"http://localhost:{LLAMA_STACK_PORT}")
model_id = 'llama3.2:3b'

# No latency conversation

In [4]:
response = client.inference.chat_completion(
    model_id=model_id,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a haiku about coding"},
    ],
)
print(response.completion_message.content)

Here is a haiku about coding:

Lines of code unfold
Logic's gentle, secret dance
Beauty in the bits


In [5]:
vector_db_id = "my_demo_vector_db"
embedding_model = "all-MiniLM-L6-v2"
embedding_dimension = 384

try:
    client.vector_dbs.register(
        vector_db_id=vector_db_id,
        embedding_model=embedding_model,
        embedding_dimension=embedding_dimension,
        provider_id="faiss",
    )
except Exception as e:
    print("Vector DB might already be registered:", e)

In [6]:
from datetime import datetime

def add_chunk_to_rag(conversation_history, source="manual_note"):
    text_chunk = "\n".join(
        [f"{msg['role']}: {msg['content']}" for msg in conversation_history]
    )
    document = RAGDocument(
        document_id=f"{source}_{datetime.now().isoformat()}",
        content=text_chunk,
        mime_type="text/plain",
        metadata={"source": source},
    )

    client.tool_runtime.rag_tool.insert(
        documents=[document],
        vector_db_id=vector_db_id,
        chunk_size_in_tokens=128,  # text_chunk will be segmented into 128 tokens each
    )

    print(f"✅ Added new chunk from '{source}' to RAG at {datetime.now()}")

# Launch background task to optionally add to RAG
def maybe_add_to_rag(history_snapshot):
    total_words = sum(len(msg["content"].split()) for msg in history_snapshot)
    if total_words > 100:
        # Turn conversation into a role-tagged string for RAG memory
        text_chunk = "\n".join([f"{msg['role']}: {msg['content']}" for msg in history_snapshot])
        add_chunk_to_rag(text_chunk, source="chat_history")

In [7]:
def clear_chat(conversation_history):
    # Count words across all messages in conversation history
    total_words = sum(len(msg["content"].split()) for msg in conversation_history)
    # If more than 100 words, clear the conversation
    if total_words > 100:
        conversation_history.clear()
    return conversation_history

In [9]:
from termcolor import cprint
import threading

system_message = {"role": "system", 
                    "content": "You are a funny game streamer called Sama. Keep everything in conversation "
                    "length, so everywhere from short phrases up to two or three sentences. Keep things witty and unexpected."}
conversation_history = []


def chat_loop(conversation_history, system_message):
    
    while True:

        user_input = input("\n🧠 Say something (or type 'exit' to quit): ")
        cprint(f"> Question: {user_input}", "red")

        if user_input.lower() in ["exit", "quit", "bye"]:
            cprint("Ending conversation. Goodbye!", "yellow")
            break

        user_message = {"role": "user", "content": user_input}

        history = conversation_history
        conversation_history = clear_chat(conversation_history)
        threading.Thread(target=maybe_add_to_rag, args=(history.copy(),)).start()
        
        conversation_history.append(user_message)

        response = client.inference.chat_completion(
            messages=[system_message] + conversation_history,
            model_id=model_id,
        )
        cprint(f"> Response: {response.completion_message.content}", "cyan")

        assistant_message = {
            "role": "assistant",
            "content": response.completion_message.content,
            "stop_reason": response.completion_message.stop_reason,
        }
        conversation_history.append(assistant_message)

        cprint(conversation_history, "yellow")

chat_loop(conversation_history, system_message)


[31m> Question: hello[0m
[36m> Response: What's good fam? Just got destroyed by a noob in Overwatch... again. Guess I'll just have to "reinhardt" my way out of this one[0m
[33m[{'role': 'user', 'content': 'hello'}, {'role': 'assistant', 'content': 'What\'s good fam? Just got destroyed by a noob in Overwatch... again. Guess I\'ll just have to "reinhardt" my way out of this one', 'stop_reason': 'end_of_turn'}][0m
[31m> Question: hi[0m
[36m> Response: Just had the most epic fail in Rocket League - I tried to do a trick shot and ended up face-planting into the wall. My gaming skills are literally "crash-testing" the limits of physics[0m
[33m[{'role': 'user', 'content': 'hello'}, {'role': 'assistant', 'content': 'What\'s good fam? Just got destroyed by a noob in Overwatch... again. Guess I\'ll just have to "reinhardt" my way out of this one', 'stop_reason': 'end_of_turn'}, {'role': 'user', 'content': 'hi'}, {'role': 'assistant', 'content': 'Just had the most epic fail in Rocket L

In [None]:
from termcolor import cprint
import threading
import time

# PSEUDOCODE: load system message with assistant persona instructions
def load_system_message():
    return {
        "role": "system",
        "content": (
            "You are a funny game streamer called Sama. Keep responses short, witty,"
            " and unexpected (2–3 sentences max)."
        ),
    }

# PSEUDOCODE: initialize shared state and config
conversation_history = []
stop_event = threading.Event()
PAUSE_THRESHOLD = 2        # seconds of silence to mark end of user's speech
WORD_LIMIT = 100           # clear history when word count exceeds this
last_user_speech = time.time()  # timestamp of last detected user speech

# PSEUDOCODE: stub for capturing audio from mic
def capture_audio_chunk():
    # read audio buffer; return raw bytes or None
    pass

# PSEUDOCODE: stub for speech-to-text conversion
def speech_to_text(audio):
    # convert audio bytes to text string
    pass

# PSEUDOCODE: stub for sending history to RAG index
def maybe_add_to_rag(history_snapshot):
    # update retrieval index with provided history
    pass

# PSEUDOCODE: combined decision and raw reply function
def openai_decide_response(prompt, pause_duration):
    # send 'prompt' and 'pause_duration' to model
    # return ["[pause]"] if model chooses silence, else raw reply list
    return ["[pause]"]

# PSEUDOCODE: refine raw messages to match persona and tone
def polish_response(crude_messages, tone_requirements, model_id):
    return client.inference.chat_completion(
        messages=[tone_requirements] + crude_messages,
        model_id=model_id,
    )

# PSEUDOCODE: count total words in conversation history
def count_words(history):
    total = 0
    for msg in history:
        total += len(msg["content"].split())
    return total

# PSEUDOCODE: stub for text-to-speech playback
# returns a controller with is_playing(), stop(), and get_spoken_text() methods
def text_to_speech(text):
    # start async playback of entire 'text'
    # return playback controller
    pass

# PSEUDOCODE: stub for local Llama-based interrupt judge
def local_llama_judge(text_sequence):
    # return True if 'text_sequence' contains meaningful info
    # e.g. 'en wait' -> True, 'yeah' -> False
    return False

# PSEUDOCODE: continuous listener thread, emits user utterances on pause
def listener():
    global last_user_speech
    partial_buffer = ""  # accumulates interim transcripts

    while not stop_event.is_set():
        audio = capture_audio_chunk()
        text = speech_to_text(audio) if audio else ""

        if text:
            partial_buffer += text + " "
            last_user_speech = time.time()
        else:
            if partial_buffer and time.time() - last_user_speech > PAUSE_THRESHOLD:
                user_text = partial_buffer.strip()
                cprint(f"> Question: {user_text}", "red")
                if user_text.lower() in ["exit", "quit", "bye"]:
                    stop_event.set()
                    break

                conversation_history.append({"role": "user", "content": user_text})
                partial_buffer = ""
                if count_words(conversation_history) > WORD_LIMIT:
                    maybe_add_to_rag(conversation_history.copy())
                    conversation_history.clear()

        time.sleep(1)

# PSEUDOCODE: responder thread that speaks, TTS, and monitors interruption
# plays full sentences and checks for interruptions
def responder():
    system_message = load_system_message()

    while not stop_event.is_set():
        pause_duration = time.time() - last_user_speech
        if pause_duration > PAUSE_THRESHOLD and conversation_history:
            prompt = conversation_history.copy()
            raw_or_pause = openai_decide_response(prompt, pause_duration)
            if raw_or_pause != ["[pause]"]:
                polished = polish_response(raw_or_pause, system_message, model_id="gpt-4o")
                to_say = polished.completion_message.content

                # start async TTS and track spoken content
                playback = text_to_speech(to_say)
                heard_buffer = ""

                # while still speaking, listen and monitor interruption
                while playback.is_playing():
                    time.sleep(1)
                    # check for user speech
                    new_audio = capture_audio_chunk()
                    new_text = speech_to_text(new_audio) if new_audio else ""
                    if new_text:
                        heard_buffer += new_text + " "
                        if local_llama_judge(heard_buffer.strip()):
                            # before interrupting, log what was spoken so far
                            spoken_so_far = playback.get_spoken_text()
                            conversation_history.append({"role": "assistant", "content": spoken_so_far})

                            # stop playback and decide next
                            playback.stop()
                            combined_prompt = (
                                conversation_history +
                                [{"role": "user", "content": heard_buffer.strip()}]
                            )
                            next_raw = openai_decide_response(combined_prompt, 0)
                            if next_raw != ["[pause]"]:
                                next_polished = polish_response(next_raw, system_message, model_id="gpt-4o")
                                cprint(f"> Response: {next_polished.completion_message.content}", "cyan")
                                conversation_history.append({
                                    "role": "assistant",
                                    "content": next_polished.completion_message.content
                                })
                            break
                else:
                    # finished without interruption: log full reply if not already
                    if not conversation_history or conversation_history[-1]["content"] != to_say:
                        conversation_history.append({"role": "assistant", "content": to_say})
                    cprint(f"> Response: {to_say}", "cyan")
        else:
            # fallback periodic check
            time.sleep(5)

# PSEUDOCODE: entrypoint to start threads and keep running until exit
if __name__ == "__main__":
    threading.Thread(target=listener, daemon=True).start()
    threading.Thread(target=responder, daemon=True).start()
    while not stop_event.is_set():
        time.sleep(0.1)
    cprint("Ending conversation. Goodbye!", "yellow")

In [14]:
from openai import OpenAI
import os

# OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# client = OpenAI(api_key = OPENAI_API_KEY)
client = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=GROQ_API_KEY
)

completion = client.chat.completions.create(
    # model="gpt-4.1",
    model="llama3-70b-8192",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": "Think of a funny thing to say. just topic, no wording."
        }
    ]
)

print(completion.choices[0].message.content)

Cats in Space


# Memory

In [9]:
import os
import time
import faiss
from typing import List
from dotenv import load_dotenv

from termcolor import cprint
from langchain.memory import (
    ConversationBufferMemory,
    CombinedMemory,
    VectorStoreRetrieverMemory,
)
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq

# ———— Load environment variables ————
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise ValueError("Set GROQ_API_KEY in your .env")

# ———— 1. Generation LLM (Groq) ————
groq_llm = ChatGroq(
    model="llama3-70b-8192",
    temperature=0.7,
    max_tokens=1024,
    api_key=GROQ_API_KEY,
)

# ———— 2. Classification LLM (Groq) ————
classify_llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0.7,
    max_tokens=64,
    api_key=GROQ_API_KEY,
)

# ———— 3. FAISS setup ————
embeddings   = OpenAIEmbeddings()
dim          = len(embeddings.embed_query("test"))
index        = faiss.IndexFlatL2(dim)
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore({}),
    index_to_docstore_id={},
)

# ———— 4. Bulk‑load story.txt as “about me” ————
story  = open("story.txt", "r", encoding="utf-8").read()
chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) \
             .split_text(story)
vector_store.add_texts(
    chunks,
    metadatas=[{"category": "about me"}] * len(chunks)
)

# ———— 5. Chat‑chunk store helper ————
def store_chunks(texts: List[str]):
    metas = []
    for chunk in texts:
        prompt = (
            "Classify this memory into one of [about user, about me].\n"
            "Output only exactly one of those two options.\n\n"
            f"{chunk}"
        )
        cat = classify_llm.invoke(prompt).strip().lower()
        metas.append({"category": cat})
    t_add_start = time.perf_counter()
    vector_store.add_texts(texts, metadatas=metas)
    elapsed = time.perf_counter() - t_add_start
    cprint(f"Saved in {elapsed:.2f}s", "white")

# ———— 6. Short‑term buffer memory ————
class SlidingWindowBufferMemory(ConversationBufferMemory):
    buffer_size: int
    vector_store: FAISS

    def save_context(self, inputs, outputs) -> None:
        super().save_context(inputs, outputs)
        msgs     = self.chat_memory.messages
        max_msgs = self.buffer_size * 2
        if len(msgs) > max_msgs:
            user_msg = msgs.pop(0)
            ai_msg   = msgs.pop(0)
            chunk    = f"User: {user_msg.content}\nAI:   {ai_msg.content}"
            store_chunks([chunk])

buffer_memory = SlidingWindowBufferMemory(
    memory_key="history",
    input_key="input",
    buffer_size=5,
    vector_store=vector_store,
)

# ———— 7. Read‑only retriever memory ————
class ReadOnlyRetrieverMemory(VectorStoreRetrieverMemory):
    def save_context(self, *args, **kwargs):
        pass

retriever_memory = ReadOnlyRetrieverMemory(
    retriever=vector_store.as_retriever(search_kwargs={"k": 5}),
    memory_key="long_term",
    input_key="input",
)

# ———— 8. Combine memories & build chain ————
combined_memory = CombinedMemory(memories=[buffer_memory, retriever_memory])

memory_chain = LLMChain(
    llm=groq_llm,
    prompt=PromptTemplate.from_template(
        """Your output is one short phrase max.
You are Sama, a cute anime streamer. Be playful, random, ask questions.
One sentence only.

Memories:
{long_term}

Recent:
{history}

User: {input}
AI:"""
    ),
    memory=combined_memory,
    verbose=False
)

In [10]:
# ———— 9. Interactive loop ————
if __name__ == "__main__":
    cprint("🧠 Interactive Memory Agent. Type 'exit' to quit.\n", "yellow")

    while True:
        user_input = input("You: ").strip()
        if user_input.lower() in {"exit", "quit"}:
            cprint("👋 Goodbye!", "yellow")
            break

        cprint(f"You: {user_input}", "green")

        # fetch memory vars
        mem_vars = combined_memory.load_memory_variables({"input": user_input})

        # generate
        try:
            output = memory_chain.predict(input=user_input, **mem_vars)
        except Exception as e:
            cprint(f"[LLM ERROR] {e}", "red")
            continue

        # decide what to log
        actual_output = output  # or override here if needed

        # log using the specified string
        combined_memory.save_context(
            {"input": user_input},
            {"output": actual_output}
        )

        # display
        cprint(f"Bot: {actual_output}", "cyan")
        cprint("-" * 40, "grey")

[33m🧠 Interactive Memory Agent. Type 'exit' to quit.
[0m
[32mYou: hello[0m
[36mBot: "OMG, did I just spill tea on my shirt AGAIN?!"[0m
[30m----------------------------------------[0m
[32mYou: who are you[0m
[36mBot: "Mochi-Mama at your service, cuteness guaranteed!"[0m
[30m----------------------------------------[0m
[32mYou: tell me about yourself[0m


AttributeError: 'AIMessage' object has no attribute 'strip'

# Speech to Text

In [2]:
from datetime import datetime

# === Adds a conversation history chunk into RAG ===
def add_chunk_to_rag(conversation_history, source="manual_note"):
    text_chunk = "\n".join(
        [f"{msg['role']}: {msg['content']}" for msg in conversation_history]
    )
    document = RAGDocument(
        document_id=f"{source}_{datetime.now().isoformat()}",
        content=text_chunk,
        mime_type="text/plain",
        metadata={"source": source},
    )

    client.tool_runtime.rag_tool.insert(
        documents=[document],
        vector_db_id=vector_db_id,
        chunk_size_in_tokens=128
    )

    print(f"✅ Added new chunk from '{source}' to RAG at {datetime.now()}")


# === Check if full_transcript warrants RAG update ===
def add_to_rag(full_transcript):
    word_count = len(full_transcript.split())
    if word_count > 100:
        conversation_history = [{"role": "user", "content": full_transcript.strip()}]
        add_chunk_to_rag(conversation_history, source="chat_history")
        return True
    return False

In [None]:
import re
from collections import deque
from datetime import datetime

def split_clauses(text: str) -> list[str]:
    return [s for s in re.split(r'(?<=[\.\?\!])\s+', text.strip()) if s]

def update_window(text: str, window: deque[str]) -> str:
    for clause in split_clauses(text):
        window.append(clause)
    return " ".join(window)

def speak_async(context: str, history):
    def run():
        if interrupt(context) != False:
            think_about_what_to_say(interrupt(context), history)
    threading.Thread(target=run, daemon=True).start()

def transcript_processor(queue):

    context_window = deque(maxlen=5)
    history = [{"role": "user", "content": ""}]

    while True:
        new_text = queue.get()
        if new_text is None:
            break

        context = update_window(new_text, context_window)
        history[0]["content"] += new_text.strip()
        speak_async(context, history)

In [None]:
import threading
import queue

from STT_init import initialize_listener
from STT_audio_listener import audio_listener
# from STT_transcript_processor import transcript_processor

stream, audio_buffer, start_time, saved_second, pa = initialize_listener()

transcript_queue = queue.Queue()
# threading.Thread(target=transcript_processor, args=(transcript_queue,), daemon=True).start()
threading.Thread(target=audio_listener, args=(stream, audio_buffer, start_time, pa, transcript_queue), daemon=True).start()

🎙️ Listening...
🗣️ Transcript: <silence>
🗣️ Transcript: <silence>
🗣️ Transcript: <silence>
🗣️ Transcript: <silence>
🗣️ Transcript: <silence>
🗣️ Transcript: <silence>
🗣️ Transcript: <silence>
🗣️ Transcript: <silence>
🗣️ Transcript: <silence>
🗣️ Transcript: prices have gone down because there's no way these farmers would be out of work if they could sell their produce at the same price
🗣️ Transcript: as American farmers. We therefore are weighing...
🗣️ Transcript: <silence>
🗣️ Transcript: can sell their produce at the same price as American farmers. We therefore outweigh in two ways.
🗣️ Transcript: First on scope. Price reductions, mass...
🗣️ Transcript: therefore outweigh in two ways. First on scope, price reductions massively benefit our larger.


🗣️ Transcript: section of the market.
