In [None]:
# Speech to Text
from transformers import pipeline
import torch
import os

# Pick the device index handed to the pipeline: 0 when CUDA is available, -1 otherwise.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Build the speech-recognition pipeline once so every transcription call reuses it.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-tiny.en",
    device=device,
)

def transcribe_audio(file_path):
    """Transcribe one audio file using the module-level ``transcriber``.

    Args:
        file_path: Path of the audio file to transcribe.

    Returns:
        The value stored under the ``"text"`` key of the pipeline result.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    file_missing = not os.path.exists(file_path)
    if file_missing:
        raise FileNotFoundError(f"File not found: {file_path}")

    print(f"Transcribing {file_path}...")

    # Imported lazily, only once a real file is about to be decoded.
    import librosa

    # Load resampled to 16 kHz; the returned sample rate is not used afterwards.
    waveform, _ = librosa.load(file_path, sr=16000)
    return transcriber(waveform)["text"]

# Transcribe the sample clip; `text` is reused further down as the classifier input.

text = transcribe_audio(r"audio test/OAF_back_angry.wav")
print(text)

text

In [None]:
# Categorization
from langchain_google_genai import GoogleGenerativeAI
import os
from dotenv import load_dotenv
load_dotenv()

# Read the Gemini key from the environment (load_dotenv above reads a .env file).
api_key = os.getenv("Gemini_api_key")
# Never echo the key itself: notebook outputs are saved with the file and would
# leak the secret to anyone the notebook is shared with. Report presence only.
print("Gemini API key loaded:", bool(api_key))
llm = GoogleGenerativeAI(api_key=api_key, model="gemini-2.0-flash")
llm

# Smoke test: a single round-trip confirms the key and model name are accepted.
llm.invoke("Hi")
from langchain_core.output_parsers import PydanticOutputParser
# structured output parser

from pydantic import BaseModel, Field

# Schema for the classifier's structured reply: a model with one string field.
# NOTE(review): documented with `#` comments on purpose — presumably a class
# docstring would be picked up as the schema description and change the
# generated format instructions; confirm before converting to a docstring.
class Category(BaseModel):
    # Category label for the input text; the description is attached to the field.
    category: str = Field(description="The category of the input text")

# Output parser built from the Category model.
parser = PydanticOutputParser(pydantic_object=Category)
from langchain_core.prompts import PromptTemplate

# Classification prompt. `{input}` is the only variable supplied per call;
# `{format_instructions}` is pre-bound from the parser via partial_variables.
# NOTE(review): the template allows MULTIPLE categories and asks for "category
# name(s)", while the parser's model exposes a single string field, and the
# "Output only the category name(s)" rule sits next to the parser's own format
# instructions — confirm the intended output shape.
prompt = PromptTemplate(
    input_variables=["input"],
    partial_variables={
        "format_instructions": parser.get_format_instructions()
    },
    template="""
You are MemoAI, an intelligent personal memory assistant.

Your task is to analyze the given input text (spoken or written thought)
and classify it into the most appropriate category or categories
based on the user's intent and context.

CRITICAL RULES (follow strictly):
- If the input is a short command, system instruction, test phrase, or greeting
  (e.g., "say the word back", "repeat this", "hello", "good morning"),
  ALWAYS classify it as **General**.
- Do NOT classify commands or repetition as Learning & Growth.
- Learning & Growth applies ONLY to intentional self-improvement or study.

Input:
{input}

Categories:

1. Daily Life
2. Work & Meetings
3. Learning & Growth
4. Health & Fitness
5. Money & Shopping
6. Entertainment & Leisure
7. Ideas & Creativity
8. General

Instructions:
- Choose MULTIPLE categories only if clearly applicable
- Otherwise choose ONE best category
- Output only the category name(s)
- No explanations

{format_instructions}
"""
)

# Pipe prompt -> LLM -> parser so a single call yields a parsed Category.
chain = prompt | llm | parser
text
# The template has exactly one input variable, so the explicit mapping form is
# equivalent to passing the bare string.
chain.invoke({"input": text}).category

In [None]:
# Image based storage and retrieval
from google import genai
from google.genai import types
import os

# genai.Client() is created without arguments below, so the key has to be
# present in GOOGLE_API_KEY first. Fail early with a clear message instead of
# the opaque TypeError that os.environ raises when it is assigned None.
gemini_api_key = os.getenv("Gemini_api_key")
if not gemini_api_key:
    raise RuntimeError(
        "Environment variable 'Gemini_api_key' is not set; "
        "define it (e.g. in .env) before creating the Gemini client."
    )
os.environ["GOOGLE_API_KEY"] = gemini_api_key

client = genai.Client()

from PIL import Image

def gemini_image_description(image_path):
    """Ask Gemini for a textual description of the image at ``image_path``.

    Args:
        image_path: Anything accepted by ``PIL.Image.open``.

    Returns:
        ``response.text`` from the ``generate_content`` call.
    """
    # Open through a context manager so the underlying file handle is released
    # once the request completes, even if the API call raises.
    with Image.open(image_path) as image:
        response = client.models.generate_content(
            model="gemini-2.5-pro",
            contents=[
                "Describe this image clearly. Mention shop type, items, and visual context.",
                image,
            ],
        )

    return response.text

def build_multimodal_context(voice_text, image_description):
    """Combine the voice note and the image description into one labelled text block.

    Args:
        voice_text: Text of the spoken note.
        image_description: Text description of the image.

    Returns:
        A string that starts with a newline, carries one indented, labelled
        line per input, and ends with a newline plus four spaces.
    """
    pieces = (
        "",
        f"    Voice description: {voice_text}",
        f"    Image description: {image_description}",
        "    ",
    )
    return "\n".join(pieces)

def extract_metadata(context):
    """Ask Gemini to extract structured metadata from ``context`` as JSON text.

    Args:
        context: Text inserted into the prompt under "Context:".

    Returns:
        ``response.text`` from the ``generate_content`` call. The prompt asks
        for the fields object, place, category and tags; the text is returned
        as-is and is not parsed or validated here.
    """
    prompt = f"""
    From the following context, extract structured data in JSON.

    Required fields:
    - object (what place or thing)
    - place
    - category
    - tags (list)

    Context:
    {context}

    Return ONLY valid JSON.
    """

    response = client.models.generate_content(
        model="gemini-2.5-pro",
        contents=prompt,
        # Request a JSON body from the API itself; relying on the prompt alone
        # commonly yields Markdown-fenced output that is not parseable as-is.
        config=types.GenerateContentConfig(
            response_mime_type="application/json",
        ),
    )

    return response.text

def build_search_text(voice_text, image_description, metadata_json):
    """Concatenate the three text sources into the block that gets embedded.

    Args:
        voice_text: Text of the spoken note (a "." is appended after it).
        image_description: Text description of the image (a "." is appended).
        metadata_json: Metadata text, inserted unchanged.

    Returns:
        A string that starts with a newline, has one indented line per input,
        and ends with a newline plus four spaces.
    """
    indent = "    "
    return (
        "\n"
        + indent + f"{voice_text}.\n"
        + indent + f"{image_description}.\n"
        + indent + f"{metadata_json}\n"
        + indent
    )

from sentence_transformers import SentenceTransformer

# Sentence-embedding model created once at module level; get_embedding() calls its encode().
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def get_embedding(text):
    """Encode ``text`` with the module-level ``embedder`` and return the result.

    Args:
        text: Input passed straight to ``embedder.encode``.

    Returns:
        Whatever ``embedder.encode`` returns for ``text``.
    """
    vector = embedder.encode(text)
    return vector

import faiss
import numpy as np

# Take the vector size from the embedding model instead of a hard-coded 384,
# so the index cannot silently drift out of sync if the model is changed.
dimension = embedder.get_sentence_embedding_dimension()
# L2-distance FAISS index holding one vector per stored image memory.
index = faiss.IndexFlatL2(dimension)
# Records in insertion order: memory[i] belongs to the i-th vector added to `index`.
memory = []

def store_image_memory(embedding, record):
    """Add one embedding to the FAISS ``index`` and its record to ``memory``.

    Args:
        embedding: A single embedding vector (1-D array-like of length
            ``dimension``, or an equivalent ``(1, dimension)`` array).
        record: Object appended to ``memory`` at the position matching the
            vector's position in ``index``.
    """
    # FAISS expects a C-contiguous float32 matrix of shape (n, d). Coerce here
    # so float64 arrays or plain lists do not fail inside index.add().
    vector = np.ascontiguousarray(embedding, dtype=np.float32).reshape(1, -1)
    index.add(vector)
    memory.append(record)

def ingest_image(image_path, voice_text):
    """Describe an image, derive metadata, embed everything and store the memory.

    Args:
        image_path: Path of the image to describe and remember.
        voice_text: Spoken note that accompanies the image.

    Returns:
        The literal confirmation string ``"Image memory stored"``.
    """
    description = gemini_image_description(image_path)
    metadata_json = extract_metadata(
        build_multimodal_context(voice_text, description)
    )

    # One embedding over the voice note, the description and the metadata together.
    vector = get_embedding(
        build_search_text(voice_text, description, metadata_json)
    )

    record = dict(
        image_path=image_path,
        voice_text=voice_text,
        image_description=description,
        metadata=metadata_json,
    )
    store_image_memory(vector, record)

    return "Image memory stored"

def search_image(query_text):
    """Return the image path of the stored memory closest to ``query_text``.

    Args:
        query_text: Natural-language query to embed and search with.

    Returns:
        The ``"image_path"`` value of the nearest stored record.

    Raises:
        IndexError: If nothing has been stored yet, or the index returns a
            position that has no matching entry in ``memory``.
    """
    # With an empty index FAISS reports label -1; without this guard that would
    # surface as an unhelpful "list index out of range" (or wrap to memory[-1]).
    if index.ntotal == 0 or not memory:
        raise IndexError("No image memories stored; call ingest_image() first.")

    # FAISS expects a C-contiguous float32 matrix of shape (n, d).
    query = np.ascontiguousarray(
        get_embedding(query_text), dtype=np.float32
    ).reshape(1, -1)

    _, labels = index.search(query, k=1)
    best = int(labels[0][0])
    if best < 0 or best >= len(memory):
        raise IndexError(
            f"Search returned position {best}, which has no stored record."
        )

    return memory[best]["image_path"]

# Demo: store one image memory, then fetch it back with a natural-language query.
ingest_image(
    "image.png",
    "This is a sarees shop in Nandyal remember and give me when I ask for it later",
)

img = search_image("Give me the image of sarees shop in Nandyal")
print(img)

# Open the retrieved file and hand it to PIL's viewer.
from PIL import Image
image = Image.open(img)
image.show()
import requests

# SECURITY: a literal API key used to be hard-coded on this line. Keys stored in
# a notebook leak through version control and shared copies, so that key should
# be revoked. The replacement is read from the same environment variable used
# elsewhere in this notebook.
API_KEY = os.getenv("Gemini_api_key")
if not API_KEY:
    raise RuntimeError(
        "Environment variable 'Gemini_api_key' is not set; cannot call the Gemini REST API."
    )

url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent"
params = {"key": API_KEY}

# Minimal request body: a single text part, used only as a connectivity check.
payload = {
    "contents": [
        {
            "parts": [{"text": "ping"}]
        }
    ]
}

# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.post(url, params=params, json=payload, timeout=30)

print(response.status_code)
print(response.text)