<a href="https://colab.research.google.com/github/jessiechd/RAG_Model/blob/main/0623_5_retrieval_llm_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG: hybrid retrieval + reranker BGE

- ada beberapa tambahan function
- perubahan di function ```query_supabase``` (2 algoritma query baru yaitu ```query_supabase_hybrid``` dan ```query_supabase_bge```)

# reqs

In [6]:
!pip install supabase numpy psycopg2 vecs --q
!pip install torch transformers openai python-dotenv --q
!pip install scipy nltk fastapi uvicorn pgvector==0.3.2 psycopg2-binary --q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m80.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [7]:
import os
import json
import torch
import uuid
import numpy as np
from supabase import create_client, Client
from transformers import AutoTokenizer, AutoModel
import ast
import re
import vecs
from dotenv import load_dotenv
import openai
from scipy.spatial.distance import cosine
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import psycopg2
from pgvector.psycopg2 import register_vector
from pathlib import Path
from typing import List, Optional



nltk.download('all')
nltk.download('punkt')
nltk.download('stopwords')

env_path = Path(__file__).resolve().parents[1] / ".env"
load_dotenv(dotenv_path=env_path)
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
DB_CONNECTION = os.getenv("DB_CONNECTION")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
SUPABASE_BUCKET = os.getenv("SUPABASE_BUCKET")
openai.api_key = OPENAI_API_KEY

supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

vx = vecs.create_client(DB_CONNECTION)
vec_text = vx.get_or_create_collection(name="vec_text", dimension=768)
vec_table = vx.get_or_create_collection(name="vec_table", dimension=768)

tokenizer = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True)
model = AutoModel.from_pretrained("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True).to(
    torch.device("cuda" if torch.cuda.is_available() else "cpu"))

def get_embedding(text):
    """Generates an embedding vector from input text."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().tolist()

def get_accessible_session_ids(supabase: Client, user_id: str):
    """Mengambil ID session yang dapat diakses oleh user berdasarkan role dan aturan session."""

    user_data = (
        supabase.table("users")
        .select("user_role, is_admin")
        .eq("id", user_id)
        .single()
        .execute()
    )

    if user_data.data is None:
        return []

    user_role = user_data.data["user_role"]
    is_admin = user_data.data["is_admin"]

    if is_admin:
        sessions = supabase.table("sessions").select("id").execute()
        if sessions.data is None:
            return []
        return [s["id"] for s in sessions.data]

    sessions = supabase.table("sessions").select("id, is_public, allowed_roles, created_by").execute()
    if sessions.data is None:
        return []

    accessible_ids = []

    for session in sessions.data:
        allowed_roles = session.get("allowed_roles", [])
        if isinstance(allowed_roles, str):
            allowed_roles = allowed_roles.split(",")
        if (
            session["is_public"]
            or session["created_by"] == user_id
            or user_role in allowed_roles
        ):
            accessible_ids.append(session["id"])

    return accessible_ids

def call_openai_llm(user_query, retrieved_chunks, chat_history=[]):
    """Send the query along with retrieved context and chat history to OpenAI API."""
    context_text = "\n\n".join([f"Chunk {i+1}: {chunk[2]}" for i, chunk in enumerate(retrieved_chunks)])
    print("\n[DEBUG] Context sent to LLM:")
    print(context_text[:500])
    messages = [
        {"role": "system", "content": "You are an intelligent assistant. Use the following retrieved information to answer the user's query."},
    ]
    messages.extend(chat_history)
    messages.append({"role": "user", "content": f"Context:\n{context_text}\n\nUser's Question: {user_query}"})
    client = openai.OpenAI(api_key=openai.api_key)
    response = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=messages,
        temperature=0.7
    )
    answer = response.choices[0].message.content
    chat_history.append({"role": "user", "content": user_query})
    chat_history.append({"role": "assistant", "content": answer})
    return answer, chat_history

def chat():
    """Handles continuous chat interaction with support for new chat and conversational context."""
    chat_history = []
    print("Welcome to the assistant! Type 'exit' to end the chat, 'new chat' to start over.")

    while True:
        user_query = input("User: ")

        if user_query.lower() in ["exit", "quit"]:
            print("Chat ended.")
            break

        if user_query.lower() == "new chat":
            chat_history = []
            print("Starting a new chat...\n")
            continue


        retrieved_chunks = query_supabase(user_query)

        answer, chat_history = call_openai_llm(user_query, retrieved_chunks, chat_history)

        print(f"Assistant: {answer}\n")


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

NameError: name '__file__' is not defined

# query_supabase (current model)

In [None]:
def query_supabase(user_query, user_id, session_ids=None):
    query_embedding = get_embedding(user_query)
    embedding_str = ','.join([str(x) for x in query_embedding])

    conn = psycopg2.connect(DB_CONNECTION)
    register_vector(conn)
    cur = conn.cursor()

    TOP_K = 20

    query_text = f"""
        SELECT id, 1 - (vec <=> ARRAY[{embedding_str}]::vector) AS similarity
        FROM vecs.vec_text
        ORDER BY vec <=> ARRAY[{embedding_str}]::vector
        LIMIT {TOP_K}
    """
    cur.execute(query_text)
    text_chunk_ids = cur.fetchall()

    text_results = []
    if text_chunk_ids:
        chunk_id_list = tuple([str(row[0]) for row in text_chunk_ids])
        cur.execute(f"""
            SELECT chunk_id, content, metadata, session_id
            FROM public.documents_chunk
            WHERE chunk_id IN %s;
        """, (chunk_id_list,))
        text_chunks = {row[0]: row[1:] for row in cur.fetchall()}

        for cid, sim in text_chunk_ids:
            if cid in text_chunks:
                chunk = text_chunks[cid]
                session_id = chunk[2]
                if not session_ids or session_id in session_ids:
                    text_results.append((cid, "text", chunk[0], sim))

    query_table = f"""
        SELECT id, 1 - (vec <=> ARRAY[{embedding_str}]::vector) AS similarity
        FROM vecs.vec_table
        ORDER BY vec <=> ARRAY[{embedding_str}]::vector
        LIMIT {TOP_K}
    """
    cur.execute(query_table)
    table_chunk_ids = cur.fetchall()

    table_results = []
    if table_chunk_ids:
        chunk_id_list = tuple([str(row[0]) for row in table_chunk_ids])
        cur.execute(f"""
            SELECT chunk_id, description, metadata, session_id
            FROM public.tables_chunk
            WHERE chunk_id IN %s;
        """, (chunk_id_list,))
        table_chunks = {row[0]: row[1:] for row in cur.fetchall()}

        for cid, sim in table_chunk_ids:
            if cid in table_chunks:
                chunk = table_chunks[cid]
                session_id = chunk[2]
                if not session_ids or session_id in session_ids:
                    table_results.append((cid, "table", chunk[0], sim))

    conn.close()

    combined_results = text_results + table_results
    combined_results.sort(key=lambda x: x[3], reverse=True)

    return combined_results[:5]

# query_supabase_hybrid (hybrid vecs + bm25)
- function query_supabase with updated similarity (70% embedding similarity + 30% BM25 similarity)
- BM25 similarity = keyword search algorithm; good for answering short queries (short sentences, simple phrases)

- **updates:**
  1. penambahan function ``` hybrid_retrieve ```
  2. perubahan di bagian akhir function ```query_supabase```:

   ```combined_results = hybrid_retrieve(user_query, combined_results)```



In [None]:
import numpy as np
import ast
import re
import json
import psycopg2
from scipy.spatial.distance import cosine
from rank_bm25 import BM25Okapi

In [None]:
def hybrid_retrieve(user_query, all_chunks, top_k=10):
    documents = [chunk[2] for chunk in all_chunks]
    tokenized_corpus = [doc.split() for doc in documents]
    bm25 = BM25Okapi(tokenized_corpus)
    bm25_scores = bm25.get_scores(user_query.split())

    for i, chunk in enumerate(all_chunks):
        dense_sim = chunk[3] if chunk[3] else 0
        sparse_score = bm25_scores[i] if bm25_scores[i] else 0
        combined = 0.7 * dense_sim + 0.3 * sparse_score   # change similarity ratio
        all_chunks[i] = (*chunk, bm25_scores[i], combined)

    all_chunks.sort(key=lambda x: x[5], reverse=True)  # sort by combined score
    return all_chunks[:top_k]

In [None]:
def query_supabase_hybrid(user_query, user_id, session_ids=None):
    query_embedding = get_embedding(user_query)
    embedding_str = ','.join([str(x) for x in query_embedding])

    conn = psycopg2.connect(DB_CONNECTION)
    register_vector(conn)
    cur = conn.cursor()

    TOP_K = 20

    query_text = f"""
        SELECT id, 1 - (vec <=> ARRAY[{embedding_str}]::vector) AS similarity
        FROM vecs.vec_text
        ORDER BY vec <=> ARRAY[{embedding_str}]::vector
        LIMIT {TOP_K}
    """
    cur.execute(query_text)
    text_chunk_ids = cur.fetchall()

    text_results = []
    if text_chunk_ids:
        chunk_id_list = tuple([str(row[0]) for row in text_chunk_ids])
        cur.execute(f"""
            SELECT chunk_id, content, metadata, session_id
            FROM public.documents_chunk
            WHERE chunk_id IN %s;
        """, (chunk_id_list,))
        text_chunks = {row[0]: row[1:] for row in cur.fetchall()}

        for cid, sim in text_chunk_ids:
            if cid in text_chunks:
                chunk = text_chunks[cid]
                session_id = chunk[2]
                if not session_ids or session_id in session_ids:
                    text_results.append((cid, "text", chunk[0], sim))

    query_table = f"""
        SELECT id, 1 - (vec <=> ARRAY[{embedding_str}]::vector) AS similarity
        FROM vecs.vec_table
        ORDER BY vec <=> ARRAY[{embedding_str}]::vector
        LIMIT {TOP_K}
    """
    cur.execute(query_table)
    table_chunk_ids = cur.fetchall()

    table_results = []
    if table_chunk_ids:
        chunk_id_list = tuple([str(row[0]) for row in table_chunk_ids])
        cur.execute(f"""
            SELECT chunk_id, description, metadata, session_id
            FROM public.tables_chunk
            WHERE chunk_id IN %s;
        """, (chunk_id_list,))
        table_chunks = {row[0]: row[1:] for row in cur.fetchall()}

        for cid, sim in table_chunk_ids:
            if cid in table_chunks:
                chunk = table_chunks[cid]
                session_id = chunk[2]
                if not session_ids or session_id in session_ids:
                    table_results.append((cid, "table", chunk[0], sim))

    conn.close()

    combined_results = text_results + table_results
    combined_results = hybrid_retrieve(user_query, combined_results)

    return combined_results[:5]

# query_supabase_bge (BGE reranker)

- function query_supabase with updated hybrid similarity (70% embedding similarity + 30% BM25 similarity) and BGE reranker
- ``` hybrid_retrieve ``` sama seperti yang digunakan di ```query_supabase_hybrid```

- **perbedaan dari query_supabase_hybrid:**
  1. penambahan function ``` rerank_with_bge ``` (model cross-encoder for reranking retrieval results based on semantic similarity and textual data)
  2. penambahan function ``` rewrite_query ``` (rewrite query using TinyLlama agar lebih rapi dan embeddingnya bisa lebih terstruktur untuk sistem retrieval)
  3. perubahan di bagian akhir function ```query_supabase```:

   codeblock with comment ```### Optional```, can be implemented if computing resources are enough (BGE reranker lumayan lama processingnya. mungkin bisa di implementasi pada saat user meng-query konteks library saja)




In [None]:
!pip install rank_bm25 --q

In [None]:
import numpy as np
import ast
import re
import json
import psycopg2
from scipy.spatial.distance import cosine
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# --- BGE Reranker ---
def rerank_with_bge(query, chunks):
    model_name = "BAAI/bge-reranker-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    rerank_inputs = [
        (query, chunk[2]) for chunk in chunks  # chunk[2] = content
    ]

    inputs = tokenizer(
        [f"{q} [SEP] {p}" for q, p in rerank_inputs],
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    with torch.no_grad():
        scores = model(**inputs).logits.squeeze().tolist()

    if isinstance(scores, float):
        scores = [scores]

    reranked_chunks = [(*chunk, score) for chunk, score in zip(chunks, scores)]
    reranked_chunks.sort(key=lambda x: x[-1], reverse=True)
    return reranked_chunks

# --- TinyLlama Query Rewriter ---
from transformers import AutoTokenizer, AutoModelForCausalLM

def rewrite_query(original_query):
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    prompt = f"""You are a helpful query rewriter. Improve the following search query for a document retrieval system:
Original: {original_query}
Improved:"""

    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=30)
    rewritten = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract the improved part after 'Improved:'
    if "Improved:" in rewritten:
        rewritten = rewritten.split("Improved:", 1)[1].strip()
    return rewritten

In [None]:
def query_supabase(user_query, user_id, session_ids=None):
    query_embedding = get_embedding(user_query)
    embedding_str = ','.join([str(x) for x in query_embedding])

    conn = psycopg2.connect(DB_CONNECTION)
    register_vector(conn)
    cur = conn.cursor()

    TOP_K = 20

    query_text = f"""
        SELECT id, 1 - (vec <=> ARRAY[{embedding_str}]::vector) AS similarity
        FROM vecs.vec_text
        ORDER BY vec <=> ARRAY[{embedding_str}]::vector
        LIMIT {TOP_K}
    """
    cur.execute(query_text)
    text_chunk_ids = cur.fetchall()

    text_results = []
    if text_chunk_ids:
        chunk_id_list = tuple([str(row[0]) for row in text_chunk_ids])
        cur.execute(f"""
            SELECT chunk_id, content, metadata, session_id
            FROM public.documents_chunk
            WHERE chunk_id IN %s;
        """, (chunk_id_list,))
        text_chunks = {row[0]: row[1:] for row in cur.fetchall()}

        for cid, sim in text_chunk_ids:
            if cid in text_chunks:
                chunk = text_chunks[cid]
                session_id = chunk[2]
                if not session_ids or session_id in session_ids:
                    text_results.append((cid, "text", chunk[0], sim))

    query_table = f"""
        SELECT id, 1 - (vec <=> ARRAY[{embedding_str}]::vector) AS similarity
        FROM vecs.vec_table
        ORDER BY vec <=> ARRAY[{embedding_str}]::vector
        LIMIT {TOP_K}
    """
    cur.execute(query_table)
    table_chunk_ids = cur.fetchall()

    table_results = []
    if table_chunk_ids:
        chunk_id_list = tuple([str(row[0]) for row in table_chunk_ids])
        cur.execute(f"""
            SELECT chunk_id, description, metadata, session_id
            FROM public.tables_chunk
            WHERE chunk_id IN %s;
        """, (chunk_id_list,))
        table_chunks = {row[0]: row[1:] for row in cur.fetchall()}

        for cid, sim in table_chunk_ids:
            if cid in table_chunks:
                chunk = table_chunks[cid]
                session_id = chunk[2]
                if not session_ids or session_id in session_ids:
                    table_results.append((cid, "table", chunk[0], sim))

    conn.close()

    combined_results = text_results + table_results
    #### Optional: Query Expansion using TinyLlama ####
    expanded_query = rewrite_query(user_query)
    if expanded_query:
        user_query = expanded_query

    #### Hybrid Retrieval Scoring (BM25 + Dense) ####
    top_hybrid = hybrid_retrieve(user_query, combined_results, top_k=top_k * 2)

    #### Optional: Reranking using bge-reranker ####
    reranked = rerank_with_bge(user_query, top_hybrid)
    final_results = reranked[:top_k]

    return final_results[:5]

# main program

In [None]:



if __name__ == "__main__":
    chat()



