<a href="https://colab.research.google.com/github/jessiechd/RAG_Model/blob/main/0725_SessionHistory%26Summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load TinyLlama

In [None]:
import re
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# === Load TinyLlama ===
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16
)
model.eval()

# Dedicated references for the chunking LLM. A later cell rebinds the generic
# `tokenizer` / `model` names to the embedding model, so the inference wrapper
# must not look those names up at call time.
tinyllama_tokenizer = tokenizer
tinyllama_model = model


# === Inference Wrapper ===
def run_tiny_llama(prompt, max_new_tokens=256):
    """Greedily generate a continuation of `prompt` with TinyLlama.

    Args:
        prompt: Text fed to the model.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        Only the newly generated text. The prompt tokens that `generate`
        returns in front of the continuation are sliced off so callers that
        parse the answer never see the prompt's own example lines.
    """
    inputs = tinyllama_tokenizer(prompt, return_tensors="pt").to(tinyllama_model.device)
    with torch.no_grad():
        outputs = tinyllama_model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
    # outputs[0] = prompt tokens followed by generated tokens; keep the tail.
    prompt_length = inputs["input_ids"].shape[1]
    return tinyllama_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

def build_chunk_prompt(mini_chunks):
    """Build the grouping prompt for the chunking LLM.

    Each mini-chunk is tagged as ``<CHUNK{index}> {text}`` (zero-based) and the
    tagged lines are placed after a fixed instruction header that shows the
    expected ``Chunk N: <CHUNKi>, ...`` answer format.
    """
    instructions = "".join((
        "You are a smart text segmenter. Group the annotated mini-chunks into larger, semantically coherent chunks.\n",
        "Each chunk should combine 2-4 mini-chunks that belong together in meaning.\n",
        "Respond with groups using this format:\n\n",
        "Chunk 1: <CHUNK0>, <CHUNK1>\n",
        "Chunk 2: <CHUNK2>, <CHUNK3>, <CHUNK4>\n",
    ))

    tagged_lines = []
    for index, piece in enumerate(mini_chunks):
        tagged_lines.append(f"<CHUNK{index}> {piece}")

    # Header and annotated block are separated by two newlines.
    return instructions + "\n\n" + "\n".join(tagged_lines)

def _parse_chunk_groups(raw_output, batch_size):
    """Return the index groups named by 'Chunk N: <CHUNKi>, ...' lines, dropping indices >= batch_size."""
    groups = []
    for line in raw_output.splitlines():
        if not line.lstrip().startswith("Chunk"):
            continue
        indices = [int(ref) for ref in re.findall(r"<CHUNK(\d+)>", line)]
        indices = [index for index in indices if index < batch_size]
        if indices:
            groups.append(indices)
    return groups


def agentic_chunk_text(text, section_title, max_chars=300, max_mini_chunks=12):
    """Split `text` into section-titled chunks, letting the LLM group sentences.

    The text is first cut into sentence-based mini-chunks of roughly
    `max_chars` characters. Mini-chunks are then sent to the LLM in batches of
    at most `max_mini_chunks`, and the groups it answers with are merged into
    larger chunks. No text is discarded: mini-chunks the model does not
    mention (or mentions twice) are still emitted exactly once.

    Args:
        text: Section body to chunk.
        section_title: Title prepended to every chunk as ``## {title}``.
        max_chars: Approximate size limit of a mini-chunk.
        max_mini_chunks: Maximum number of mini-chunks per LLM prompt.

    Returns:
        ``(chunks, explanation)``: the chunk strings and one human-readable
        note per chunk.

    Raises:
        ValueError: If `max_mini_chunks` is smaller than 1.
    """
    if max_mini_chunks < 1:
        raise ValueError("max_mini_chunks must be at least 1")

    sentences = re.split(r'(?<=[.!?])\s+', text)
    mini_chunks, temp = [], ""
    for sent in sentences:
        if len(temp) + len(sent) < max_chars:
            temp += " " + sent
        else:
            # Guard against flushing an empty buffer, which happens when the
            # very first sentence is already longer than max_chars.
            if temp.strip():
                mini_chunks.append(temp.strip())
            temp = sent
    if temp.strip():
        mini_chunks.append(temp.strip())

    if len(mini_chunks) <= 1:
        return [f"## {section_title}\n" + text.strip()], ["Kept as one chunk"]

    grouped_chunks = []
    explanation = []

    def add_chunk(parts):
        cleaned_text = re.sub(r"\s+", " ", " ".join(parts).strip())
        grouped_chunks.append(f"## {section_title}\n" + cleaned_text)
        if len(parts) > 1:
            explanation.append(f"Chunk {len(grouped_chunks)}: grouped {len(parts)} mini-chunks")
        else:
            explanation.append(f"Chunk {len(grouped_chunks)}: kept mini-chunk")

    for start in range(0, len(mini_chunks), max_mini_chunks):
        batch = mini_chunks[start:start + max_mini_chunks]

        groups = []
        if len(batch) > 1:
            prompt = build_chunk_prompt(batch)
            raw_output = run_tiny_llama(prompt)
            # Best effort: if the wrapper echoes the prompt, drop it so the
            # example "Chunk 1:/Chunk 2:" lines are not parsed as an answer.
            if raw_output.startswith(prompt):
                raw_output = raw_output[len(prompt):]
            groups = _parse_chunk_groups(raw_output, len(batch))

        assigned = set()
        for group in groups:
            # Each mini-chunk is used once, in first-mention order.
            fresh = [index for index in dict.fromkeys(group) if index not in assigned]
            if not fresh:
                continue
            assigned.update(fresh)
            add_chunk([batch[index] for index in fresh])

        # Anything the model left out is kept as its own chunk.
        for index, mini_chunk in enumerate(batch):
            if index not in assigned:
                add_chunk([mini_chunk])

    return grouped_chunks, explanation



# supabase setup + initialize vecs

In [None]:
!pip install supabase numpy psycopg2 --q

In [None]:
!pip install transformers sentencepiece -q

In [None]:
import os
import json
import torch
import uuid
import numpy as np
from supabase import create_client, Client
from transformers import AutoTokenizer, AutoModel

# Initialize Supabase
# Credentials are read from the environment so that no secret is stored in the
# notebook; a missing variable fails immediately with a KeyError.
SUPABASE_URL = os.environ["SUPABASE_URL"]
SUPABASE_KEY = os.environ["SUPABASE_KEY"]

supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

# Load Embedding Model
# NOTE: this rebinds the notebook-level `tokenizer` / `model` names (previously
# the TinyLlama objects) to the embedding model that get_embedding() uses.
EMBEDDING_MODEL_ID = "Alibaba-NLP/gte-multilingual-base"
embedding_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_ID, trust_remote_code=True)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_ID, trust_remote_code=True).to(embedding_device)


In [None]:
!pip install vecs --q

In [None]:
import vecs
from vecs.adapter import Adapter, ParagraphChunker, TextEmbedding

# Postgres connection string, read from the environment so the credentials are
# not stored in the notebook; a missing variable fails with a KeyError.
DB_CONNECTION = os.environ["DB_CONNECTION"]

vx = vecs.create_client(DB_CONNECTION)


In [None]:
# One vecs collection per chunk type: store_chunks_in_supabase() upserts text
# embeddings into `vec_text` and table embeddings into `vec_table`.
# NOTE(review): dimension=768 presumably equals the length of the vector
# returned by get_embedding() — confirm against the embedding model.
vec_text = vx.get_or_create_collection(name="vec_text", dimension=768)
vec_table = vx.get_or_create_collection(name="vec_table", dimension=768)

# embedding + store to DB

In [None]:
def get_embedding(text):
    """Embed `text` with the module-level `tokenizer` / `model`.

    The input is tokenized (truncated to 512 tokens), run through the model
    without gradients, and the last hidden state is averaged over dim 1.
    The squeezed result is returned as a plain Python list.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    encoded = encoded.to(model.device)

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    pooled = hidden_states.mean(dim=1).squeeze()
    return pooled.cpu().tolist()

# embs = get_embedding("Uses of Machine learning")
# print(embs.shape)

In [None]:

def generate_table_description(table_data):
    """Describe a table as text, one ``header: value`` list per row.

    Args:
        table_data: Mapping with ``"headers"`` (column names) and ``"rows"``
            (a sequence of rows, each a sequence of cell values).

    Returns:
        Rows rendered as ``"h1: v1, h2: v2"`` and joined with ``" | "``.
        Headers and cells are paired positionally; a row shorter than the
        header list contributes only the cells it has (instead of raising
        IndexError) and cells beyond the last header are ignored.
    """
    headers = table_data["headers"]
    return " | ".join(  # Separate rows with "|"
        ", ".join(f"{header}: {cell}" for header, cell in zip(headers, row))
        for row in table_data["rows"]
    )

def convert_table_to_text(table_data, metadata):
    """Render a table plus its metadata as text suitable for embedding.

    Args:
        table_data: Mapping with ``"headers"`` and ``"rows"``.
        metadata: Mapping that may provide ``"table_title"`` and ``"section"``;
            ``None`` is treated as an empty mapping.

    Returns:
        ``(formatted_text, description)`` where `description` is the output of
        generate_table_description() and `formatted_text` combines title,
        section, headers, pipe-separated rows and that description.
    """
    metadata = metadata or {}

    # Cells and headers are stringified first: str.join() rejects numbers and
    # None, which are common in parsed tables.
    headers = ", ".join(str(header) for header in table_data["headers"])
    rows = [" | ".join(str(cell) for cell in row) for row in table_data["rows"]]

    # Retrieve metadata fields
    table_title = metadata.get("table_title", "Unknown Table")
    section = metadata.get("section", "Unknown Section")

    # Generate description from table data
    table_description = generate_table_description(table_data)

    # Combine metadata with table content
    formatted_text = (
        f"Table Title: {table_title}. Section: {section}.\n"
        f"Table Data:\nHeaders: {headers}\n" + "\n".join(rows) +
        f"\nDescription: {table_description}"
    )
    return formatted_text, table_description  # Return both formatted text & natural description


In [None]:

def store_chunks_in_supabase(chunks):
    """Embed text/table chunks and persist them to Supabase and the vecs collections.

    For every chunk a fresh UUID is generated. Non-empty ``"content"`` is
    embedded and queued for the ``documents`` table and `vec_text`; a non-empty
    ``"table"`` is converted to text, embedded and queued for the ``tables``
    table and `vec_table`. A chunk carrying both shares one chunk_id.

    Args:
        chunks: Iterable of dicts with optional ``"content"``, ``"table"`` and
            ``"metadata"`` keys. Missing or ``None`` metadata is stored as ``{}``
            for both text and table chunks.
    """
    document_entries = []
    table_entries = []
    text_records = []
    table_records = []

    for chunk in chunks:
        chunk_id = str(uuid.uuid4())  # Generate unique chunk_id
        # Same tolerant lookup for both paths (the text path used to raise
        # KeyError when a chunk had no metadata).
        metadata = chunk.get("metadata") or {}

        # Process text content
        content = chunk.get("content")
        if content:
            embedding = get_embedding(content)
            document_entries.append({
                "chunk_id": chunk_id,
                "content": content,
                "metadata": metadata,
                "type": "text",
            })
            text_records.append((chunk_id, embedding, metadata))

        # Process table data
        table_data = chunk.get("table")
        if table_data:
            # Generate both structured table text & natural description
            table_text, table_description = convert_table_to_text(table_data, metadata)
            table_embedding = get_embedding(table_text)

            table_entries.append({
                "chunk_id": chunk_id,
                "table_data": json.dumps(table_data, ensure_ascii=False),
                "description": table_description,  # Store the generated description
                "metadata": metadata,
            })
            table_records.append((chunk_id, table_embedding, metadata))

    # Batch insert into Supabase; skip calls that would send nothing.
    if document_entries:
        supabase.table("documents").insert(document_entries).execute()
    if table_entries:
        supabase.table("tables").insert(table_entries).execute()

    if text_records:
        vec_text.upsert(records=text_records)
    if table_records:
        vec_table.upsert(records=table_records)


# query embeddings

In [None]:
import numpy as np
import ast
import re
from scipy.spatial.distance import cosine
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# Download only what this notebook uses: the tokenizer models behind
# word_tokenize() and the stopword list. nltk.download('all') would fetch every
# NLTK corpus and model, which is slow and unnecessary here.
# 'punkt_tab' is the tokenizer resource name expected by newer NLTK releases;
# requesting it on a release that does not know it is reported, not raised.
for _nltk_resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(_nltk_resource, quiet=True)

def get_embedding(text):
    """Embed `text` with the module-level `tokenizer` / `model`.

    Redefinition with the same body as the earlier get_embedding cell: tokenize
    (truncated to 512 tokens), run the model without gradients, average the
    last hidden state over dim 1, squeeze, and return a plain Python list.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    encoded = encoded.to(model.device)

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    pooled = hidden_states.mean(dim=1).squeeze()
    return pooled.cpu().tolist()

def extract_keywords_simple(text):
    """Return the lower-cased alphanumeric tokens of `text` that are not English stopwords."""
    stop_words = set(stopwords.words('english'))

    keywords = []
    for token in word_tokenize(text.lower()):
        # Drop punctuation / mixed-symbol tokens first, then stopwords.
        if not token.isalnum():
            continue
        if token in stop_words:
            continue
        keywords.append(token)
    return keywords

# Whole-word match (with an optional trailing "s") for the table-related terms.
_TABLE_KEYWORD_PATTERN = re.compile(
    r"\b(?:table|data|values|measurements|limits|thresholds|parameters|average|sum|percentage)s?\b"
)


def query_requires_table(user_query):
    """Return True when the query contains a table-related keyword.

    Keywords are matched as whole words, case-insensitively, so that e.g.
    "summarize" or "assume" no longer count as the keyword "sum". A trailing
    "s" is accepted ("tables", "averages").
    """
    return bool(_TABLE_KEYWORD_PATTERN.search(user_query.lower()))

def get_most_similar_keywords(query_keywords, top_text_chunks):
    """Keep the query keywords that occur in the retrieved chunks' text.

    The text of each chunk is read from index 2, lower-cased and tokenized.
    If no query keyword appears in any chunk, the original keyword list is
    returned unchanged.
    """
    vocabulary = set()
    for chunk in top_text_chunks:
        vocabulary |= set(word_tokenize(chunk[2].lower()))  # words of the chunk text

    matches = [keyword for keyword in query_keywords if keyword in vocabulary]
    if matches:
        return matches
    return query_keywords  # fallback: nothing overlapped



# hybrid bm25 + vec

In [None]:
!pip install rank_bm25 --q

In [None]:
import numpy as np
import ast
import re
import json
import psycopg2
from scipy.spatial.distance import cosine
from rank_bm25 import BM25Okapi

# Assume: get_embedding(), extract_keywords_simple(), query_requires_table() are already defined

def hybrid_retrieve(user_query, all_chunks, top_k=10):
    """Re-rank candidate chunks by a weighted mix of dense and BM25 scores.

    Args:
        user_query: Raw query string; tokenized by lower-casing and splitting
            on whitespace.
        all_chunks: Sequence of ``(id, type, content, dense_similarity)``
            tuples. The sequence itself is not modified.
        top_k: Number of results to return.

    Returns:
        Up to `top_k` new tuples ``(*chunk, bm25_score, combined_score)``
        sorted by `combined_score` descending. `bm25_score` is the raw BM25
        value; `combined_score` is ``0.7 * dense + 0.3 * bm25 / max(bm25)``.
        An empty candidate list yields ``[]``.
    """
    if not all_chunks:
        # BM25 cannot be built from an empty corpus.
        return []

    # chunk = (id, type, content, sim); content may be missing for some rows.
    tokenized_corpus = [(chunk[2] or "").lower().split() for chunk in all_chunks]

    if any(tokenized_corpus):
        bm25 = BM25Okapi(tokenized_corpus)
        bm25_scores = [float(score) for score in bm25.get_scores(user_query.lower().split())]
    else:
        # No document has any token: there is nothing to score lexically.
        bm25_scores = [0.0] * len(all_chunks)

    # BM25 is unbounded while the dense similarity is at most 1; scale BM25 by
    # its maximum so the 0.7 / 0.3 weights actually balance the two signals.
    max_sparse = max(bm25_scores)

    scored = []
    for chunk, sparse_score in zip(all_chunks, bm25_scores):
        dense_sim = chunk[3] or 0
        sparse_norm = sparse_score / max_sparse if max_sparse > 0 else 0.0
        combined = 0.7 * dense_sim + 0.3 * sparse_norm
        scored.append((*chunk, sparse_score, combined))

    scored.sort(key=lambda item: item[5], reverse=True)  # sort by combined score
    return scored[:top_k]

def _fetch_ranked_chunks(cur, vector_table, content_table, content_column, chunk_type, query_vector, limit):
    """Return (chunk_id, chunk_type, content, similarity) for the `limit` nearest vectors in `vector_table`."""
    # Table and column names come from constants in query_supabase(), never
    # from user input; all values are passed as bound parameters.
    cur.execute(
        f"""
        SELECT id, 1 - (vec <=> %s) AS similarity
        FROM {vector_table}
        ORDER BY vec <=> %s
        LIMIT %s
        """,
        (query_vector, query_vector, limit),
    )
    ranked = cur.fetchall()
    if not ranked:
        return []

    # A tuple parameter is adapted by psycopg2 to a valid IN (...) list, which
    # also works for a single id (a formatted Python tuple renders "('x',)").
    chunk_ids = tuple(str(row[0]) for row in ranked)
    cur.execute(
        f"""
        SELECT chunk_id, {content_column}
        FROM {content_table}
        WHERE chunk_id IN %s
        """,
        (chunk_ids,),
    )
    content_by_id = {row[0]: row[1] for row in cur.fetchall()}
    return [
        (chunk_id, chunk_type, content_by_id[chunk_id], similarity)
        for chunk_id, similarity in ranked
        if chunk_id in content_by_id
    ]


def query_supabase(user_query, top_k=5, candidate_k=10):
    """Hybrid Retrieval (BM25 + Dense Embedding) without reranking.

    Embeds the query, fetches the `candidate_k` nearest text vectors and the
    `candidate_k` nearest table vectors, joins them with their stored content
    (``documents.content`` / ``tables.description``) and lets
    hybrid_retrieve() pick the final `top_k`.

    Args:
        user_query: The user's question.
        top_k: Number of chunks returned.
        candidate_k: Nearest neighbours fetched per collection before the
            hybrid re-ranking.

    Returns:
        The list produced by hybrid_retrieve().
    """
    query_embedding = np.array(get_embedding(user_query), dtype=np.float32).flatten()
    query_vector = json.dumps(query_embedding.tolist())

    conn = psycopg2.connect(DB_CONNECTION)
    try:
        with conn.cursor() as cur:
            text_results = _fetch_ranked_chunks(
                cur, "vecs.vec_text", "public.documents", "content", "text", query_vector, candidate_k
            )
            table_results = _fetch_ranked_chunks(
                cur, "vecs.vec_table", "public.tables", "description", "table", query_vector, candidate_k
            )
    finally:
        # Always release the connection, including when a query fails.
        conn.close()

    #### Combine Results and Run Hybrid ####
    return hybrid_retrieve(user_query, text_results + table_results, top_k=top_k)


# LLM function

In [None]:
import openai

# OpenAI API Key
# Read from the environment so the key is never stored in the notebook; a
# missing variable fails immediately with a KeyError.
import os

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
openai.api_key = OPENAI_API_KEY


In [None]:
def call_openai_llm(user_query, retrieved_chunks, chat_history=None):
    """Send the query along with retrieved context and chat history to OpenAI API.

    Args:
        user_query: The user's question.
        retrieved_chunks: Sequence of chunks whose text is at index 2.
        chat_history: Optional list of ``{"role": ..., "content": ...}``
            messages. When omitted, a new list is created for this call (a
            shared default list would carry messages over between unrelated
            calls). A list passed in is extended in place.

    Returns:
        ``(answer, chat_history)`` where `chat_history` has the new user
        question and assistant answer appended.
    """
    if chat_history is None:
        chat_history = []

    # Sanitize chat history (only well-formed message dicts are forwarded)
    safe_history = []
    for msg in chat_history:
        if isinstance(msg, dict) and "role" in msg and "content" in msg:
            safe_history.append(msg)
        else:
            print("⚠️ Skipping malformed chat history entry:", msg)

    # Prepare context from retrieved chunks
    context_text = "\n\n".join(
        f"Chunk {i+1}: {chunk[2]}" for i, chunk in enumerate(retrieved_chunks)
    )

    # Construct messages for OpenAI Chat API
    messages = [
        {"role": "system", "content": "You are an intelligent assistant. Use the following retrieved information to answer the user's query."},
        *safe_history,
        {"role": "user", "content": f"Context:\n{context_text}\n\nUser's Question: {user_query}"}
    ]

    # Make API call
    client = openai.OpenAI(api_key=openai.api_key)  # New client-style API
    response = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=messages,
        temperature=0.7
    )

    # Extract and update chat history
    answer = response.choices[0].message.content
    chat_history.append({"role": "user", "content": user_query})
    chat_history.append({"role": "assistant", "content": answer})

    return answer, chat_history


# extract NER function (spacy model) 🆕

In [None]:
!python -m spacy download en_core_web_sm --q


In [None]:
import spacy
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from typing import List, Dict, Tuple

# Load SpaCy NER model once
_spacy_nlp = spacy.load("en_core_web_sm")

def extract_entities(text: str, ner_model: str = "spacy_sm") -> Dict[str, List[str]]:
    """Extract named entities from `text`, grouped by entity label.

    Args:
        text: Text to analyse.
        ner_model: Name of the NER backend. Only ``"spacy_sm"`` (the spaCy
            pipeline loaded above) is available; the parameter exists so that
            callers which pass a model name keep working.

    Returns:
        Mapping of entity label to the distinct entity texts found, in order
        of first appearance.

    Raises:
        ValueError: If `ner_model` names a backend other than ``"spacy_sm"``.
    """
    if ner_model != "spacy_sm":
        raise ValueError(f"Unsupported ner_model: {ner_model!r} (only 'spacy_sm' is loaded)")

    # A dict per label de-duplicates while keeping insertion order, so the
    # output is deterministic (a set would not be).
    entities: Dict[str, Dict[str, None]] = {}
    doc = _spacy_nlp(text)
    for ent in doc.ents:
        entities.setdefault(ent.label_, {})[ent.text] = None

    return {label: list(texts) for label, texts in entities.items()}


In [None]:
# Example chat history (raw text) used to try out entity extraction

hist = """🔹 Input Query:
explain the process of medical training

 🔹 Chatbot Response:
 The process of medical training, particularly for family medicine in Africa as described in the provided context, involves a comprehensive, hands-on approach that takes place primarily in clinical settings such as clinics, district hospitals, or regional hospitals. Here is a summary of the key components of this training process:

1. **Clinical Workplace Training**: Trainees receive their education directly at the bedside or next to the patient in various medical settings. This method emphasizes real-world experiences where direct patient contact is central to the learning process.

2. **Supervised Learning**: An essential part of the training involves continuous direct supervision by senior medical professionals. Supervisors observe trainees during their interactions with patients and provide immediate feedback and reflection. This direct supervision facilitates an interactive learning environment crucial for developing practical skills.

3. **Learning Tools and Assessment**: Various tools are employed to enhance and evaluate the learning process:
   - **Mini-Clinical Evaluation Exercise (CEX)** and **Direct Observation of Procedural Skills (DOPS)** are used to assess specific skills during patient encounters.
   - **Logbooks** track the progress and experiences of the trainees.
   - **Global Assessment Tool** is used for broader evaluation, typically at the end of each rotation, involving both the trainee and the supervisor. This is complemented by continuous assessments every two weeks where trainees present topics and undergo quizzes.

4. **Integrated Assessments**: The training incorporates both continuous assessments and specific periodical reviews (like quarterly assessments in some programs). These assessments often involve practical tests and reflection sessions that help in gauging the progress and understanding of the trainees.

5. **Accreditation and Networked Training**: There is an emphasis on the accreditation of training facilities and creating a network of teaching campuses across different locations. This network allows for a diverse and comprehensive training experience, balancing between different hospital settings and primary health care environments.

6. **Workshops and Conferences**: Participation in broader workshops and conferences, such as those organized by the World Organisation of Family Doctors (WONCA), also plays a role in training by providing additional learning opportunities and exposure to the global medical community.

Overall, the training process is dynamic and interactive, focusing on direct patient care, continuous feedback, and structured assessments to ensure comprehensive learning and professional development in family medicine.
 """




print(extract_entities(hist))

# context  (chat history + entity) and summarizer 🆕

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Seq2seq model and tokenizer used by ChatContextManager.summarize_text().
SUMMARIZER_MODEL_ID = "t5-small"
summarizer_tokenizer = AutoTokenizer.from_pretrained(SUMMARIZER_MODEL_ID)
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZER_MODEL_ID)

from transformers import PreTrainedTokenizerFast
# GPT-2 tokenizer used only by count_tokens() to measure text length in tokens.
# NOTE(review): presumably an approximation of the chat model's own token
# count, since a different model answers the queries — confirm if exact limits matter.
llm_tokenizer = PreTrainedTokenizerFast.from_pretrained("gpt2")

def count_tokens(text):
    """Return how many tokens `llm_tokenizer` produces for `text`."""
    token_ids = llm_tokenizer.encode(text)
    return len(token_ids)

In [None]:
class ChatContextManager:
    """Keeps a rolling chat history and folds it into a running summary.

    Turns are stored with their extracted entities. Once enough turns or
    tokens have accumulated, the stored turns are summarized, the summary is
    appended to the running summary, and the turn buffer is cleared.
    """

    def __init__(self, summarize_every_turns=3, summarize_every_tokens=1000, ner_model="spacy_sm"):
        """Create an empty context.

        Args:
            summarize_every_turns: Summarize after this many user/assistant
                exchanges since the last summary.
            summarize_every_tokens: Summarize once the buffered messages reach
                this many tokens (as measured by count_tokens()).
            ner_model: Name of the NER backend; stored for reference only.
        """
        self.chat_history: List[Dict] = []
        self.summary: str = ""
        self.turns_since_last_summary: int = 0
        self.summarize_every_turns = summarize_every_turns
        self.summarize_every_tokens = summarize_every_tokens
        self.ner_model = ner_model

    def summarize_text(self, text, max_input_tokens=512, max_output_tokens=150):
        """Summarize `text` with the module-level summarizer model.

        The input is prefixed with ``"summarize: "`` and truncated to
        `max_input_tokens`; the summary is generated with beam search and is
        at most `max_output_tokens` tokens long.
        """
        inputs = summarizer_tokenizer.encode(
            "summarize: " + text,
            return_tensors="pt",
            max_length=max_input_tokens,
            truncation=True
        )
        summary_ids = summarizer_model.generate(
            inputs,
            max_length=max_output_tokens,
            min_length=30,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True
        )
        return summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    def add_turn(self, user_msg: str, bot_msg: str):
        """Record one user/assistant exchange and summarize if a threshold is hit."""
        # extract_entities() is called with the text only; passing the model
        # name as an extra positional argument does not match its signature.
        user_ents = extract_entities(user_msg)
        bot_ents = extract_entities(bot_msg)

        self.chat_history.append({
            "role": "user",
            "message": user_msg,
            "entities": user_ents
        })

        self.chat_history.append({
            "role": "assistant",
            "message": bot_msg,
            "entities": bot_ents
        })

        self.turns_since_last_summary += 1

        if self.should_summarize():
            self.update_summary()

    def should_summarize(self):
        """Return True when the turn or token threshold has been reached."""
        full_text = " ".join(turn["message"] for turn in self.chat_history)
        token_count = count_tokens(full_text)
        return (
            self.turns_since_last_summary >= self.summarize_every_turns
            or token_count >= self.summarize_every_tokens
        )

    def update_summary(self):
        """Summarize the buffered turns, append to the running summary, and clear the buffer."""
        full_text = "\n".join(f"{t['role']}: {t['message']}" for t in self.chat_history)
        new_summary = self.summarize_text(full_text)
        print("\n📝 Summary updated:\n", new_summary, "\n")
        self.summary = f"{self.summary}\n{new_summary}" if self.summary else new_summary
        self.chat_history = []
        self.turns_since_last_summary = 0

    def get_context_for_llm(self, recent_n=2):
        """Return the running summary plus the last `recent_n` exchanges as text.

        Returns an empty string when there is neither a summary nor any recent
        turn, so callers can skip sending an empty context message.
        """
        # A slice of [-0:] would return the whole history, so handle
        # non-positive values explicitly.
        recent_turns = self.chat_history[-recent_n * 2:] if recent_n > 0 else []
        if not self.summary and not recent_turns:
            return ""
        recent_text = "\n".join(f"{t['role']}: {t['message']}" for t in recent_turns)
        return f"Summary:\n{self.summary}\n\nRecent Turns:\n{recent_text}"


# test session history  🆕

In [None]:
# Interactive test loop: retrieve chunks for each query, answer with the LLM,
# and feed the exchange back into the session context.
chat_ctx = ChatContextManager()

print("Type '0' to exit, '1' to reset chat history. \n")

while True:
    print("\n🔹🔹🔹🔹🔹🔹\n")
    user_query = input("\n🔹 Input Query:\n").strip()

    if user_query == "0":
        print("\n exiting...")
        break

    if user_query == "1":
        # Reset with the class defined in this notebook (the previous name,
        # ChatContextManagerNER, is not defined anywhere here).
        chat_ctx = ChatContextManager()  # reset context manager
        print("\n chat history cleared. \n")
        continue

    try:
        # Retrieval is inside the try block so a failed database query is
        # reported without ending the session.
        retrieved_chunks = query_supabase(user_query)
        context = chat_ctx.get_context_for_llm()
        chat_history = [{"role": "user", "content": context}] if context.strip() else []

        response, _ = call_openai_llm(user_query, retrieved_chunks, chat_history)
        print("\n 🔹 Chatbot Response:\n", response)
        chat_ctx.add_turn(user_query, response)

    except Exception as e:
        print("ERROR:", e)

# explain the process of medical training