In [1]:
import pandas as pd
import numpy as np
import os
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch
import hnswlib
from transformers import AutoModel

# Read the RAG-Instruct dataset from a local copy when one sits next to the
# notebook; otherwise read it through the hf:// dataset URL.
df = pd.read_json(
    "rag_instruct.json"
    if os.path.isfile("rag_instruct.json")
    else "hf://datasets/FreedomIntelligence/RAG-Instruct/rag_instruct.json"
)

# Column that the retrieval code below indexes and returns.
documents = df["documents"]

In [2]:
# Bi-encoder used to embed queries for the nearest-neighbour search.
semb_model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
# Cross-encoder used to re-score (query, document) pairs after retrieval.
xenc_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
# Move the bi-encoder to the GPU only when one is available, so the notebook
# still runs (slowly) on CPU-only machines instead of raising.
semb_model.to('cuda' if torch.cuda.is_available() else 'cpu')

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [3]:
#corpus_embeddings = semb_model.encode(documents, convert_to_tensor=True, show_progress_bar=True)


In [4]:
# The index dimensionality must equal the width of the vectors produced by
# semb_model (384 for this checkpoint); a mismatched dim makes knn_query reject
# every query vector. Ask the model for it rather than hard-coding a number.
index = hnswlib.Index(space='cosine', dim=semb_model.get_sentence_embedding_dimension())

In [5]:
# Define hnswlib index path
index_path = "./hnswlib.index"

if os.path.exists(index_path):
    # Reuse the index persisted by an earlier run.
    print("Loading index...")
    index.load_index(index_path)
else:
    print("Start creating HNSWLIB index")
    # The corpus embeddings are needed only when the index has to be built, so
    # compute them here instead of relying on a separate (possibly skipped) cell.
    # Each entry is stringified the same way get_related_docs passes documents
    # to the cross-encoder -- NOTE(review): confirm this matches how any
    # previously saved index was built.
    corpus_texts = [str(doc) for doc in documents]
    corpus_embeddings = semb_model.encode(
        corpus_texts, convert_to_tensor=True, show_progress_bar=True
    )

    # Initialise the index with room for exactly one vector per document.
    index.init_index(max_elements=len(corpus_embeddings), ef_construction=400, M=64)
    # Labels are the positional row numbers, so search results map straight
    # back to positions in `documents`. (This may take a while.)
    index.add_items(corpus_embeddings.cpu().numpy(), list(range(len(corpus_embeddings))))

    # Save the index to a file for future loading
    print("Saving index to:", index_path)
    index.save_index(index_path)

Loading index...


In [6]:
# function to get the related docs
def get_related_docs(query, k=3):
    """Retrieve context text for ``query`` to place in the LLM prompt.

    The query is embedded with ``semb_model``, its ``k`` nearest neighbours are
    fetched from the hnswlib ``index``, and the candidates are re-scored with
    ``xenc_model``. Candidates with a positive cross-encoder score are kept in
    descending score order; when none is positive, the two best-scoring
    candidates are used instead.

    Args:
        query: The user question.
        k: Number of nearest neighbours requested from the index.

    Returns:
        A string. A single selected document is returned as its plain text;
        several documents are joined as ``"Document N:\\n<text>\\n\\n"``
        sections; an empty candidate list yields ``""``.
    """
    query_embedding = semb_model.encode(query, convert_to_tensor=True)
    corpus_ids, _ = index.knn_query(query_embedding.cpu(), k=k)

    # Index labels are positional row numbers, so look documents up by
    # position rather than by Series label.
    candidates = [str(documents.iloc[int(idx)]) for idx in corpus_ids[0]]

    cross_scores = xenc_model.predict([(query, doc) for doc in candidates])
    ranking = np.argsort(-cross_scores)  # best score first

    selected = [candidates[pos] for pos in ranking if cross_scores[pos] > 0]
    if not selected:
        # No candidate scored positive: fall back to the two best ones.
        selected = [candidates[pos] for pos in ranking[:2]]

    if len(selected) == 1:
        return selected[0]

    # One layout for both the positive and the fallback case: the header sits
    # directly above its text and a blank line separates consecutive documents.
    return "".join(
        f"Document {number}:\n{doc}\n\n" for number, doc in enumerate(selected, start=1)
    )



In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Checkpoint shared by the model weights and the tokenizer.
llm_checkpoint = "AITeamVN/Vi-Qwen2-3B-RAG"

# 4-bit NF4 quantization with double quantization; matmuls run in float16
# (bfloat16 is an alternative where supported).
quant_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

# Quantized causal LM; device placement is left to device_map="auto".
llm_model = AutoModelForCausalLM.from_pretrained(
    llm_checkpoint,
    device_map="auto",
    quantization_config=quant_config,
)

llm_tokenizer = AutoTokenizer.from_pretrained(llm_checkpoint)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

: 

In [None]:
query = "Do all plants do photosynthesis?"

context_docs = get_related_docs(query)

prompt = f"Given this context: \n{context_docs} \n\nPlease answer the question: {query}.\n\nAnswer:\n"

# Use the LLM objects loaded earlier in the notebook (llm_tokenizer / llm_model).
# The bare names `tokenizer` / `model` are only bound by later cells, and
# `model` is eventually rebound to the Whisper model, so they are not safe here.
inputs = llm_tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"].to(llm_model.device)
attention_mask = inputs["attention_mask"].to(llm_model.device)

with torch.no_grad():
    outputs = llm_model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=200,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=llm_tokenizer.eos_token_id
    )

# generate() returns prompt + continuation for a causal LM; decode only the
# continuation so the answer is not cut short when the model itself emits
# the text "Answer:".
answer = llm_tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
print("\n=== Generated Answer ===\n")
print(answer.strip())

In [None]:
def generate_response(query):
    """Answer ``query`` with the LLM, grounded on retrieved context.

    Context is fetched with ``get_related_docs`` and placed in a prompt ending
    in ``"Answer:\\n"``; ``llm_model`` then samples up to 200 new tokens.

    Args:
        query: The user question.

    Returns:
        The generated continuation only (prompt excluded), stripped of
        surrounding whitespace.

    Side effects:
        Prints a "=== Generated Answer ===" banner before returning.
    """
    context_docs = get_related_docs(query)

    prompt = f"Given this context: \n{context_docs} \n\nPlease answer the question: {query}.\n\nAnswer:\n"

    inputs = llm_tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(llm_model.device)
    attention_mask = inputs["attention_mask"].to(llm_model.device)

    with torch.no_grad():
        outputs = llm_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=llm_tokenizer.eos_token_id
        )

    # The returned sequence starts with the prompt tokens. Slice them off
    # instead of splitting the decoded text on "Answer:", which would drop
    # part of the reply whenever the model repeats that marker.
    generated_ids = outputs[0][input_ids.shape[-1]:]
    answer = llm_tokenizer.decode(generated_ids, skip_special_tokens=True)
    print("\n=== Generated Answer ===\n")
    return answer.strip()

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# 4-bit NF4 quantization settings (float16 compute; bfloat16 is an option where
# supported). Kept in this cell because the Whisper cell below reads quant_config.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # or bfloat16 if supported
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# "AITeamVN/Vi-Qwen2-3B-RAG" is already loaded as llm_model / llm_tokenizer
# earlier in the notebook. Loading it a second time would hold two copies of
# the same weights in memory, so expose the existing objects under the legacy
# names instead.
model = llm_model
tokenizer = llm_tokenizer

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import sounddevice as sd
import numpy as np
import webrtcvad
import whisper
import collections
import struct
import re
from transformers import WhisperForConditionalGeneration, WhisperProcessor, BitsAndBytesConfig
import torch

# Audio capture settings.
SAMPLE_RATE = 16000          # samples per second
CHANNELS = 1
FRAME_DURATION_MS = 30       # length of one VAD frame, in milliseconds
FRAME_SIZE = SAMPLE_RATE * FRAME_DURATION_MS // 1000  # samples per frame

# Voice-activity-detection settings.
VAD_AGGRESSIVENESS = 2       # 0–3: higher = more aggressive
MAX_SILENCE_SECONDS = 1.0    # trailing silence that ends a recording

# === Speech-to-text: Whisper ===
# Other checkpoint sizes ("small", "medium", etc.) can be tried here as well.
model_id = "openai/whisper-base"

processor = WhisperProcessor.from_pretrained(model_id)
# NOTE: this binds the global name `model`, which the LLM-loading cell above
# also assigns; from here on `model` refers to Whisper.
model = WhisperForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=quant_config,
    device_map="auto",
)

# === Voice activity detector ===
vad = webrtcvad.Vad(VAD_AGGRESSIVENESS)



In [None]:
# Regular expressions that mark an utterance as a request to end the
# conversation. is_goodbye() searches the text for each of them,
# case-insensitively, and any single match is enough.
GOODBYE_PATTERNS = [
    # Whole-word farewell terms (\b keeps e.g. "bye" from matching inside other words).
    r"\b(bye|goodbye|see you|exit|quit|farewell)\b",
    # Multi-word closing phrases, matched anywhere in the text.
    r"talk to you later",
    r"that's all",
    r"that's it"
]

def float32_to_int16(audio):
    """Convert float audio samples to 16-bit signed PCM.

    Samples are expected in [-1.0, 1.0]. Values outside that range are clipped
    first, because scaling them directly would exceed the int16 range and
    produce overflowed values after the cast.

    Args:
        audio: NumPy array of float samples.

    Returns:
        ``np.int16`` array of the same shape, scaled by 32767 (fractional parts
        are truncated by the cast).
    """
    clipped = np.clip(audio, -1.0, 1.0)
    return (clipped * 32767).astype(np.int16)

def is_speech(frame_bytes):
    """Ask the module-level VAD whether ``frame_bytes`` contains speech.

    Args:
        frame_bytes: Raw audio bytes for one frame, sampled at SAMPLE_RATE.

    Returns:
        Whatever ``vad.is_speech`` reports for the frame.
    """
    verdict = vad.is_speech(frame_bytes, SAMPLE_RATE)
    return verdict

def is_goodbye(text):
    """Return True when ``text`` matches any entry of GOODBYE_PATTERNS.

    Matching uses ``re.search`` with ``re.IGNORECASE``, so a pattern may occur
    anywhere in the text and letter case does not matter.
    """
    return any(
        re.search(pattern, text, re.IGNORECASE) for pattern in GOODBYE_PATTERNS
    )

def record_until_silence():
    """Record from the default input device until speech is followed by silence.

    Audio is read in FRAME_SIZE-sample frames. Frames the VAD classifies as
    speech are kept. Recording stops once at least one speech frame has been
    captured and MAX_SILENCE_SECONDS worth of consecutive non-speech frames
    follow it. Non-speech frames are never part of the result.

    Returns:
        1-D float32 NumPy array with all captured speech frames concatenated.
    """
    print("Listening...")

    # Number of consecutive non-speech frames that ends the recording.
    max_silent_frames = int(MAX_SILENCE_SECONDS * 1000 / FRAME_DURATION_MS)
    speech_chunks = []
    # Only the count of silent frames matters, so keep a counter rather than
    # buffering the silent audio itself.
    silent_frames = 0
    stream = sd.InputStream(samplerate=SAMPLE_RATE, channels=CHANNELS, dtype='float32', blocksize=FRAME_SIZE)

    with stream:
        while True:
            audio_chunk, _ = stream.read(FRAME_SIZE)
            audio_chunk = audio_chunk.flatten()
            # The VAD consumes raw 16-bit PCM; tobytes() yields the same
            # native-order bytes as packing every sample individually, without
            # unpacking hundreds of arguments per frame.
            frame_bytes = float32_to_int16(audio_chunk).tobytes()

            if is_speech(frame_bytes):
                speech_chunks.append(audio_chunk)
                silent_frames = 0
            else:
                silent_frames += 1
                # Silence before any speech never stops the recording.
                if silent_frames >= max_silent_frames and speech_chunks:
                    print("Silence detected, stopping...")
                    break

    return np.concatenate(speech_chunks)


def transcribe_audio_array(audio_array: np.ndarray, sampling_rate: int):
    """Transcribe an in-memory audio signal with the module-level Whisper model.

    Args:
        audio_array: Audio samples; arrays with more than one dimension are
            averaged over axis 1 to obtain a mono signal.
        sampling_rate: Sample rate of ``audio_array`` in Hz. Anything other
            than 16000 is resampled to 16000 with torchaudio.

    Returns:
        The decoded text of the first (only) sequence, special tokens removed.
    """
    # Collapse multi-channel input to mono.
    mono = audio_array.mean(axis=1) if audio_array.ndim > 1 else audio_array
    waveform = torch.tensor(mono, dtype=torch.float32)

    if sampling_rate != 16000:
        # torchaudio is imported lazily: it is only needed for resampling.
        import torchaudio
        waveform = torchaudio.functional.resample(waveform, orig_freq=sampling_rate, new_freq=16000)

    # The processor receives float32 audio; the resulting features are moved
    # to the model's device and cast to float16 before generation.
    features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features
    features = features.to(model.device, dtype=torch.float16)

    with torch.no_grad():
        predicted_ids = model.generate(features)

    decoded = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return decoded[0]


# === Conversation Loop ===
def start_conversation():
    """Run the voice conversation loop until the user says goodbye.

    Each turn records one utterance with ``record_until_silence``, transcribes
    it with ``transcribe_audio_array``, and prints the answer produced by
    ``generate_response``. The loop ends when ``is_goodbye`` matches the
    transcription. Turns with empty audio or an empty transcription are skipped.

    A previous debugging stub answered one hard-coded question and returned
    immediately, which left this loop unreachable; it has been removed.
    """
    print("Start speaking. Say 'goodbye' to end the conversation.")
    while True:
        audio = record_until_silence()
        if len(audio) == 0:
            continue  # skip empty audio

        print("Transcribing...")

        # Decode
        text = transcribe_audio_array(audio, sampling_rate=SAMPLE_RATE)
        if not text.strip():
            continue  # nothing intelligible was transcribed; do not query the LLM
        if is_goodbye(text):
            print("Goodbye detected. Ending conversation.")
            break

        generated_answer = generate_response(text)
        print(f"AI: {generated_answer}")



: 

In [None]:
# Notebook entry point: invoke the conversation routine defined in the cell above.
start_conversation()