In [None]:
# import numpy as np
# import faiss
# from elasticsearch import Elasticsearch
# from elasticsearch_dsl import Search, Q

# # Assuming documents are preprocessed into a list of dicts with 'id', 'text', and 'embedding'
# documents = [
#     {'id': '1', 'text': 'example text', 'embedding': np.random.rand(768).astype('float32')}
# ]

# # Set up Vector Index with Faiss

# dimension = 768  # Dimension of embeddings
# vector_index = faiss.IndexFlatL2(dimension)
# vector_index.add(np.array([doc['embedding'] for doc in documents]))

# # Set up Elasticsearch for BM25
# es = Elasticsearch("http://localhost:9200")
# # Assuming Elasticsearch index 'documents' is set up with mappings appropriate for text

# for doc in documents:
#     es.index(index='documents', id=doc['id'], body={'text': doc['text']})


In [None]:
# def search(query_embedding, query_text, top_k=10):
#     # Vector search with Faiss
#     distances, indices = vector_index.search(np.array([query_embedding]), top_k)
    
#     # Text search with Elasticsearch
#     s = Search(using=es, index='documents').query("match", text=query_text)
#     response = s.execute()

#     return indices[0], response


In [None]:
# def reciprocal_rank_fusion(vector_results, text_results):
#     rank_scores = {}
#     for rank, idx in enumerate(vector_results):
#         doc_id = documents[idx]['id']
#         rank_scores[doc_id] = rank_scores.get(doc_id, 0) + 1 / (rank + 1)
    
#     for rank, hit in enumerate(text_results):
#         doc_id = hit.meta.id
#         rank_scores[doc_id] = rank_scores.get(doc_id, 0) + 1 / (rank + 1)
    
#     # Sort by combined rank score
#     sorted_docs = sorted(rank_scores.items(), key=lambda item: item[1], reverse=True)
#     return [doc_id for doc_id, _ in sorted_docs[:top_k]]


In [None]:

from llama_index.core import SimpleDirectoryReader



import numpy as np
import os
import faiss
import sqlite3
from transformers import pipeline, DistilBertTokenizer
import torch

# Setup directory and transformer model for embedding generation.
# The transcript location can be overridden with the TRANSCRIPT_DIRECTORY
# environment variable so the notebook is not tied to one machine's layout;
# the original path is kept as the default so existing runs behave the same.
transcript_directory = os.environ.get(
    "TRANSCRIPT_DIRECTORY",
    "/mnt/c/Users/edeep/RAG/RAG_Codebase/project3_se-final_with_openai/project3_se-final_with_openai/",
)
# Tokenizer and feature-extraction pipeline are both loaded from the same
# 'distilbert-base-cased' checkpoint name.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
feature_extractor = pipeline('feature-extraction', model='distilbert-base-cased')

# Define a Document class with necessary methods
class Document:
    """Lightweight record pairing a document id and text with its embedding."""

    def __init__(self, id_, text, embedding):
        # Plain attribute storage; nothing is validated or copied.
        self.embedding = embedding
        self.text = text
        self.id_ = id_

    def get_doc_id(self):
        """Return the identifier supplied at construction time."""
        return self.id_

    def hash(self):
        """Return Python's built-in hash of the ``(id_, text)`` pair."""
        identity = (self.id_, self.text)
        return hash(identity)

# Function to read and process files
def process_files(directory):
    """Load every file under ``directory`` and compute one embedding per loaded document.

    Files are read with ``SimpleDirectoryReader`` (recursive, file name used as
    the document id). Each document's text is tokenized with truncation at 512
    tokens, passed through ``feature_extractor.model`` with gradients disabled,
    and the last hidden state is averaged over dimension 1 to give the embedding.

    Args:
        directory: Path of the directory to read.

    Returns:
        tuple: ``(documents, docs)`` where ``documents`` is a list of
        ``Document`` objects (id, first 1024 characters of text, float32
        embedding) and ``docs`` is the list returned by the reader.
    """
    reader = SimpleDirectoryReader(directory, filename_as_id=True, recursive=True)
    docs = reader.load_data()

    # Collect results in a local list. The function previously appended to a
    # module-level `documents` that is not defined on a fresh kernel (NameError)
    # and that would accumulate duplicates when the cell was re-run.
    documents = []
    for doc in docs:
        filename = doc.id_
        content = doc.text
        inputs = tokenizer(content, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = feature_extractor.model(**inputs)
        # Mean over dim 1, then squeeze away the remaining singleton dimensions
        # (assumes a batch of one, as only a single text is tokenized here).
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
        # Only the first 1024 characters of the text are kept on the wrapper.
        documents.append(Document(filename, content[:1024], embedding.astype('float32')))
    return documents, docs

# Load and process documents
documents, docs = process_files(transcript_directory)

# Set up Vector Index with Faiss
dimension = 768  # Dimension of embeddings
vector_index = faiss.IndexFlatL2(dimension)
# Force a 2-D float32 matrix of width `dimension`: this keeps an empty document
# list valid (shape (0, dimension)) and fails loudly if an embedding has an
# unexpected size instead of handing FAISS a ragged/1-D array.
embedding_matrix = np.array(
    [doc.embedding for doc in documents], dtype='float32'
).reshape(-1, dimension)
vector_index.add(embedding_matrix)

# Set up SQLite Database with Full Text Search (FTS5)
conn = sqlite3.connect('example.db')
c = conn.cursor()
c.execute('''
CREATE VIRTUAL TABLE IF NOT EXISTS documents USING fts5(id, text)
''')
# example.db lives on disk and the table is only created when missing, so rows
# from earlier runs would otherwise pile up and show up as duplicate hits.
# Rebuild the table contents from the documents loaded in this run.
c.execute("DELETE FROM documents")
c.executemany(
    "INSERT INTO documents (id, text) VALUES (?, ?)",
    [(doc.get_doc_id(), doc.text) for doc in documents],
)
conn.commit()

def search(query_embedding, query_text, top_k=10):
    """Run the vector search and the full-text search for one query.

    Args:
        query_embedding: Embedding of the query; it is wrapped into a
            single-row float32 matrix before being passed to ``vector_index``.
        query_text: Text handed to the FTS5 ``MATCH`` operator. It is still
            interpreted as an FTS5 query expression, but it is bound as a SQL
            parameter rather than spliced into the statement.
        top_k: Number of neighbours requested from the vector index. The text
            search is not limited by this value.

    Returns:
        tuple: ``(indices, ids)`` where ``indices`` is the first row of the
        index array returned by ``vector_index.search`` and ``ids`` is the list
        of ``id`` values of the matching FTS rows, best match first.
    """
    # Vector search with Faiss. The dtype is forced to float32 so a float64
    # query (e.g. the result of np.mean over Python lists) matches the dtype of
    # the stored document embeddings.
    query_matrix = np.asarray([query_embedding], dtype="float32")
    _, indices = vector_index.search(query_matrix, top_k)

    # Text search with SQLite FTS. Binding the text as a parameter means quotes
    # in the query can no longer break or alter the SQL statement. ORDER BY rank
    # returns hits by FTS5 relevance, which matters because the fusion step
    # scores each hit by its position in this list.
    c.execute(
        "SELECT id FROM documents WHERE documents MATCH ? ORDER BY rank",
        (query_text,),
    )
    text_results = c.fetchall()

    return indices[0], [row[0] for row in text_results]

def reciprocal_rank_fusion(vector_results, text_results, top_k=10):
    """Merge a vector-search ranking and a text-search ranking into one list.

    Every appearance of a document at zero-based position ``rank`` in either
    list adds ``1 / (rank + 1)`` to that document's score.

    Args:
        vector_results: Iterable of positions into the module-level
            ``documents`` list, best match first. Negative entries are ignored.
        text_results: Iterable of document ids, best match first.
        top_k: Maximum number of fused results to return.

    Returns:
        list: Up to ``top_k`` ``(doc_id, score)`` tuples sorted by descending
        score.
    """
    rank_scores = {}
    for rank, idx in enumerate(vector_results):
        # FAISS fills missing neighbours with -1 when the index holds fewer
        # vectors than requested; indexing the list with -1 would silently
        # credit the last document, so such placeholders are skipped.
        if idx < 0:
            continue
        doc_id = documents[idx].get_doc_id()
        rank_scores[doc_id] = rank_scores.get(doc_id, 0) + 1 / (rank + 1)

    for rank, doc_id in enumerate(text_results):
        rank_scores[doc_id] = rank_scores.get(doc_id, 0) + 1 / (rank + 1)

    sorted_docs = sorted(rank_scores.items(), key=lambda item: item[1], reverse=True)
    return sorted_docs[:top_k]

# Example usage

query_text = "What does vendor service do"
# Average the pipeline output over axis 1 and take the first row to get one
# vector for the query. np.mean over the pipeline's nested lists yields
# float64, while the indexed document embeddings were stored as float32, so
# the query vector is cast to the same dtype before it is searched.
query_embedding = np.mean(feature_extractor(query_text), axis=1)[0].astype('float32')
vector_results, text_results = search(query_embedding, query_text, top_k=5)
final_results = reciprocal_rank_fusion(vector_results, text_results, top_k=5)
print("Final combined results:", final_results)

# Close the SQLite connection (search() cannot be called again after this
# without reopening the connection and recreating the cursor).
conn.close()



In [None]:
from llama_index.core import VectorStoreIndex, PromptTemplate
from llama_index.llms.ollama import Ollama  
import os
from langchain.embeddings import HuggingFaceEmbeddings  # Correct class from HuggingFace
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.core import Settings, VectorStoreIndex, PromptTemplate, SimpleDirectoryReader
from llama_index.llms.ollama import Ollama  # Assuming you corrected the import here

# find docs based on doc_id
def find_doc(doc_id, docs):
    """Return the first item of ``docs`` whose ``id_`` equals ``doc_id``, or None if there is none."""
    matching = (candidate for candidate in docs if candidate.id_ == doc_id)
    return next(matching, None)

# find corresponding docs of sorted doc_ids
def find_docs(doc_ids, docs):
    """Look up each id in ``doc_ids`` with ``find_doc`` and return the results in the same order.

    The returned list has one entry per requested id; an entry is whatever
    ``find_doc`` returned for that id.
    """
    resolved = []
    for wanted_id in doc_ids:
        resolved.append(find_doc(wanted_id, docs))
    return resolved


# Resolve the fused ids back to the loaded documents. find_docs yields None for
# an id it cannot match (for example a stale row left in the FTS table), and a
# None entry would otherwise be passed on to VectorStoreIndex.from_documents,
# so unmatched ids are dropped here.
relevant_docs = [
    matched_doc
    for matched_doc in find_docs([doc_id for doc_id, _ in final_results], docs)
    if matched_doc is not None
]

print("relevant_docs")
print(relevant_docs)

# Create query pipeline 
# create index of concatenated content
# search index for query
# return relevant content
# Indexing documents
# Load the embedding model
def load_embedding_model(model_name="sentence-transformers/all-mpnet-base-v2", device="cuda"):
    """Build a ``HuggingFaceEmbeddings`` instance with normalized embeddings.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
        device: Device name placed in ``model_kwargs``. When a CUDA device is
            requested but ``torch.cuda.is_available()`` is false, ``"cpu"`` is
            used instead and a warning is emitted.

    Returns:
        The ``HuggingFaceEmbeddings`` object created with
        ``model_kwargs={"device": ...}`` and
        ``encode_kwargs={"normalize_embeddings": True}``.
    """
    # Local imports keep this helper usable even if the cell that imports
    # torch at module level has not been run in the current kernel.
    import warnings

    import torch

    if str(device).startswith("cuda") and not torch.cuda.is_available():
        warnings.warn(
            f"Device {device!r} requested but CUDA is not available; using 'cpu' instead."
        )
        device = "cpu"

    model_kwargs = {"device": device}
    encode_kwargs = {"normalize_embeddings": True}
    return HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

# Build the embedding model with the helper's defaults, wrap it in
# LangchainEmbedding, and register it on the global Settings object before the
# index is built below.
lc_embedding_model = load_embedding_model()
embed_model = LangchainEmbedding(lc_embedding_model)
Settings.embed_model = embed_model
# Question sent to the query engine at the end of this cell. Note that it is
# worded differently from `query_text` used for retrieval in the earlier cell.
query = "Explain what vendor service does"

# Index only the documents selected by the hybrid retrieval step.
index = VectorStoreIndex.from_documents(relevant_docs)

# Setting up LLM and querying capabilities
# The Ollama client is configured for the "mistral" model with a 60.0 request
# timeout (presumably seconds — confirm against the Ollama integration docs).
llm = Ollama(model="mistral", request_timeout=60.0)
Settings.llm = llm
# Streaming query engine that is asked for the 4 most similar entries per query.
query_engine = index.as_query_engine(streaming=True, similarity_top_k=4)

# Template for queries
# The template has two placeholders, {context_str} and {query_str}; it asks for
# a step-by-step, concise answer and for "I don't know!" when the answer is
# not known.
qa_prompt_tmpl_str = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information above, I want you to think step by step to answer the query in a crisp manner, in case you don't know the answer say 'I don't know!'.\n"
    "Query: {query_str}\n"
    "Answer: "
)
qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)
# Replace the engine's prompt registered under the
# "response_synthesizer:text_qa_template" key with the custom template.
query_engine.update_prompts({"response_synthesizer:text_qa_template": qa_prompt_tmpl})

# Querying the index
# The engine was created with streaming=True, so `response` is whatever
# streaming response object the engine returns; printing it relies on that
# object's string conversion — NOTE(review): confirm it renders the full answer.
response = query_engine.query(query)
print(response)