In [4]:
import os
import hashlib
import tempfile
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Document
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.postprocessor import MetadataReplacementPostProcessor, SentenceTransformerRerank
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.groq import Groq
from llama_index.core.settings import Settings
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
from dotenv import load_dotenv
import pandas as pd


W1021 20:39:44.112000 19280 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [5]:
def initialize_llm():
    """Initialize and return the Groq LLM."""
    load_dotenv()
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        raise ValueError("GROQ_API_KEY not found in .env file")
    os.environ["GROQ_API_KEY"] = api_key
    return Groq(model="llama3-70b-8192", temperature=0.1)

def load_document(file_content, file_name):
    """
    Load a document from file content and return a Document object.
    Modified to handle CSV specifically by converting to markdown table for better RAG.
    
    Args:
        file_content (bytes): Content of the uploaded file.
        file_name (str): Name of the uploaded file.
        
    Returns:
        Document: LlamaIndex Document object.
    """
    if not file_name.lower().endswith('.csv'):
        raise ValueError("Only CSV files are supported for this RAG tool.")
    
    try:
        df = pd.read_csv(file_content)
        markdown_table = df.to_markdown(index=False)
        return Document(text=markdown_table)
    except Exception as e:
        raise ValueError(f"Error processing CSV: {str(e)}")


In [7]:
def build_sentence_window_index(documents, collection_name, sentence_window_size=3):
    """
    Build a sentence window index using Chroma vector store.
    
    Args:
        documents (list): List of Document objects.
        collection_name (str): Name of the Chroma collection.
        sentence_window_size (int): Size of the sentence window.
        
    Returns:
        VectorStoreIndex: LlamaIndex vector store index.
    """
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=sentence_window_size,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    embedding_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    llm = initialize_llm()
    Settings.llm = llm
    Settings.embed_model = embedding_model
    Settings.node_parser = node_parser
    chroma_client = chromadb.PersistentClient(path="./chroma_db")
    chroma_collection = chroma_client.get_or_create_collection(name=collection_name)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    index = VectorStoreIndex.from_documents(
        documents,
        vector_store=vector_store
    )
    return index

def get_sentence_window_query_engine(index, similarity_top_k=6, rerank_top_n=2):
    """
    Create a query engine for the sentence window index.
    
    Args:
        index (VectorStoreIndex): LlamaIndex vector store index.
        similarity_top_k (int): Number of top similar nodes to retrieve.
        rerank_top_n (int): Number of nodes to rerank.
        
    Returns:
        QueryEngine: LlamaIndex query engine.
    """
    postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
    rerank = SentenceTransformerRerank(
        top_n=rerank_top_n,
        model="BAAI/bge-reranker-base"
    )
    return index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=[postproc, rerank]
    )

In [8]:
from flask import Flask, request, jsonify
from rag_tool import process_file_and_get_query_engine

app = Flask(__name__)

query_engine = None

In [12]:
from flask import Flask, request, jsonify
from rag_tool import process_file_and_get_query_engine
import os


In [None]:
csv_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'assets', 'Sugar_Spend_Data.csv'))



try:
    # Read file content as bytes
    with open(csv_path, 'rb') as file:
        file_content = file.read()
    file_name = 'Sugar_Spend_Data.csv'
    
    # Build RAG query engine
    query_engine = process_file_and_get_query_engine(file_content, file_name)
    print("RAG index built successfully")
except Exception as e:
    print(f"Error processing CSV: {str(e)}")

NameError: name '__file__' is not defined