In [None]:
!pip install chromadb -q

In [None]:
!pip install langchain langchain-community

In [None]:
!pip install unstructured

In [None]:
from langchain.document_loaders import DirectoryLoader, TextLoader
import json
import os
from langchain.schema import Document

# Input locations; relative paths, so they resolve against the notebook's
# current working directory.
txt_dir = "../data/txt"  # Plain-text source files (*.txt), searched recursively
json_dir = "../data/gpt-json"  # JSON files holding Query/Solution entries

# Load text files
def load_txt_docs(txt_dir):
    """Load every ``.txt`` file below ``txt_dir`` (recursively) as a document.

    Args:
        txt_dir: Directory to search; the glob ``**/*.txt`` is applied to it.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files,
        each file being read by ``TextLoader`` as UTF-8.
    """
    text_loader_options = {"encoding": "utf-8"}
    directory_loader = DirectoryLoader(
        txt_dir,
        glob="**/*.txt",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_options,
    )
    return directory_loader.load()

def _clean_text(value):
    """Return ``value`` as a stripped string; ``None`` becomes ``""``."""
    return "" if value is None else str(value).strip()


def load_json_docs(json_dir):
    """Build one Document per Q&A entry found in the ``*.json`` files of ``json_dir``.

    Each file is expected to contain a JSON array of objects with the keys
    ``"Query"`` and ``"Solution"`` and, optionally, ``"source"`` and
    ``"keywords"``. Files are read in sorted name order so the resulting
    document order is reproducible between runs.

    Malformed input is reported with a warning and skipped instead of
    aborting the whole load: files whose top level is not an array, and
    array items that are not objects. ``null`` values for ``"Query"`` /
    ``"Solution"`` are treated as empty strings.

    Args:
        json_dir: Directory that is scanned (non-recursively) for ``*.json``.

    Returns:
        list of ``Document`` whose ``page_content`` is
        ``"Query: <query>\\nSolution: <solution>"`` and whose metadata holds
        ``"source"`` (the entry's own value, else the file name) and
        ``"keywords"`` (a comma-separated string, possibly empty).

    Raises:
        OSError: if the directory or a file cannot be read.
        json.JSONDecodeError: if a ``.json`` file is not valid JSON.
    """
    docs = []
    # os.listdir returns names in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(json_dir)):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(json_dir, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            json_data = json.load(f)

        if not isinstance(json_data, list):
            print(f"⚠️ Warning: Expected a JSON array in {filename}, skipping file")
            continue

        for entry in json_data:
            if not isinstance(entry, dict):
                print(f"⚠️ Warning: Skipping non-object entry in {filename}")
                continue

            query = _clean_text(entry.get("Query"))
            solution = _clean_text(entry.get("Solution"))
            # Fall back to the file name when "source" is missing, null or empty.
            source = entry.get("source") or filename
            keywords = entry.get("keywords")

            # Metadata values are stored as plain strings, so flatten the
            # keyword list; a keywords value that is already a string is kept.
            if isinstance(keywords, list):
                keywords_str = ", ".join(str(k) for k in keywords if k is not None)
            elif isinstance(keywords, str):
                keywords_str = keywords.strip()
            else:
                keywords_str = ""

            # Log missing keywords
            if not keywords_str:
                print(f"⚠️ Warning: No keywords found for query: \"{query}\" in {source}")

            # Combine query & solution into one document
            content = f"Query: {query}\nSolution: {solution}"

            docs.append(Document(
                page_content=content,
                metadata={"source": source, "keywords": keywords_str},
            ))
    return docs


# Load both corpora from the directories configured above
txt_docs = load_txt_docs(txt_dir)
json_docs = load_json_docs(json_dir)

# Single combined list (text documents first, then JSON Q&A documents);
# this is the input to the chunking step
all_docs = txt_docs + json_docs

# Sanity check: how many documents were loaded in total
print(f"Total documents loaded: {len(all_docs)}\n")

In [None]:
# Dump every loaded document (1-based index, source, keywords, content),
# each followed by an 80-dash divider line
for i, doc in enumerate(all_docs, start=1):
    print(
        f"Document {i} (Source: {doc.metadata['source']}):\n"
        f"Keywords: {doc.metadata.get('keywords', [])}\n"
        f"{doc.page_content}\n"
        f"{'-' * 80}\n"
    )

In [None]:
from langchain.schema import Document

def split_documents_by_qna(docs, max_chunk_size=2048):
    """Group the Q&A pairs of each document into chunks of bounded size.

    A document's text is split on ``"\\nQuery: "``; every resulting piece is
    normalised to start with ``"Query: "`` exactly once. Consecutive pairs of
    the same document are packed into one chunk (joined by a blank line) as
    long as the accumulated size, counting the separators, does not exceed
    ``max_chunk_size``. Pairs are never cut: a single pair longer than
    ``max_chunk_size`` is emitted unsplit as its own chunk.

    Blank documents and bare ``"Query:"`` markers without any text produce no
    chunk, and no empty chunk is ever emitted.

    Args:
        docs: Iterable of objects exposing ``page_content`` (str) and
            ``metadata`` (dict).
        max_chunk_size: Soft upper bound on a chunk's length in characters.

    Returns:
        list of ``Document``; each carries its own copy of the originating
        document's metadata, so editing one chunk's metadata does not affect
        its siblings.
    """
    prefix = "Query: "
    separator = "\n\n"
    split_chunks = []

    def emit(text, metadata):
        # Copy per chunk: sharing one dict would couple all chunks of a document.
        split_chunks.append(Document(page_content=text, metadata=metadata.copy()))

    for doc in docs:
        content = doc.page_content.strip()

        pending = []      # normalised Q&A pairs of the chunk being built
        pending_size = 0  # their total length, each followed by a separator

        for piece in content.split("\n" + prefix):
            qna = piece.strip()
            # Skip empty pieces and markers that carry no text at all.
            if not qna or qna == prefix.strip():
                continue
            if not qna.startswith(prefix):
                qna = prefix + qna

            # Flush before overflowing - but only if something was collected,
            # otherwise an empty chunk would be stored.
            if pending and pending_size + len(qna) > max_chunk_size:
                emit(separator.join(pending), doc.metadata)
                pending, pending_size = [], 0

            if len(qna) > max_chunk_size:
                # Too big to share a chunk; keep it whole rather than truncate.
                emit(qna, doc.metadata)
            else:
                pending.append(qna)
                pending_size += len(qna) + len(separator)

        if pending:
            emit(separator.join(pending), doc.metadata)

    return split_chunks

# Apply structured splitting
docs = split_documents_by_qna(all_docs)

print(f"✅ Total chunks after structured splitting: {len(docs)}\n")
if docs:
    # The second chunk is shown as the example when there is one; with a
    # single chunk that one is shown instead of raising IndexError.
    print(f"Example Chunk:\n{docs[min(1, len(docs) - 1)].page_content}\n")
    print(f"Example Last Chunk:\n{docs[-1].page_content}\n")
else:
    print("⚠️ Warning: No chunks were produced - check the input directories.")


In [None]:
!pip install -U langchain-ollama

In [None]:
from langchain_ollama import OllamaEmbeddings

# Embedding model shared by every vector store created or reopened below.
# NOTE(review): presumably needs a reachable Ollama server with the
# "nomic-embed-text" model already pulled - confirm in the setup notes.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

In [None]:
# Build a Chroma vector store from the chunks. No persist_directory is passed
# here; this instance only serves the quick retrieval checks in the next cells.
from langchain_chroma import Chroma
db = Chroma.from_documents(docs, embeddings)

In [None]:
# Sample question for a quick retrieval check; no `k` is passed, so the
# store's default number of results applies.
query = "what will happen if i fail to fill itr"
matching_docs = db.similarity_search(query)

In [None]:
# Display the documents retrieved in the previous cell
matching_docs

In [None]:
# Wrap the store as a retriever configured for similarity search with k=4
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [None]:
# Display the retriever object
retriever

In [None]:
import hashlib

# On-disk location of the persisted vector store (relative to the working directory)
persist_directory = "../model/taxadvisordb_v1-0"

# Without explicit ids every run stores the chunks under fresh random ids, so
# re-running this cell against an existing persist_directory would add a second
# copy of each chunk. Ids derived from the chunk's source and text make the
# write repeatable: the same chunk always maps to the same id.
chunks_by_id = {}
for chunk in docs:
    fingerprint = f"{chunk.metadata.get('source', '')}\n{chunk.page_content}"
    chunk_id = hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()
    # Ids must be unique within one batch; exact duplicates collapse to one entry.
    chunks_by_id.setdefault(chunk_id, chunk)

vectordb = Chroma.from_documents(
    list(chunks_by_id.values()),
    embeddings,
    ids=list(chunks_by_id),
    persist_directory=persist_directory,
)

#.

In [None]:
# Reopen the store from persist_directory with the same embedding function,
# to check that the persisted data can be loaded independently of `vectordb`
new_db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

In [None]:
# Query the reopened store for the top 3 matches together with their scores.
# Reuses `query` as defined in an earlier cell.
matching_docs = new_db.similarity_search_with_score(query,k=3)

matching_docs


In [None]:
# End-to-end check against the reopened store: fetch only the single best
# match (k=1) and print its text
query = "what will happen if i fail to fill itr"
similar_docs = new_db.similarity_search(query, k=1)

for doc in similar_docs:
    print(doc.page_content)
