In [8]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting absl-py (from rouge-score)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Using cached absl_py-2.1.0-py3-none-any.whl (133 kB)
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (pyproject.toml): started
  Building wheel for rouge-score (pyproject.toml): finished with status 'done'
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=25026 sha256=c88203b01d2a21aa853cc05e2613c19b065f761a9b6fe2b28930854dd89e3bc7
  Stored in directory: c:\users\tuf\appdata\local\pip\cache\wheels\85\9d\af\01feefbe7d55ef54687

In [9]:
from sklearn.metrics import accuracy_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import numpy as np

In [2]:
# Evaluation set: each pair is (user query, reference answer).
_QUERY_SOLUTION_PAIRS = (
    (
        "Can failing to submit an income tax return lead to prosecution?",
        "Yes, failing to submit an income tax return can lead to prosecution under the Income Tax Act.",
    ),
    (
        "What are the income tax rules for a salaried employee having salary less than 10 lakhs?",
        "Salaried employees earning less than 10 lakhs can avail of tax deductions under Sections 80C, 80D, and other applicable sections.",
    ),
    (
        "Can I claim tax benefits on two home loans for the same property?",
        "Yes, you can claim tax benefits on both home loans, provided they meet the conditions specified under Section 24 and Section 80C of the Income Tax Act.",
    ),
)

# Same records as dicts, in the shape consumed by the evaluation code.
test_data = [
    {"query": query, "solution": solution}
    for query, solution in _QUERY_SOLUTION_PAIRS
]

In [None]:
import os
from dotenv import load_dotenv
from pydantic import BaseModel
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFacePipeline
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import PromptTemplate

# Load environment variables
load_dotenv()

# Data, model and vector-store locations. Each one is read from the environment
# and falls back to the hard-coded default when the variable is not set.
# NOTE(review): the fallbacks are absolute paths on one machine's E: drive;
# set DATA_DIR / MODEL_DIR / CHROMA_DB_DIR when running anywhere else.
DATA_DIR = os.environ.get("DATA_DIR", "E:/project/tax_advisor/data")
MODEL_DIR = os.environ.get("MODEL_DIR", "E:/project/tax_advisor/model/local_model")
CHROMA_DB_DIR = os.environ.get("CHROMA_DB_DIR", "E:/project/tax_advisor/model/chroma_db")

# ✅ Load and process documents
def load_docs(directory):
    """Load the documents found under ``directory`` with ``DirectoryLoader``.

    Args:
        directory: Folder to load documents from.

    Returns:
        The result of ``DirectoryLoader(directory).load()``.
    """
    loader = DirectoryLoader(directory)
    return loader.load()


def split_docs(doc, chunk_size=512, chunk_overlap=20):
    """Split loaded documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        doc: The loaded documents to split (passed to ``split_documents``).
        chunk_size: ``chunk_size`` given to the splitter.
        chunk_overlap: ``chunk_overlap`` given to the splitter.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(doc)


# Keep the raw documents under their own name so the chunking step can be
# re-run (e.g. with other chunk settings) without reloading from disk, and so
# `docs` always means "chunks" for the cells that follow.
raw_docs = load_docs(DATA_DIR)
docs = split_docs(raw_docs)

# ✅ Initialize embeddings and vector store
# Embedding model handed to both Chroma handles below.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Index the chunks into a Chroma store persisted under CHROMA_DB_DIR; the
# collection metadata requests the "cosine" HNSW space.
# NOTE(review): this runs every time the cell executes. from_documents presumably
# adds to the already-persisted collection, which would store duplicate chunks
# on re-runs — confirm, and guard or clear the directory if so.
vectordb = Chroma.from_documents(docs, embeddings, persist_directory=CHROMA_DB_DIR, collection_metadata={"hnsw:space": "cosine"})
# Second handle opened on the same persist directory; get_similar_docs queries this one.
new_db = Chroma(persist_directory=CHROMA_DB_DIR, embedding_function=embeddings)

def get_similar_docs(query, k=1, score=False):
    """Search ``new_db`` for the ``k`` entries most similar to ``query``.

    Args:
        query: Text to search for.
        k: Number of results requested.
        score: When truthy, use ``similarity_search_with_score`` instead of
            ``similarity_search``.

    Returns:
        The result of the selected ``new_db`` search call.
    """
    if score:
        return new_db.similarity_search_with_score(query, k=k)
    return new_db.similarity_search(query, k=k)

# ✅ Check if GPU is available
# Record whether a CUDA GPU is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 8-bit (bitsandbytes) quantization to reduce the model's memory footprint.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

print("🚀 Loading Mistral-7B-Instruct model...")

# Load the model once and keep it in memory.
# Device placement is requested through `device_map` rather than a trailing
# `.to(device)`: bitsandbytes-quantized models are placed on their device while
# loading, and `.to()` on an 8-bit model is rejected by transformers releases
# that do not support moving quantized weights.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

# ✅ Optimized text generation pipeline
# Generation settings forwarded to the pipeline.
generation_settings = {
    "temperature": 0.2,
    "do_sample": True,
    "repetition_penalty": 1.1,
    "return_full_text": False,
    "max_new_tokens": 512,
}

text_generation_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

# LangChain wrapper around the Hugging Face pipeline.
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Prompt layout: context first, then the instruction and the user's question.
_TAX_PROMPT = "{context}\n\nProvide a clear and concise answer based on India's tax laws:\n{question}\n\nAnswer (India-Specific):"

prompt_template = PromptTemplate(
    template=_TAX_PROMPT,
    input_variables=["context", "question"],
)

# Combine the LLM and the prompt into a stuff-documents chain.
chain = create_stuff_documents_chain(llm, prompt_template)

def get_answer(query):
    """Answer ``query`` with the chain, using retrieved documents as context.

    The documents returned by ``get_similar_docs(query)`` are passed as
    ``"context"`` and the query itself as ``"question"``.

    Returns:
        Whatever ``chain.invoke`` returns for that input.
    """
    return chain.invoke({"context": get_similar_docs(query), "question": query})

  from .autonotebook import tqdm as notebook_tqdm
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


🚀 Loading Mistral-7B-Instruct model...


Device set to use cuda:0


In [10]:
def evaluate_system(test_data, rouge_threshold=0.5):
    """Score the RAG pipeline's answers against reference answers.

    Each query is answered with ``get_answer``; the stripped response is then
    compared with the reference using sentence-level BLEU (whitespace tokens,
    ``SmoothingFunction().method1``) and the ROUGE-L F-measure (with stemming).
    The query, generated response and reference are printed for every item.

    Args:
        test_data: Iterable of dicts holding a ``"query"`` string and a
            ``"solution"`` (reference answer) string.
        rouge_threshold: A response counts as correct for the relaxed accuracy
            when its ROUGE-L F-measure is strictly greater than this value.

    Returns:
        Tuple ``(accuracy, avg_bleu_score, avg_rouge_l_score)``: the share of
        items above ``rouge_threshold``, the mean BLEU and the mean ROUGE-L
        F-measure. All three are NaN when ``test_data`` yields no items.
    """
    smooth = SmoothingFunction().method1  # Smoothing to avoid zero BLEU score
    # Built once: the scorer does not depend on the item being evaluated.
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

    bleu_scores = []
    rouge_l_scores = []

    for item in test_data:
        query = item["query"]
        reference = item["solution"]

        # Generate response using the RAG pipeline
        generated_response = get_answer(query).strip()

        print(f"Query: {query}")
        print(f"Generated Response: {generated_response}")
        print(f"Reference: {reference}")
        print("-" * 50)

        # BLEU score (single reference, whitespace tokenisation)
        bleu = sentence_bleu([reference.split()], generated_response.split(), smoothing_function=smooth)
        bleu_scores.append(bleu)

        # ROUGE-L score
        rouge_l = scorer.score(reference, generated_response)["rougeL"].fmeasure
        rouge_l_scores.append(rouge_l)

    if not rouge_l_scores:
        # Nothing was evaluated: return NaN explicitly instead of letting
        # np.mean([]) produce it together with a RuntimeWarning.
        nan = float("nan")
        return nan, nan, nan

    # Calculate relaxed accuracy using ROUGE-L
    accuracy = np.mean([1 if score > rouge_threshold else 0 for score in rouge_l_scores])

    avg_bleu_score = np.mean(bleu_scores)
    avg_rouge_l_score = np.mean(rouge_l_scores)

    return accuracy, avg_bleu_score, avg_rouge_l_score

In [11]:
# Run the evaluation over test_data; evaluate_system calls the RAG pipeline once
# per query and prints each query/response/reference as it goes.
accuracy, avg_bleu_score, avg_rouge_l_score = evaluate_system(test_data)

Query: Can failing to submit an income tax return lead to prosecution?
Generated Response: Yes, failing to submit an income tax return can lead to prosecution under certain circumstances as provided in section 276CC of the Income Tax Act, 1961. However, for initiating prosecution, the following conditions must be satisfied:

1. The total tax payable after deducting TDS should exceed Rs 1 lakh in the relevant assessment year.
2. The assessee has failed to furnish the return of income for three consecutive assessment years.
3. The assessee has also failed to pay the advance tax due during these three years.
4. The amount of unpaid advance tax should be less than Rs 10,000 during each of these three years.

If all these conditions are met, then the Assessing Officer may initiate prosecution against the assessee under section 276CC of the IT Act, 1961. It is essential to note that failure to file returns alone does not attract criminal proceedings under this section. The conditions mention

In [12]:
# Report the three aggregate generation metrics, one per line.
print(
    f"Accuracy: {accuracy:.4f}",
    f"BLEU Score: {avg_bleu_score:.4f}",
    f"ROUGE-L Score: {avg_rouge_l_score:.4f}",
    sep="\n",
)

Accuracy: 0.0000
BLEU Score: 0.0455
ROUGE-L Score: 0.1783


In [6]:
from sklearn.metrics import precision_score, recall_score, f1_score
from langchain_ollama import OllamaEmbeddings
import numpy as np

In [7]:
# Embedding model used by the retrieval-metric loop further down.
# NOTE(review): this rebinds `embeddings`, which an earlier cell assigned to a
# HuggingFaceEmbeddings instance; a distinct name would avoid mixing the two up.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

In [8]:
# The two evaluation queries, named once so both mappings share identical keys.
ITR_NON_FILING_QUERY = "what will happen if i fail to fill itr"
AADHAAR_QUERY = "is aadhar necessary for filing itr"

# Reference answer for each query.
ground_truth = {
    ITR_NON_FILING_QUERY: "Failure to file ITR may lead to penalties and prosecution.",
    AADHAAR_QUERY: "Aadhaar is mandatory for filing ITR in most cases unless exempted.",
}

# Retrieved documents from your vector database (simulate retrieval)
retrieved_docs = {
    ITR_NON_FILING_QUERY: [
        "Not filing ITR can attract penalties.",
        "You may face legal consequences for not filing ITR.",
    ],
    AADHAAR_QUERY: [
        "Aadhaar is usually required to file ITR.",
        "Without Aadhaar, ITR filing may not be possible.",
    ],
}

In [9]:
# Per-query metric accumulators (five independent empty lists).
precision_list, recall_list, f1_list, mrr_list, cosine_similarities = ([] for _ in range(5))

In [10]:
# A retrieved document counts as "relevant" when its cosine similarity to the
# reference answer is strictly greater than this value.
RELEVANCE_THRESHOLD = 0.75

# (Re)initialise the accumulators in the same cell as the loop so that running
# this cell again does not append a second copy of every score.
precision_list, recall_list, f1_list, mrr_list, cosine_similarities = [], [], [], [], []

for query, correct_answer in ground_truth.items():
    retrieved = retrieved_docs.get(query, [])
    if not retrieved:
        continue  # Skip if no retrieval

    # Convert text to embeddings
    correct_embedding = embeddings.embed_query(correct_answer)
    retrieved_embeddings = [embeddings.embed_query(doc) for doc in retrieved]

    # Cosine similarity between the reference answer and each retrieved document
    correct_norm = np.linalg.norm(correct_embedding)  # loop-invariant, computed once
    similarities = [
        np.dot(correct_embedding, emb) / (correct_norm * np.linalg.norm(emb))
        for emb in retrieved_embeddings
    ]
    cosine_similarities.append(np.mean(similarities))

    # Reciprocal rank of the first relevant document when ranked by similarity.
    # Because the ranking key and the relevance test are the same score, this is
    # 1 when any document passes the threshold and 0 otherwise.
    sorted_indices = np.argsort(similarities)[::-1]  # Sort in descending order
    for rank, idx in enumerate(sorted_indices, start=1):
        if similarities[idx] > RELEVANCE_THRESHOLD:
            mrr_list.append(1 / rank)
            break
    else:
        mrr_list.append(0)  # No relevant document found

    # Precision, Recall, F1-Score (binary relevance via RELEVANCE_THRESHOLD).
    # NOTE(review): y_true labels every retrieved document as relevant, so there
    # can be no false positives: precision is 1.0 whenever any document passes
    # the threshold, and recall is just the share of documents that pass.
    # Real relevance labels are needed for these to measure retrieval quality.
    y_true = [1] * len(retrieved)
    y_pred = [1 if sim > RELEVANCE_THRESHOLD else 0 for sim in similarities]

    precision_list.append(precision_score(y_true, y_pred, zero_division=0))
    recall_list.append(recall_score(y_true, y_pred, zero_division=0))
    f1_list.append(f1_score(y_true, y_pred, zero_division=0))

In [11]:
# Report the mean of each retrieval metric across queries, one per line.
print(
    f"Precision: {np.mean(precision_list):.4f}",
    f"Recall: {np.mean(recall_list):.4f}",
    f"F1-Score: {np.mean(f1_list):.4f}",
    f"MRR: {np.mean(mrr_list):.4f}",
    f"Cosine Similarity: {np.mean(cosine_similarities):.4f}",
    sep="\n",
)

Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
MRR: 1.0000
Cosine Similarity: 0.9122
