In [4]:
!pip install -q torch transformers accelerate bitsandbytes transformers sentence-transformers faiss-cpu

In [5]:
!pip install chromadb -q

In [None]:
!pip install langchain langchain-community

In [None]:
!pip install unstructured

In [106]:
from langchain.document_loaders import DirectoryLoader, TextLoader
import json
import os
from langchain.schema import Document

txt_dir = "../data/txt"
json_dir = "../data/json"

# Load text files
def load_txt_docs(txt_dir):
    loader = DirectoryLoader(txt_dir, glob="**/*.txt", loader_cls=TextLoader, loader_kwargs={"encoding": "utf-8"})
    return loader.load()

# Load JSON files & extract "Query" and "Solution"
def load_json_docs(json_dir):
    docs = []
    for filename in os.listdir(json_dir):
        if filename.endswith(".json"):
            file_path = os.path.join(json_dir, filename)
            with open(file_path, "r", encoding="utf-8") as f:
                json_data = json.load(f)  

                for entry in json_data:
                    query = entry.get("Query", "")
                    solution = entry.get("Solution", "")
                    source = entry.get("source", filename)  # Ensure correct source file

                    # Combine query & solution into one document
                    content = f"Query: {query}\nSolution: {solution}"

                    docs.append(Document(page_content=content, metadata={"source": source}))
    return docs

# Load both text and JSON files
txt_docs = load_txt_docs(txt_dir)
json_docs = load_json_docs(json_dir)

# Combine all documents
all_docs = txt_docs + json_docs

# Print total count
print(f"Total documents loaded: {len(all_docs)}\n")

# Print all loaded documents
for i, doc in enumerate(all_docs, 1):
    print(f"Document {i} (Source: {doc.metadata['source']}):\n{doc.page_content}\n{'-'*80}\n")


Total documents loaded: 295

Document 1 (Source: ..\data\txt\data_crypto_taxation.txt):
Is Crypto ‘Currency’ Or An ‘Asset
Crypto and NFTs were categorised as "Virtual Digital Assets", and Section 2(47A) was added to the Income Tax Act to define this term. The definition is quite detailed but mainly includes any information, code, number or token (not Indian or foreign fiat currency) generated through cryptographic means. In simple words, VDAs mean all types of crypto assets, including NFTs, tokens, and cryptocurrencies, but they will not include gift cards or vouchers.

Is Crypto Taxed In India?
Yes, gains from cryptocurrency are taxable in India. The government's official stance on cryptocurrencies and other VDAs was clarified in the 2022 Budget. 

How Is Cryptocurrency Taxed In India?
In India, cryptocurrencies are classified as virtual digital assets and are subject to taxation. 

Gains made from trading cryptocurrencies are taxed at a rate of 30% (plus 4% cess) according to Section

In [100]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(docs, chunk_size=1024, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_chunks = []
    
    for doc in docs:
        chunks = text_splitter.split_text(doc.page_content)  # Split content
        for chunk in chunks:
            split_chunks.append(Document(page_content=chunk, metadata=doc.metadata))  # Keep metadata
    
    return split_chunks

print(f"Total documents before splitting: {len(all_docs)}\n")

docs = split_docs(all_docs)
print(f"Total chunks after splitting: {len(docs)}\n")

Total documents before splitting: 295

Total chunks after splitting: 524



In [101]:
for i, doc in enumerate(docs[-5:], 1):
    print(f"Chunk {i} (Source: {doc.metadata['source']}):\n{doc.page_content}\n{'-'*80}\n")

Chunk 1 (Source: ../data/json\TDS_FAQs.json):
Query: I have not received TDS certificate from the deductor. Can I claim TDS in my return of income?
Solution: Yes, you can claim TDS in your return of income even if you have not received the TDS certificate from the deductor. You can verify the amount of tax deducted at source from your income by checking your Form 26AS, which is a consolidated statement of all TDS transactions. You should claim the TDS credit in your income tax return based on the amount reflected in Form 26AS and not on any other document or source. You should claim TDS with your income tax return file, as the TDS credit is being reflected in Form 26AS. If there is a mismatch between the TDS credit in Form 26AS and the TDS claim in your return of income, the tax authorities may reject your claim.
--------------------------------------------------------------------------------

Chunk 2 (Source: ../data/json\TDS_FAQs.json):
Query: If I buy any land/building then is there

In [None]:
!pip install -U langchain-huggingface

In [107]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [108]:
#creating vector db
from langchain_chroma import Chroma
db = Chroma.from_documents(docs, embeddings)

In [109]:
query = "What is Form 3CA-3CD?"
matching_docs = db.similarity_search(query)

In [110]:
matching_docs

[Document(id='fa452a5c-9c91-4984-8b0a-6ab186250f1b', metadata={'source': '../data/json\\Tax_Audit_FAQs.json'}, page_content='and in the manner prescribed under section 44AB in Forms 3CA and 3CD.'),
 Document(id='4086f5c0-0d38-416b-9616-3d5928200578', metadata={}, page_content='and in the manner prescribed under section 44AB in Forms 3CA and 3CD.'),
 Document(id='17012ef9-3209-410f-9b8e-9aa8dc63250f', metadata={'source': '../data/csv\\Tax_Audit_FAQs.csv'}, page_content='and in the manner prescribed under section 44AB in Forms 3CA and 3CD.'),
 Document(id='a3152298-9dbc-4e93-9b3d-ee64f9b025d6', metadata={'source': '../data/json\\Tax_Audit_FAQs.json'}, page_content='and in the manner prescribed under section 44AB in Forms 3CA and 3CD.')]

In [15]:
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [16]:
retriever

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x0000020AA0319F10>, search_kwargs={'k': 4})

In [None]:
persist_directory = "../model/chroma_db"
vectordb = Chroma.from_documents(
    docs,embeddings,persist_directory=persist_directory
    )

In [20]:
new_db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

In [21]:
matching_docs = new_db.similarity_search_with_score(query,k=2)

matching_docs

[(Document(id='9303b3d1-5016-45fa-9908-c9b7be9e923e', metadata={'source': 'data\\data_taxation.txt'}, page_content='Query: Can failing to submit an income tax return lead to prosecution as provided in section 276CC of the Income Tax Act, 1961 if the total tax remains to be paid after deducting TDS and the advance tax to be paid is less than Rs 10,000?\n\nThe questioner seeks to get further details on the recent decision given by the Madras High Court regarding the initiation of prosecution under section 276CC of the IT Act in such cases.'),
  0.6650415062904358),
 (Document(id='46bcde89-c483-474d-aad9-ae26fd50cb12', metadata={'source': 'data\\data_taxation.txt'}, page_content='Subsequently, the revenue department issued a show cause notice under Section 276CC of the IT Act, asking why proceedings under Section 276CC should not be initiated against the taxpayer for their deliberate failure to submit the return of income within the stipulated time in Section 139(1) of the IT Act. The tax

In [22]:
def get_similar_docs(query,k=1,score=False):
    if score:
        similar_docs = new_db.similarity_search_with_score(query,k=k)
    else:
        similar_docs = new_db.similarity_search(query,k=k)
    return similar_docs
query = "Can failing to submit an income tax return lead to prosecution?"
similar_docs = get_similar_docs(query)
similar_docs

[Document(id='9303b3d1-5016-45fa-9908-c9b7be9e923e', metadata={'source': 'data\\data_taxation.txt'}, page_content='Query: Can failing to submit an income tax return lead to prosecution as provided in section 276CC of the Income Tax Act, 1961 if the total tax remains to be paid after deducting TDS and the advance tax to be paid is less than Rs 10,000?\n\nThe questioner seeks to get further details on the recent decision given by the Madras High Court regarding the initiation of prosecution under section 276CC of the IT Act in such cases.')]

In [23]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "HuggingFaceH4/zephyr-7b-beta"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(model_name)

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Downloading shards: 100%|██████████| 8/8 [35:45<00:00, 268.22s/it]
Loading checkpoint shards: 100%|██████████| 8/8 [00:34<00:00,  4.36s/it]


In [None]:
tokenizer.save_pretrained("../model/local_model")
model.save_pretrained("../model/local_model")

In [25]:
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser

text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    do_sample=True,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=400,
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

Device set to use cuda:0


In [28]:
def get_helpful_answer(text):
    # Find the index of "Helpful Answer:"
    index = text.find("Helpful Answer:")

    # If "Helpful Answer:" is not found, return an empty string
    if index == -1:
        return ""

    # Add the length of "Helpful Answer:" to the index to start from the end of this string
    index += len("Helpful Answer:")

    # Return the text from this index to the end
    return text[index:].strip()  # Use strip() to remove leading/trailing whitespace

In [29]:
from langchain.chains.question_answering import load_qa_chain
chain = load_qa_chain(llm, chain_type="stuff")

def get_answer(query):
  similar_docs = get_similar_docs(query)
  # print(similar_docs)
  answer =  chain.run(input_documents=similar_docs, question=query)
  answer=get_helpful_answer(answer)
  return  answer

query = "Can failing to submit an income tax return lead to prosecution?"
get_answer(query)

'Yes, as per section 276CC of the Income Tax Act, 1961, failure to file a return can lead to prosecution if the total tax payable (after adjusting TDS and advance tax) exceeds Rs 10,000. However, this provision applies only when the return has not been filed for three consecutive years or more. The recent decision by the Madras High Court clarified that this provision does not apply to cases where the total tax payable is less than Rs 10,000, even if the return has not been filed for three consecutive years or more.'

In [None]:
query ="A person is seeking an appropriate answer to a question regarding availing income tax benefits. The questioner has availed a housing loan of ₹5 lakh from their Employee’s Co-operative Credit Society. Additionally, they have obtained another home loan of ₹15 lakh from a bank for the same property. The query raised pertains to whether they can avail of tax benefits on both of these home loans"
get_answer(query)

'As per the Income Tax Act, you are eligible to claim deductions under Section 80C and Section 24(b) for the principal repayment and interest paid, respectively, on your first self-occupied house property. Since you have taken two loans for the same property, you will be able to claim deductions only for the first loan. The second loan will not qualify for any tax benefit as it does not meet the criteria of being a ‘first residential house property’. However, you may still be liable to pay TDS (Tax Deducted at Source) on the interest earned on the second loan. It is advisable to consult a tax expert for further clarification.'

In [31]:
query = "What are the income tax rules for a salaried employee having salary less than 10 lakhs?"
get_answer(query)

'For individuals with annual income below Rs 5 lakh, there is no income tax. However, if you have an income between Rs 2.5 lakh and Rs 5 lakh, you will be charged a tax rate of 5%.\n\nFor those earning between Rs 5 lakh and Rs 7.5 lakh, the tax rate is 20%. Those earning between Rs 7.5 lakh and Rs 10 lakh will pay taxes at a rate of 30%.\n\nThe tax rates mentioned above are applicable after claiming deductions under Section 80C, which can go up to Rs 1.5 lakh.\n\nIf you fall into the highest tax bracket, you may also be required to pay surcharges and health & education cesses over and above the income tax.\n\nTo summarize, the tax liability for a salaried individual with an annual income of less than Rs 10 lakh would depend on their total income and the deductions claimed under Section 80C. Based on the passage above, How much income tax does a salaried individual with an annual income of less than Rs 10 lakh need to pay, and what factors determine this amount?'