In [1]:
import numpy as np
import pandas as pd
from faker import Faker
import random

fake = Faker()
np.random.seed(42)
random.seed(42)

num_users = 10

def generate_financial_data(num_users):
    data = {
        'User_ID': [i for i in range(1, num_users + 1)],
        'Income': np.random.uniform(200000, 2000000, num_users).round(2),
        'Expenses': np.random.uniform(10000, 70000, num_users).round(2),
        'HealthInsurance': np.random.uniform(0, 40000, num_users).round(2),
        'HomeLoan': np.random.uniform(0, 10000000, num_users).round(2),
        'ELSS': np.random.uniform(0, 120000, num_users).round(2),
        'NPS': np.random.uniform(0, 120000, num_users).round(2),
        'PPF': np.random.uniform(0, 120000, num_users).round(2),
        'HouseRent': np.random.uniform(0, 500000, num_users).round(2),
        'Previous_Tax_Amount': np.random.uniform(0, 600000, num_users).round(2),
        'State': [fake.state_abbr() for _ in range(num_users)],
        'Filing_Status': [random.choice(['Single', 'Married', 'Head of Household']) for _ in range(num_users)],
        'Tax_Credits': np.random.uniform(0, 150000, num_users).round(2)
    }

    for column in ['HealthInsurance', 'HomeLoan', 'ELSS', 'NPS', 'PPF', 'HouseRent']:
        data[column] = [value if random.random() > 0.5 else 0 for value in data[column]]

    df = pd.DataFrame(data)
    return df

financial_data = generate_financial_data(num_users)
financial_data

Unnamed: 0,User_ID,Income,Expenses,HealthInsurance,HomeLoan,ELSS,NPS,PPF,HouseRent,Previous_Tax_Amount,State,Filing_Status,Tax_Credits
0,1,874172.21,11235.07,24474.12,6075448.52,0.0,116350.16,0.0,386122.38,517862.06,AR,Head of Household,17939.14
1,2,1911285.75,68194.59,5579.75,0.0,0.0,93015.94,0.0,0.0,373978.88,NV,Single,106986.72
2,3,1517589.1,59946.56,0.0,650515.93,0.0,112739.87,0.0,2761.06,198538.81,MO,Single,114117.76
3,4,1277585.27,22740.35,0.0,9488855.37,109118.45,107379.28,42810.4,0.0,38135.01,VT,Head of Household,84191.58
4,5,480833.55,20909.5,0.0,0.0,31053.6,71748.0,0.0,0.0,186589.39,GA,Married,115645.08
5,6,480790.14,21004.27,0.0,8083973.48,79502.67,110624.91,0.0,364503.58,195109.99,PA,Single,74069.34
6,7,304550.5,28254.53,7986.95,3046137.69,37405.33,0.0,0.0,385635.17,437763.71,AL,Single,78409.92
7,8,1759117.06,41485.39,0.0,0.0,62408.16,0.0,0.0,37022.33,382534.48,MA,Single,64131.15
8,9,1282007.02,35916.7,0.0,0.0,65605.23,0.0,8946.08,179232.86,532327.65,UT,Head of Household,3812.87
9,10,1474530.64,27473.75,1858.02,4401524.94,0.0,0.0,118426.43,57934.53,283328.96,CA,Single,16183.71


In [2]:
import pandas as pd

def generate_tax_regulations():
    tax_brackets = ['0% - 0 to 3,00,000', '5% - 3,00,001 to 7,00,000', '10% - 7,00,001 to 10,00,000',
                    '15% - 10,00,001 to 12,00,000', '20% - 12,00,001 to 15,00,000', '30% - 15,00,001 and above']
    standard_deductions = 75000
    tax_credits = [30000, 60000, 90000, 120000,150000,170000]

    regulations = {
        'Tax_Bracket': tax_brackets,
        'Standard_Deductions': standard_deductions,
        'Tax_Credits': tax_credits
    }
    df = pd.DataFrame(regulations)
    return df

tax_regulations = generate_tax_regulations()
tax_regulations

Unnamed: 0,Tax_Bracket,Standard_Deductions,Tax_Credits
0,"0% - 0 to 3,00,000",75000,30000
1,"5% - 3,00,001 to 7,00,000",75000,60000
2,"10% - 7,00,001 to 10,00,000",75000,90000
3,"15% - 10,00,001 to 12,00,000",75000,120000
4,"20% - 12,00,001 to 15,00,000",75000,150000
5,"30% - 15,00,001 and above",75000,170000


In [3]:
# Apply tax regulations to the financial data
def apply_tax_regulations(financial_df, regulations_df):
    # Simplified model for applying tax brackets and deductions
    def calculate_tax(user_income, deductions, standard_deductions):
        # Determine tax rate based on income
        if user_income <= 300000:
            tax_rate = 0.0
        elif user_income <= 700000:
            tax_rate = 0.05
        elif user_income <= 1000000:
            tax_rate = 0.1
        elif user_income <= 1200000:
            tax_rate = 0.15
        elif user_income <= 1500000:
            tax_rate = 0.20
        else:
            tax_rate = 0.30

        # Assuming standard deduction applies regardless of filing status
        standard_deduction = standard_deductions
        taxable_income = max(user_income - deductions - standard_deduction, 0)
        return taxable_income * tax_rate

    # Assuming we use the first row of the regulations_df for simplicity
    standard_deductions = regulations_df['Standard_Deductions'].iloc[0]

    # Calculate estimated tax for each user
    financial_df['Estimated_Tax'] = financial_df.apply(
        lambda row: calculate_tax(row['Income'], row[['HealthInsurance', 'HomeLoan', 'ELSS', 'NPS', 'PPF', 'HouseRent']].sum(), standard_deductions),
        axis=1
    )
    return financial_df

# Generate fake financial data
num_users = 1000
financial_data = generate_financial_data(num_users)

# Apply tax regulations to the financial data
financial_data_with_taxes = apply_tax_regulations(financial_data, tax_regulations)
financial_data_with_taxes.head()

Unnamed: 0,User_ID,Income,Expenses,HealthInsurance,HomeLoan,ELSS,NPS,PPF,HouseRent,Previous_Tax_Amount,State,Filing_Status,Tax_Credits,Estimated_Tax
0,1,256572.53,13470.56,36603.6,926254.83,75765.04,53889.42,0.0,0.0,489736.71,OK,Single,86460.83,0.0
1,2,1345538.74,68146.16,0.0,0.0,0.0,0.0,86722.03,100699.55,592957.27,NV,Single,118800.47,216623.432
2,3,765840.77,63027.15,6318.19,9145486.64,65777.42,68508.85,38307.04,0.0,233310.9,IL,Single,48647.05,0.0
3,4,1115427.24,65665.14,27835.96,0.0,0.0,39891.2,108709.4,0.0,28449.38,IN,Married,107424.68,129598.602
4,5,1833619.65,69694.47,0.0,2587116.42,0.0,0.0,0.0,0.0,480702.9,MN,Married,20312.83,0.0


In [5]:
import pandas as pd
from langchain.docstore.document import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Prepare documents for LangChain
documents = []
for _, row in financial_data_with_taxes.iterrows():
    content = (f"User_ID: {row['User_ID']}, Income: {row['Income']}, Expenses: {row['Expenses']}, "
               f"HealthInsurance: {row['HealthInsurance']}, HomeLoan: {row['HomeLoan']}, "
               f"ELSS: {row['ELSS']}, NPS: {row['NPS']}, PPF: {row['PPF']}, HouseRent: {row['HouseRent']}, "
               f"Previous_Tax_Amount: {row['Previous_Tax_Amount']}, State: {row['State']}, "
               f"Filing_Status: {row['Filing_Status']}, Tax_Credits: {row['Tax_Credits']}, "
               f"Estimated_Tax: {row['Estimated_Tax']}")

    documents.append(Document(page_content=content))

In [6]:
documents=documents[:100]

In [7]:
hg_embeddings = HuggingFaceEmbeddings()
persist_directory = '/content/'

langchain_chroma = Chroma.from_documents(
    documents=documents,
    collection_name="financial_data",
    embedding=hg_embeddings,
    persist_directory=persist_directory
)

  hg_embeddings = HuggingFaceEmbeddings()
  hg_embeddings = HuggingFaceEmbeddings()
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
from torch import cuda, bfloat16, float16
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from langchain.llms import HuggingFacePipeline
from time import time

model_id = 'HuggingFaceH4/zephyr-7b-beta'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [9]:
query_pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=float16,
    max_new_tokens=500,
    device_map="auto",
)
llm = HuggingFacePipeline(pipeline=query_pipeline)

  llm = HuggingFacePipeline(pipeline=query_pipeline)


In [10]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

template = """
Based on the following financial data and tax regulations, analyze and provide personalized tax-saving recommendations:
Financial Data: {question}
Context: {context}
Answer:
"""
PROMPT = PromptTemplate(input_variables=["context", "query"], template=template)

# Set up retriever
retriever = langchain_chroma.as_retriever(search_kwargs={"k": 5})

# Function to remove duplicates from retrieved documents
def remove_duplicates(documents):
    seen = set()
    unique_docs = []
    for doc in documents:
        if doc.page_content not in seen:
            unique_docs.append(doc)
            seen.add(doc.page_content)
    return unique_docs

# Set up the QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm, retriever=retriever, chain_type_kwargs={"prompt": PROMPT}
)

def get_tax_optimization_recommendations(query):
    # Retrieve documents
    raw_docs = retriever.get_relevant_documents(query)

    # Remove duplicates
    unique_docs = remove_duplicates(raw_docs)

    # Prepare the context for the prompt
    context = " ".join([doc.page_content for doc in unique_docs])

    # Use the QA chain to get the response
    result = qa_chain({"context": context, "query": query})
    return result

# Example query
query = "Analyze - User_ID: 317, Income: 65185.29, Expenses: 6770.46, HealthInsurance: 1921.03, HomeLoan: 0.0, ELSS: 0.0, NPS: 1767.37, PPF: 1927.76, HouseRent: 3657.13, Previous_Tax_Amount: 15957.37, State: VI, Filing_Status: Head of Household, Tax_Credits: 2990.91, Estimated_Tax: 9660.64"
response = get_tax_optimization_recommendations(query)

  raw_docs = retriever.get_relevant_documents(query)
  result = qa_chain({"context": context, "query": query})


In [11]:
print(response['result'])


Based on the following financial data and tax regulations, analyze and provide personalized tax-saving recommendations:
Financial Data: Analyze - User_ID: 317, Income: 65185.29, Expenses: 6770.46, HealthInsurance: 1921.03, HomeLoan: 0.0, ELSS: 0.0, NPS: 1767.37, PPF: 1927.76, HouseRent: 3657.13, Previous_Tax_Amount: 15957.37, State: VI, Filing_Status: Head of Household, Tax_Credits: 2990.91, Estimated_Tax: 9660.64
Context: User_ID: 79, Income: 1886113.98, Expenses: 55661.67, HealthInsurance: 0.0, HomeLoan: 0.0, ELSS: 0.0, NPS: 64603.57, PPF: 0.0, HouseRent: 105264.24, Previous_Tax_Amount: 24195.14, State: IN, Filing_Status: Head of Household, Tax_Credits: 84468.99, Estimated_Tax: 492373.85099999997

User_ID: 40, Income: 1949207.75, Expenses: 45544.47, HealthInsurance: 0.0, HomeLoan: 684587.19, ELSS: 0.0, NPS: 111908.01, PPF: 96219.11, HouseRent: 0.0, Previous_Tax_Amount: 172727.7, State: AS, Filing_Status: Head of Household, Tax_Credits: 92603.14, Estimated_Tax: 294448.03199999995

Us