In [3]:
import os
import pandas as pd

In [4]:
data = pd.read_excel('data/SampleQuestions.xlsx')

In [5]:
data.head(2)

Unnamed: 0,Question,Ideal Answer
0,What are the documents required to apply for t...,If you have Aadhaar card\nNo other document is...
1,What is the cost/fees of a PAN card?,The cost of applying for a new PAN card is Rs ...


### Langchain dataLoader

In [6]:
from langchain.document_loaders import TextLoader

loader = TextLoader("data/data.txt")
doc = loader.load()

### Splittng the text into multiple paragraphs such that each question consist of single para

In [7]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator = "\n\n",
    chunk_size = 1000,
    chunk_overlap  = 200,
    length_function = len,
)
texts = text_splitter.split_documents(doc)
len(texts)
for i in texts[:3]:
    print(i)
    print("**"*60)
    print('\n\n')

page_content='# About Pan Card\n\n### What is Pan card?\n\nThe PAN card is a unique ten-digit alphanumeric identification number that is issued by the Income Tax Department of India to track the tax-related transactions of individuals and entities. The PAN card is mandatory for any financial transaction in India, including opening a bank account, buying or selling property, and filing income tax returns.\n\n### Who needs a Pan card?\n\nAll individuals/non-individuals (including foreign citizens/entities) earning taxable income in India\xa0must have a PAN card.\n\n### Types of PAN cards\n\nIn India, two types of PAN cards are available: e-PAN card and physical PAN card.' metadata={'source': 'data/data.txt'}
************************************************************************************************************************



page_content="### Types of PAN cards\n\nIn India, two types of PAN cards are available: e-PAN card and physical PAN card.\n\n1. e-PAN card: An e-PAN card is a d

### Converting each para into embeddings 
1. Sentence transformers [384] dimensions
2. Open AI [1536] dimensions

In [8]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.embeddings import OpenAIEmbeddings

# embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings = OpenAIEmbeddings()

#### Storing these Embeddings in Pinecone for fast retrieval 
Explored:
1. Pinecone
2. Faiss

In [9]:
import pinecone
from langchain.vectorstores import Pinecone

pinecone.init(
    api_key=os.environ.get('PINE_KEY'),
    environment='us-west4-gcp-free'
)

index_name = 'openai'

index = Pinecone.from_existing_index(index_name,embedding=embeddings)

  from tqdm.autonotebook import tqdm


#### Getting context from question

In [10]:
def get_similiar_docs(query,k=1,score=False):
    if score:
        similar_docs = index.similarity_search_with_score(query,k=k)
    else:
        similar_docs = index.similarity_search(query,k=k)
    return similar_docs

In [11]:
question="What are the charges for pan card details correction?"

In [12]:
document = get_similiar_docs(question,1)[0]
ctx=document.page_content

In [13]:
document

Document(page_content='### Cost of new PAN card\n\nThe PAN CARD Application through ABC costs Rs 2500 for E-PAN, and if you want it to be couriered, it will cost Rs 1200 extra for physical delivery to your address.\n\n- e-PAN Card cost: INR 2500\n- Physical PAN Card cost: INR 3700\n\n### Time required to issue PAN card\n\n****If you have Aadhaar card****\n\nYou can get a Pan Card instantly\xa0**(in under 10 minutes)**, if you have an Aadhaar card. You can apply through ABC.\n\n********************************************************************If you don’t have an Aadhaar card********************************************************************\n\nOnce the payment is made to ABC, we will contact you and initiate the process. Pan card will be issued in 3 weeks.\n\n## Updation/Correction in the PAN Card\n\n### Information that can be updated in the PAN Card\n\n- Your name\n- Father’s name\n- Date of Birth\n- Citizenship\n- Photograph\n- Signature\n- Gender\n- Address\n- Contact details\n\

In [14]:
from langchain import PromptTemplate

question_prompt_template = """
                    Answer eloboratively the question using the provided context. \n\n
                    Context: \n {context} \n
                    Question: \n {question} \n
                    Answer:
                    """



question_prompt = PromptTemplate(
    template=question_prompt_template, input_variables=["context", "question"]
)

In [15]:
question="What are the charges for pan card details correction?"

### Defining LLM model

In [16]:
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
llm = Llama(model_path="ggml-alpaca-7b-q4.bin")

llama.cpp: loading model from ggml-alpaca-7b-q4.bin
llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this
llama_model_load_internal: format     = 'ggml' (old version with low tokenizer quality and no mmap support)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size = 4113739.11 KB
llama_model_load_internal: mem required  = 5809.32 MB (+ 1026.00 MB per state)
...................................................................................................
.
llama_init_

In [17]:
def localLLM(context,question):
    ins = '''Answer eloboratively the question using the provided context. \n\n 
    Context:{0}

    Question:  {1}?
    '''
    instruction = ins.format(context[0][:800] , question)
    result=""
    for x in llm(instruction, stream=True, temperature=0.1, top_p=0.95, top_k=40):
        result += x['choices'][0]['text']

    return result

### Final RAG for answering 

In [18]:
def RAG(question):
    document = get_similiar_docs(question,1)[0]
    ctx=document.page_content
    return localLLM(ctx,question)


In [19]:
def sampleQuestion(idx):
    question = data.Question.values[idx]+'?'
    answer = RAG(question)
    print(question)
    print('\n')
    print(answer)
    return answer

## Output

In [20]:
sampleQuestion(0)

What are the documents required to apply for the new pan?


Answer: The documents required to apply for a new PAN card include proof of identity, proof of address and two passport-size photographs.


'Answer: The documents required to apply for a new PAN card include proof of identity, proof of address and two passport-size photographs.'

In [21]:
sampleQuestion(2)

Can I take the delivery of Pan card at Indian address?


Answer: Yes, you can take the delivery of your Pan card at an Indian address.


'Answer: Yes, you can take the delivery of your Pan card at an Indian address.'

In [46]:
sampleQuestion(3)

How long does it usually take to receive the PAN card after applying??


Answer:It usually takes about 10-15 days to receive the PASS card after applying.


In [47]:
sampleQuestion(14)

Why do NRIs need PAN card??


Answer:NRIs need a PAN card to fulfill their income tax obligations in India, as well as to open bank accounts and invest in Indian financial instruments. It also serves as an identity and address proof for various official purposes.


In [22]:
results=[]
for i in range(len(data)):
    results.append(sampleQuestion(i))

What are the documents required to apply for the new pan?


Answer: The documents required to apply for a new PAN card are an application in the prescribed form, proof of identity such as passport or driver's license, proof of address such as bank statement or utility bill, and a recent photograph.
What is the cost/fees of a PAN card??


Answer: The cost/fees for a PAN card varies depending on the type of card and the processing charges. Generally, the cost includes a one-time application fee, an annual maintenance fee, and other miscellaneous charges.
Can I take the delivery of Pan card at Indian address?


Answer: Yes, you can take the delivery of your Pan card at your Indian address.
How long does it usually take to receive the PAN card after applying??


Answer:It usually takes about 10-15 days to receive the PASS card after applying.
How to apply for PAN card?


Answer: To apply for a PASS Card, you will need to fill out an application form and provide proof of identity and nation

In [23]:
data['modelPrediction'] = results

In [24]:
for i in data.values[3]:
    print(i)
    print('\n')

How long does it usually take to receive the PAN card after applying?


Once the payment is made, we will contact you and initiate the process. Pan card will be issued in 3 weeks after submitting the application. 

(You can get an e-Pan Card in under 10 minutes, if you have an Aadhaar card.)


Answer:It usually takes about 10-15 days to receive the PASS card after applying.




### calculating bleu score

In [28]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

def bleu_scorer(reference,candidate):
    # Convert sentences to lists of tokens
    reference_tokens = [reference.split()]
    candidate_tokens = candidate.split()

    # Calculate BLEU score
    bleu_score = sentence_bleu(reference_tokens, candidate_tokens)

    return bleu_score


In [43]:
score =[]
for i in data.values:
    score.append(bleu_scorer(i[1].lower(),i[2].lower()))

In [44]:
data['bleu_score'] = score

In [45]:
data.bleu_score.mean()

0.03096415971289323

In [47]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate import gleu_score


def gleu_scorer(reference,candidate):
    # Tokenize sentences
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()

    # Calculate GLEU score
    gleu_score_ = gleu_score.sentence_gleu([reference_tokens], candidate_tokens)

    return gleu_score_


In [48]:
score =[]
for i in data.values:
    score.append(gleu_scorer(i[1].lower(),i[2].lower()))
data['gleu_score'] = score

In [49]:
data.gleu_score.mean()

0.09246692733865593

In [46]:
data.to_csv('results/alpaca_results.csv',index=None)