# Chatbot with memory

A question-answering chatbot designed to handle complex bank statement data, built with LangChain, LanceDB, and PyMuPDF4LLM.

# Install necessary packages


In [None]:
!pip install langchain openai
!pip install -qU langchain-text-splitters
!pip install pypdf
!pip install langchain_community
!pip install pymupdf
!pip install lancedb
!pip install pymupdf4llm
!pip install "unstructured[md]"
!pip install -U langchain-openai langchain-community
!pip install gradio
!pip install tantivy


Collecting langchain
  Downloading langchain-0.2.6-py3-none-any.whl (975 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m975.5/975.5 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-1.35.10-py3-none-any.whl (328 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.3/328.3 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.10 (from langchain)
  Downloading langchain_core-0.2.11-py3-none-any.whl (337 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.4/337.4 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl (25 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.83-py3-none-any.whl (127 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.5/127.5 kB[0m [31m10.7 MB/s[0m 

## Download the data

In [None]:
# OpenAI API key.
# Read it from the OPENAI_API_KEY environment variable so the secret never has to
# be typed into (and saved with) the notebook. When the variable is unset this
# falls back to an empty string, which is what the placeholder used to be.
import os

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")


In [None]:
# Download the sample credit card statement PDF into the current working directory
!wget https://github.com/akashAD98/dummy_data/raw/main/sample_credit_card.pdf

--2024-07-07 16:53:04--  https://github.com/akashAD98/dummy_data/raw/main/sample_credit_card.pdf
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/akashAD98/dummy_data/main/sample_credit_card.pdf [following]
--2024-07-07 16:53:05--  https://raw.githubusercontent.com/akashAD98/dummy_data/main/sample_credit_card.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 941934 (920K) [application/octet-stream]
Saving to: ‘sample_credit_card.pdf’


2024-07-07 16:53:05 (15.3 MB/s) - ‘sample_credit_card.pdf’ saved [941934/941934]



Import Packages

In [None]:
import os
import shutil
from datetime import datetime
from tempfile import mkdtemp
from typing import List

import gradio as gr
import pymupdf4llm
from pydantic import BaseModel, Field, validator

from langchain.chains.question_answering import load_qa_chain
# from langchain.community.document_loaders import UnstructuredMarkdownLoader
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.llms import OpenAI
from langchain_openai import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings


from langchain_community.vectorstores import LanceDB
from lancedb.rerankers import LinearCombinationReranker


from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from lancedb.rerankers import LinearCombinationReranker



We load the PDF file and convert it to Markdown, because LLMs (Large Language Models) excel at understanding Markdown. Since our PDF has a complex layout, we use the `pymupdf4llm` library, which is specifically designed for this purpose. For more information, refer to the documentation [here](https://pymupdf4llm.readthedocs.io/en/latest/).

In [None]:
# Both files live in the working directory, which is where the download cell saved
# the PDF. Previously the Markdown was written to a relative path but read back from
# an absolute /content/ path, which only lines up when the working directory is /content.
PDF_PATH = "sample_credit_card.pdf"
MD_PATH = "output_credit_card.md"

# Convert PDF to Markdown
md_text = pymupdf4llm.to_markdown(PDF_PATH, table_strategy="lines_strict")

# Write Markdown string to a file. The context manager closes the handle even if
# the write fails, and an explicit UTF-8 encoding avoids platform-dependent defaults.
with open(MD_PATH, "w", encoding="utf-8") as output:
    output.write(md_text)

print("Converted to .md")

# Load the Markdown file
loader = UnstructuredMarkdownLoader(MD_PATH)
documents = loader.load()

# Split documents into chunks (1000 characters each, 200 characters of overlap)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

Converted to .md


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


## Bank Statement Analysis Prompt


In [None]:


# Define the prompt template
template = """
Your ability to extract and summarize this information accurately is essential for effective credit card statement analysis.
You are a financial expert AI chatbot having a conversation with a human.
Your task is to provide accurate and helpful answers based on the extracted parts of a credit card statement.
Pay close attention to the credit card statement's language, structure, and any cross-references to ensure comprehensive and precise extraction of information.
Do not use prior knowledge or information from outside the context to answer the questions.
Only use the information provided in the context to answer the questions.

Credit Card Statement Extract:
{context}

Conversation History:
{chat_history}

Human: {human_input}
Chatbot:
"""

prompt = PromptTemplate(
    input_variables=["chat_history", "human_input", "context"],
    template=template,
)

In [None]:


# Set up conversation memory - using langchain memory.
# memory_key / input_key match the {chat_history} / {human_input} placeholders of the prompt.
memory = ConversationBufferMemory(memory_key="chat_history", input_key="human_input")


# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)


# Load the QA chain.
# (The comma after the OpenAI(...) argument was missing, which made this cell a SyntaxError.)
chain = load_qa_chain(
    OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
    chain_type="stuff",
    memory=memory,
    prompt=prompt,
)

# Define the query
query = "what is the minimum payment due"

# Initialize the re-ranker model
reranker = LinearCombinationReranker(weight=0.3)

# Build the LanceDB vector store from the chunks
docsearch = LanceDB.from_documents(chunks, embeddings, reranker=reranker)

# Retrieve the chunks for the query, once as plain documents and once with scores
docs = docsearch.similarity_search(query)
docs_score = docsearch.similarity_search_with_relevance_scores(query)

# Show the best hit; guard the index so an empty result does not raise IndexError.
if docs_score:
    print("Relevance score - ", docs_score[0][1])
    print("Text - ", docs_score[0][0].page_content[:1000])
else:
    print("No matching chunks found for the query.")


  warn_deprecated(
  warn_deprecated(


Relevance score -  0.7782929097294156
Text -  minimum payment each period, you will pay more in interest and it will take you longer
to pay off your Non-Plan Balance. For example:

New Balance $10,269.65
Minimum Payment Due $205.39

Credit Limit $26,400.00
Available Credit $16,130.35

Cash Advance Limit $4,600.00
Available Cash $4,600.00

If you make no additional
charges and each month
you pay...

You will pay off the balance
shown on this statement in
about...

And you will pay an
estimated total of...

Only the
22 years $29,830
Minimum Payment Due

$14,640
$407 3 years (Savings = $15,190)

If you would like information about credit counseling services, call 1-888-733-4139.

See page 2 for important information about your account.

Please refer to the IMPORTANT NOTICES section on

page 7.

Continued on page 3

Please fold on the perforation below, detach and return with your payment

Payment Coupon Pay by Computer Pay by Phone Account Ending 7-73045


# Some example test queries

In [None]:
# Run the QA chain on the chunks retrieved for the current query
qa_inputs = {
    "input_documents": docs,
    "human_input": query,
}
op_statment = chain(qa_inputs, return_only_outputs=True)

# Last expression of the cell, so the result dict is displayed
op_statment



  warn_deprecated(


{'output_text': 'The minimum payment due on this credit card statement is $205.39. If you only make the minimum payment each period, you will end up paying more in interest and it will take you longer to pay off your Non-Plan Balance. For example, with a New Balance of $10,269.65, it would take approximately 22 years to pay off and you would end up paying an estimated total of $29,830.'}

In [None]:

from IPython.display import Markdown, display

query = "What is the available credit for this credit card "

# Retrieve chunks for THIS question. Reusing the `docs` list fetched for an earlier
# query would make the chain answer from context selected for a different question.
docs = docsearch.similarity_search(query)
op_statment = chain({"input_documents": docs, "human_input": query}, return_only_outputs=True)

# Render the answer as Markdown
display(Markdown(op_statment['output_text']))


The available credit for this credit card is $16,130.35. This is the amount of credit that you have left to use on your card, after taking into account your current balance and any pending charges. It is important to keep track of your available credit to avoid going over your credit limit and incurring fees.

In [None]:

from IPython.display import Markdown, display

query = "List all the fees and interest charged in this statement period "

# Retrieve chunks for THIS question. Reusing the `docs` list fetched for an earlier
# query would make the chain answer from context selected for a different question.
docs = docsearch.similarity_search(query)
op_statment = chain({"input_documents": docs, "human_input": query}, return_only_outputs=True)

# Render the answer as Markdown
display(Markdown(op_statment['output_text']))



The fees and interest charged in this statement period are $0.00. This includes any fees for late payments or cash advances, as well as any interest charged on your balance. It is important to pay your balance in full each month to avoid these fees and interest charges.

In [None]:
from IPython.display import Markdown, display

query = "What is the new balance for the current statement? "

# Retrieve chunks for THIS question. Reusing the `docs` list fetched for an earlier
# query would make the chain answer from context selected for a different question.
docs = docsearch.similarity_search(query)
op_statment = chain({"input_documents": docs, "human_input": query}, return_only_outputs=True)

# Render the answer as Markdown
display(Markdown(op_statment['output_text']))



The new balance for the current statement is $10,269.65. This includes any new charges made during the statement period, as well as any previous balance and payments/credits. It is important to pay off your balance in full each month to avoid carrying a balance and incurring interest charges.

In [None]:
from IPython.display import Markdown, display

query = "What is the payment due date?"

# Retrieve chunks for THIS question. Reusing the `docs` list fetched for an earlier
# query would make the chain answer from context selected for a different question.
docs = docsearch.similarity_search(query)
op_statment = chain({"input_documents": docs, "human_input": query}, return_only_outputs=True)

# Render the answer as Markdown
display(Markdown(op_statment['output_text']))



The payment due date for this credit card statement is 10/22/23. It is important to make your payment by this date to avoid late fees and potential increases in your APR. You can make your payment online, by phone, or by mail using the payment coupon provided on the statement.

Gradio app with file upload support

In [None]:

import os
import pymupdf4llm
import gradio as gr
from langchain.llms import OpenAI
from tempfile import mkdtemp
from langchain.chains.question_answering import load_qa_chain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import LanceDB
from lancedb.rerankers import LinearCombinationReranker
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredMarkdownLoader



# Read the key from the OPENAI_API_KEY environment variable instead of hardcoding a
# secret in the notebook. The fallback is only a placeholder (the previous literal
# also carried a stray double quote inside the string).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "your_api_key")
# Temporary directory for file processing
temp_dir = mkdtemp()

# Define global variables for embeddings and docsearch.
# Both stay None until process_file has indexed an uploaded PDF.
embeddings = None
docsearch = None

# Function to process the uploaded PDF file
def process_file(file):
    """Convert an uploaded PDF to Markdown, chunk it, and index it in LanceDB.

    Args:
        file: The value handed over by the upload component: ``None`` when nothing
            was uploaded, otherwise either an object exposing the file path as
            ``.name`` or the path itself as a plain string.

    Returns:
        A status message (str) to show in the UI.

    Side effects:
        Rebinds the module-level ``embeddings`` and ``docsearch`` and (re)writes
        ``output_credit_card.md`` inside ``temp_dir``.
    """
    global embeddings, docsearch

    if file is None:
        return "No file uploaded. Please upload a PDF file."

    # Accept both a wrapper object (path stored in .name) and a bare path string.
    pdf_path = getattr(file, "name", file)

    # Convert PDF to markdown
    md_text = pymupdf4llm.to_markdown(pdf_path, table_strategy="lines_strict")

    # Write markdown string to a file in the temporary directory.
    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    md_file_path = os.path.join(temp_dir, "output_credit_card.md")
    with open(md_file_path, "w", encoding="utf-8") as output:
        output.write(md_text)

    # Load the markdown file
    loader = UnstructuredMarkdownLoader(md_file_path)
    documents = loader.load()

    # Split the document into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents(documents)

    # Setup embeddings and vector store
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    reranker = LinearCombinationReranker(weight=0.3)
    docsearch = LanceDB.from_documents(chunks, embeddings, reranker=reranker)

    return "File processed successfully. You can now ask questions about the credit card statement."

# Function to process the query
def process_query(query):
    """Answer a question about the uploaded statement.

    Args:
        query: The user's question as typed into the UI.

    Returns:
        The chain's ``output_text`` (str), or a guidance message when no document
        has been indexed yet or the question is blank.
    """
    if docsearch is None:
        return "No document has been uploaded and processed yet. Please upload a PDF first."

    # A blank question has nothing to embed or answer; reply without calling the services.
    if not query or not query.strip():
        return "Please enter a question about the credit card statement."

    docs = docsearch.similarity_search(query)
    top_docs = docs[:2]  # Get only the top 2 documents
    response = chain({"input_documents": top_docs, "human_input": query}, return_only_outputs=True)
    return response['output_text']

# Define the prompt template
template = """
Your ability to extract and summarize this information accurately is essential for effective credit card statement analysis.
You are a financial expert AI chatbot having a conversation with a human.
Your task is to provide accurate and helpful answers based on the extracted parts of a credit card statement.
Pay close attention to the credit card statement's language, structure, and any cross-references to ensure comprehensive and precise extraction of information.
Do not use prior knowledge or information from outside the context to answer the questions.

If the human greets you then respond with a polite greeting.
If the question is not related to the credit card statement, respond with "Sorry, I don't know. Please ask questions related to the provided credit card statement."

Only use the information provided in the context to answer the questions.

Credit Card Statement Extract:
{context}

Conversation History:
{chat_history}

Human: {human_input}
Chatbot:
"""

prompt = PromptTemplate(
    input_variables=["chat_history", "human_input", "context"],
    template=template
)

# Conversation memory: history is stored under "chat_history" and the user turn
# is read from "human_input", the same names the prompt template uses.
memory = ConversationBufferMemory(
    input_key="human_input",
    memory_key="chat_history",
)

# Completion model (temperature=0) wrapped in a "stuff" question-answering chain
qa_llm = OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)
chain = load_qa_chain(
    qa_llm,
    prompt=prompt,
    memory=memory,
    chain_type="stuff",
)

# Gradio UI: one tab uploads and indexes the PDF, the other asks questions about it.

# --- Upload tab ---
pdf_input = gr.File(label="Upload your credit card statement PDF")
upload_interface = gr.Interface(
    title="Credit Card Statement Upload",
    description="Upload your credit card statement PDF file to analyze.",
    fn=process_file,
    inputs=pdf_input,
    outputs="text",
)

# --- Chat tab ---
query_box = gr.Textbox(lines=2, placeholder="Enter your query here...")
answer_box = gr.Textbox()
chat_interface = gr.Interface(
    title="Credit Card Statement Analysis",
    description="Ask questions about the credit card statement.",
    fn=process_query,
    inputs=query_box,
    outputs=answer_box,
)

# --- Combine both tabs ---
iface = gr.TabbedInterface(
    tab_names=["Upload PDF", "Chat with AI"],
    interface_list=[upload_interface, chat_interface],
)

# Clean up the temporary directory on exit.
# The hook is registered BEFORE launching: launch(debug=True) keeps the cell running,
# so statements placed after it would not execute until the server is interrupted.
# shutil is imported here so the cell does not depend on an earlier cell's imports.
import atexit
import shutil


def cleanup_temp_dir():
    """Remove the temporary working directory; do nothing if it is already gone."""
    shutil.rmtree(temp_dir, ignore_errors=True)


atexit.register(cleanup_temp_dir)

# Launch the interface.
# NOTE(review): share=True publishes a public URL, so anyone holding the link can
# upload files and query the app. Set share=False when handling real statements.
iface.launch(share=True, debug=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://5d478a3efe1cef1d7b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
