In [113]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from transformers import GPT2TokenizerFast
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import HuggingFaceHub
from langchain.chains import ConversationalRetrievalChain

from dotenv import load_dotenv
load_dotenv()

True

In [114]:
os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.getenv('HUGGINGFACEHUB_API_TOKEN')

file_name = './data/sbc-sample.pdf'
output_file = './output/sbc-sample.txt'

In [115]:
# You MUST add your PDF to local files in this notebook (folder icon on left hand side of screen)

# Simple method - Split by pages 
loader = PyPDFLoader(file_name)
pages = loader.load_and_split()
print(pages[0])

# SKIP TO STEP 2 IF YOU'RE USING THIS METHOD
chunks = pages

page_content='1 of 8 Insurance Company 1: Plan Option 1  Coverage Period: 01/01/2013 – 12/31/2013 \nSummary of Benefits and Coverage: What this Plan Covers & What it Costs  Coverage for: Individual + Spouse | Plan Type: PPO \nQuestions:  Call 1-800-[insert] or visit us at www .[insert] . \nIf you aren’t clear about any of the underlined terms used in this form, see the Glossary.  You can view the Glossary \nat www.[insert]  or call 1- 800-[ insert ] to request a copy.   \n This is  only a summary . If you want more detail about you r coverage and costs , you can get the complete terms in the policy or plan \ndocument at www. [insert]  or by calling 1-800-[insert] . \n  \nImportant Questions  Answers  Why this Matters:  \nWhat is the overa ll \ndeductible ? $500 person  /  \n$1,000 family  \nDoesn’t apply to preventive  care  You must pay all the costs up to the deductible  amount before this  plan begins to pay for \ncovered services you use. Check your policy or plan document to see w

In [116]:
# Advanced method - Split by chunk

# Step 1: Convert PDF to text
import textract
doc = textract.process(file_name)

# Step 2: Save to .txt and reopen (helps prevent issues)
with open(output_file, 'w') as f:
    f.write(doc.decode('utf-8'))

with open(output_file, 'r') as f:
    text = f.read()

# Step 3: Create function to count tokens
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def count_tokens(text: str) -> int:
    return len(tokenizer.encode(text))

# Step 4: Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 512,
    chunk_overlap  = 24,
    length_function = count_tokens,
)

chunks = text_splitter.create_documents([text])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [117]:
# # Quick data visualization to ensure chunking was successful

# # Create a list of token counts
# token_counts = [count_tokens(chunk.page_content) for chunk in chunks]

# # Create a DataFrame from the token counts
# df = pd.DataFrame({'Token Count': token_counts})

# # Create a histogram of the token count distribution
# df.hist(bins=40, )

# # Show the plot
# plt.show()

In [118]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding

embeddings = HuggingFaceEmbeddings()
embed_model = LangchainEmbedding(embeddings)

# Create vector database
db = FAISS.from_documents(chunks, embeddings)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [119]:
# # Check similarity search is working
# query = 'What is the copay for Diagnostic test?'
# docs = db.similarity_search(query)
# docs[0]

In [123]:
# Create QA chain to integrate similarity search with user queries (answer query from knowledge base)

# 'google/flan-t5-large'
# 'bigscience/bloom-560m' - NOT WORKING
# 'bigscience/bloom-7b1' - NOT WORKING
# 'MetaIX/GPT4-X-Alpaca-30B-4bit' - NOT WORKING
# 'stanfordnlp/SteamSHP-flan-t5-large' - NOT WORKING
# "THUDM/chatglm-6b-int4" - NOT WORKING

# 'stanfordnlp/SteamSHP-flan-t5-large' - gives wrong answer

flan_ul2 = HuggingFaceHub(repo_id='kireall/facebook-bart-large-xsum-samsum', model_kwargs={'temperature': 0.1})
chain = load_qa_chain(flan_ul2, chain_type="stuff")

query = 'what will be my deductible?'
# query = 'what is my out of pocket limit?'
# query = 'what is the radiology cost for having a baby'
docs = db.similarity_search(query)

chain.run(input_documents=docs, question=query)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

ValueError: Error raised by inference API: Input is too long for this model, shorten your input or use 'parameters': {'truncation': 'only_first'} to run the model only on the first part.

In [None]:
# from IPython.display import display
# import ipywidgets as widgets

# # Create conversation chain that uses our vectordb as retriver, this also allows for chat history management
# qa = ConversationalRetrievalChain.from_llm(flan_ul2, db.as_retriever())

In [None]:
# chat_history = []

# def on_submit(_):
#     query = input_box.value
#     input_box.value = ""
    
#     if query.lower() == 'exit':
#         print("Thank you for using the State of the Union chatbot!")
#         return
    
#     result = qa({"question": query, "chat_history": chat_history})
#     chat_history.append((query, result['answer']))
    
#     display(widgets.HTML(f'<b>User:</b> {query}'))
#     display(widgets.HTML(f'<b><font color="blue">Chatbot:</font></b> {result["answer"]}'))

# print("Welcome to the Transformers chatbot! Type 'exit' to stop.")

# input_box = widgets.Text(placeholder='Please enter your question:')
# input_box.continuous_update = False
# input_box.observe(on_submit, names='value')
# # input_box.on_submit(on_submit)

# display(input_box)