In [5]:
import PyPDF2

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import LlamaCpp
#from langchain.llms import LlamaCpp


from langchain.embeddings import HuggingFaceEmbeddings, GPT4AllEmbeddings # import hf embedding
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain


from langchain.prompts import PromptTemplate
from sentence_transformers import SentenceTransformer, util
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Step 1: Preparing pdf metadata

In [6]:
pdf_files=["C:/Users/Mrinal Kalita/Python Projects/AIML Capstone Project - CV - Pneumonia Detection-1.pdf"]

In [7]:
def process_pdf(pdf_files):
    documents = []
    metadata = []
    content = []

    for i in pdf_files:

        pdf_read = PyPDF2.PdfReader(i)
        for ind, text in enumerate(pdf_read.pages):
            doc_page = {'title': i + " page " + str(ind + 1),
                        'content': pdf_read.pages[ind].extract_text()}
            documents.append(doc_page)
    for doc in documents:
        content.append(doc["content"])
        metadata.append({
            "title": doc["title"]
        })
    print("Content and metadata are extracted from the documents")
    return content, metadata

In [8]:
content, metadata = process_pdf(pdf_files)

Content and metadata are extracted from the documents


# Step 2: Split the content into smaller portion

In [9]:
def split_content(content, metadata):
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=512,chunk_overlap=256)
    smaller_docs = splitter.create_documents(content, metadatas=metadata)
    print(f"Docs are split into {len(smaller_docs)} passages")
    return smaller_docs

In [10]:
smaller_docs=split_content(content, metadata)

Docs are split into 7 passages


# Step 3: Ingest into Vector Database locally

In [15]:
def ingest_into_vectordb(smaller_docs):
    emb = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', model_kwargs={'device': 'cpu'})
    #emb = GPT4AllEmbeddings(model_name='all-MiniLM-L6-v2.gguf2.f16.gguf', gpt4all_kwargs={'allow_download': 'True'})
    db = FAISS.from_documents(smaller_docs, emb)

    DB_FAISS_PATH = 'vectorstore/db_faiss'
    db.save_local(DB_FAISS_PATH)
    return db

In [16]:
vector =ingest_into_vectordb(smaller_docs)

# Step4 : LLM Prompt conversation

In [26]:
template = """[INST]
As an AI expert, based on the provided document,please provide accurate, important and relevant information. Your responses should follow the following guidelines:
- Answer the question based on the provided documents.
- Be direct, factual and precise while answering, limited to 50 words and 2-3 sentences. Begin your response without using introductory phrases like yes, no etc.
- Maintain an ethical, unbiased and neutral tone, avoiding harmful or offensive content.
- If the document does not contain relevant information, state "The document doesn't have any relevent information avilable."
- Do not include questions in your responses.
- Answer the questions directly. do not ask me questions
{question}
[/INST]
"""

#template = """Given the document and the current conversation between a user and an agent, your task is as follows: Answer any user query by using information from the document. The response should be detailed."""
callback = CallbackManager([StreamingStdOutCallbackHandler()])
def conversation_func(vector):
    llama_llm = LlamaCpp(
    model_path="C:/Users/Mrinal Kalita/langchain-notes/mistral-7b-openorca.gguf2.Q4_0.gguf",
    temperature=0.75,
    max_tokens=200,
    top_p=1,
    callback_manager=callback,
    n_ctx=3000)

    retriever = vector.as_retriever()
    CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(template)

    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True, output_key='answer')

    conversation_chat = (ConversationalRetrievalChain.from_llm
                          (llm=llama_llm,
                           retriever=retriever,
                           #condense_question_prompt=CONDENSE_QUESTION_PROMPT,
                           memory=memory,
                           return_source_documents=True))
    print("Conversation function created for the LLM using the vector store")
    return conversation_chat

In [27]:
con = conversation_func(vector)

llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from C:/Users/Mrinal Kalita/langchain-notes/mistral-7b-openorca.gguf2.Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = Open-Orca_Mistral-7B-OpenOrca
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_lo

Conversation function created for the LLM using the vector store


AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | 
Model metadata: {'general.name': 'Open-Orca_Mistral-7B-OpenOrca', 'general.architecture': 'llama', 'llama.context_length': '32768', 'llama.rope.dimension_count': '128', 'llama.embedding_length': '4096', 'llama.block_count': '32', 'llama.feed_forward_length': '14336', 'llama.attention.head_count': '32', 'tokenizer.ggml.eos_token_id': '32000', 'general.file_type': '2', 'llama.attention.head_count_kv': '8', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.freq_base': '10000.000000', 'tokenizer.ggml.model': 'llama', 'general.quantization_version': '2', 'tokenizer.ggml.bos_token_id': '1', 'tokenizer.ggml.add_bos_token': 'true', 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.chat_template': "{% if not add_generation_

# Step 5: etect Hallucination in the LLMs Response

In [28]:
def validate_answer_against_sources(response_answer, source_documents):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    similarity_threshold = 0.5  
    source_texts = [doc.page_content for doc in source_documents]

    answer_embedding = model.encode(response_answer, convert_to_tensor=True)
    source_embeddings = model.encode(source_texts, convert_to_tensor=True)

    cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings)


    if any(score.item() > similarity_threshold for score in cosine_scores[0]):
        return True  

    return False  

# Asking Quetions to chatbot

In [30]:
user_question = "what is the objective of this project?"
response=con({"question": user_question})
print("Q: ",user_question)
print("A: ",response['answer'])

Llama.generate: prefix-match hit


 The objective of this project is to design a DL-based algorithm for detecting pneumonia.


llama_print_timings:        load time =    2346.35 ms
llama_print_timings:      sample time =       6.05 ms /    21 runs   (    0.29 ms per token,  3470.50 tokens per second)
llama_print_timings: prompt eval time =  305682.63 ms /   956 tokens (  319.75 ms per token,     3.13 tokens per second)
llama_print_timings:        eval time =    9706.50 ms /    20 runs   (  485.32 ms per token,     2.06 tokens per second)
llama_print_timings:       total time =  292745.25 ms /   976 tokens


Q:  what is the objective of this project?
A:   The objective of this project is to design a DL-based algorithm for detecting pneumonia.


In [31]:
if response['source_documents']:
    response_answer = response['answer']
    source_docs = response['source_documents']

    # Post-processing step to validate the answer against the source documents
    is_valid_answer = validate_answer_against_sources(response_answer, source_docs)
    if not is_valid_answer:
        response['answer'] = "Sorry I can not answer the question based on the given documents"
else:
    response['answer'] ="Sorry, I cannot answer the question based on the given documents"

print("Q: ",user_question)
print("A: ",response['answer'])

Q:  what is the objective of this project?
A:   The objective of this project is to design a DL-based algorithm for detecting pneumonia.
