In [1]:
print("Hello to our notebook")

Hello to our notebook


## Importation

In [9]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader #for the pdf file
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

## Download data

In [5]:
!mkdir data

In [6]:
!cd data
!wget https://ia800907.us.archive.org/21/items/IndianHerbalRemedies_201903/Handbook%20of%20Medicinal%20Herbs.pdf

--2024-07-27 01:55:34--  https://ia800907.us.archive.org/21/items/IndianHerbalRemedies_201903/Handbook%20of%20Medicinal%20Herbs.pdf
Resolving ia800907.us.archive.org (ia800907.us.archive.org)... 207.241.233.67
Connecting to ia800907.us.archive.org (ia800907.us.archive.org)|207.241.233.67|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8011621 (7.6M) [application/pdf]
Saving to: ‘Handbook of Medicinal Herbs.pdf’


2024-07-27 01:56:14 (219 KB/s) - ‘Handbook of Medicinal Herbs.pdf’ saved [8011621/8011621]



In [7]:
!pwd

/home/khaoula1972/Herbal-Medicine-Chatbot-using-Llama2


## Pinecone variables

In [8]:
PINECONE_API_KEY = "a4dc4d0f-f863-48fe-bd65-a02a2118e0b8"
# For the environnment, it's no longer necessary to have an environnement according to the pinecone support system

## Extract data from PDF

In [10]:
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                   loader_cls=PyPDFLoader)
    documents=loader.load()
    return documents

In [13]:
extracted_data=load_pdf("data/")

Now as we already created and extracted the data, we'll need to create text chunks

## Create text chunks

In [14]:
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap = 20)
    text_chunks=text_splitter.split_documents(extracted_data)
    return text_chunks

In [15]:
text_chunks=text_split(extracted_data)
print("length of chunks", len(text_chunks))

length of chunks 7466


## Vector embedding

In [16]:
# download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [17]:
embedding = download_hugging_face_embeddings()



.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [18]:
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [20]:
# testing the embedding model
query_result = embedding.embed_query("Hello world")
print(len(query_result))

384


In [21]:
query_result

[-0.034477315843105316,
 0.031023172661662102,
 0.006734910886734724,
 0.02610892429947853,
 -0.03936195746064186,
 -0.1603025197982788,
 0.06692396104335785,
 -0.006441440898925066,
 -0.04745054617524147,
 0.014758836477994919,
 0.07087532430887222,
 0.055527545511722565,
 0.01919332519173622,
 -0.026251299306750298,
 -0.01010951679199934,
 -0.026940451934933662,
 0.022307397797703743,
 -0.022226639091968536,
 -0.1496926248073578,
 -0.01749303936958313,
 0.007676327601075172,
 0.054352276027202606,
 0.0032544792629778385,
 0.03172592446208,
 -0.08462144434452057,
 -0.029405953362584114,
 0.05159562826156616,
 0.048124104738235474,
 -0.003314818488433957,
 -0.05827919766306877,
 0.04196928068995476,
 0.02221069671213627,
 0.12818878889083862,
 -0.02233896404504776,
 -0.011656257323920727,
 0.06292840093374252,
 -0.03287629410624504,
 -0.09122602641582489,
 -0.031175386160612106,
 0.05269954726099968,
 0.047034841030836105,
 -0.08420310169458389,
 -0.030056146904826164,
 -0.020744822919

## Pinecone initialisation

In [34]:
from pinecone import Pinecone

pc = Pinecone(api_key="a4dc4d0f-f863-48fe-bd65-a02a2118e0b8")
index_list = pc.list_indexes()
index = pc.Index("herbalmedicinechatbot")

In [48]:
# Given that our chunk size is 500, an ideal batch size for upserting to Pinecone would typically be smaller than your chunk size. The optimal batch size can vary depending on factors like your system's memory, network conditions, and Pinecone's current load. However, a good starting point would be a batch size between 100 to 200.
batch_size = 100  # as advised
for i in range(0, len(text_chunks), batch_size):
    batch = text_chunks[i:i+batch_size]
    ids = [f"id_{j}" for j in range(i, min(i+batch_size, len(text_chunks)))]
    embeddings = embedding.embed_documents([chunk.page_content for chunk in batch])
    metadata = [{"text": chunk.page_content} for chunk in batch]
    
    index.upsert(vectors=zip(ids, embeddings, metadata))

In [83]:
# testing our index
query_vector = embedding.embed_query("what is Mexican bamboo")
results = index.query(vector=query_vector, top_k=3, include_metadata=True)

In [84]:
print(results)

{'matches': [{'id': 'id_7366',
              'metadata': {'text': 'Mescal Bean (Texas Mountain Laurel); '
                                   'Sophora \n'
                                   'secundiﬂora  (Ortega) Lag. ex DC. \n'
                                   '(Synonym: Broussonetia  secundiﬂora  \n'
                                   'Ortega), 499\n'
                                   'Mesquite (Ironwood); Prosopis juliﬂora  '
                                   '(Sw.) DC., \n'
                                   '499\n'
                                   'Metel, Hindu Datura (Downy Thornapple, '
                                   'Hindu \n'
                                   'Thornapple, Hoary Thornapple, Horn-of-\n'
                                   'Plenty, Purple Thornapple); Datura metel  '
                                   'L \n'
                                   '(Synonyms: D. alba  Nees, D. chlorantha  \n'
                                   'Hook., D. fastuosa  L., D. mete

<b>Remarque:</b>
this is not readable so we'll need to generate our correct answer using LLM

## integrating LLM

In [56]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [57]:
PROMPT= PromptTemplate(template=prompt_template, input_variables=["context","question"])
chain_type_kwargs={"prompt": PROMPT}

In [58]:
llm=CTransformers(model="./model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':522,
                          'temperature':0.8})

In [65]:
from langchain.vectorstores import Pinecone as LangchainPinecone
import torch

text_field = "text"
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
vectorstore = LangchainPinecone(index, model.encode, text_field)

In [67]:
qa=RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={'k':2}),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs)

In [78]:
results = self._index.query(
    vector=[query_obj],
    top_k=k,
    include_metadata=True,
    namespace=namespace,
    filter=filter
)

In [79]:
while True:
    user_input=input(f"Input Prompt:")

    if user_input.lower() in ["exit", "quit", "q"]:
        print("Exiting the chatbot. Goodbye!")
        break
    
    result=qa1({"query":user_input})
    print("Response:", result["result"])

Input Prompt: what's a Horsemint ?


ValueError: The argument order for `query()` has changed; please use keyword arguments instead of positional arguments. Example: index.query(vector=[0.1, 0.2, 0.3], top_k=10, namespace='my_namespace')

In [80]:
from langchain.llms import LlamaCpp
from langchain.embeddings import LlamaCppEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Create a custom prompt template
prompt_template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# Initialize the RetrievalQA object
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)

In [81]:
def rag_query(question: str) -> str:
    # create query vec
    query_vec = embed_docs(question)[0]
    # query pinecone
    res = index.query(vector=query_vec, top_k=5, include_metadata=True)
    # get contexts
    contexts = [match.metadata["text"] for match in res.matches]
    # build the multiple contexts string
    context_str = construct_context(contexts=contexts)
    # create our retrieval augmented prompt
    payload = create_payload(question, context_str)
    # make prediction
    out = predictor.predict(payload, custom_attributes='accept_eula=true')
    return out[0]["generation"]["content"]

In [82]:
rag_query("What's the HORSEMINT?")

NameError: name 'embed_docs' is not defined