In [1]:
%pwd

'/Users/kunal/VsCode/Python/medical-chatbot/research'

In [2]:
import os

# The notebook starts inside the project's `research/` folder; step up to the
# project root so the data directory can be reached from the working directory.
# The guard keeps the cell idempotent: an unconditional chdir("../") would climb
# one more level every time the cell is re-run.
if os.path.basename(os.getcwd()) == "research":
  os.chdir("../")

In [3]:
%pwd


'/Users/kunal/VsCode/Python/medical-chatbot'

In [4]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter # for chunk operation


In [5]:
# Extract Data from the PDF file

def load_pdf_file(data):
  """Load the PDF files in a directory as documents.

  Args:
    data: Path of the directory handed to ``DirectoryLoader``; files are
      selected with the ``*.pdf`` glob and read with ``PyPDFLoader``.

  Returns:
    The result of ``DirectoryLoader.load()`` for the matched files.
  """
  pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
  return pdf_loader.load()

In [6]:
# Resolve the data directory from the current working directory (the project
# root after the chdir cell above) instead of hard-coding one machine's
# absolute path, so the notebook also runs from other checkouts.
extracted_data = load_pdf_file(data=os.path.join(os.getcwd(), 'Data'))

In [7]:
# extracted_data

In [8]:
# split the data into the text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
  """Split loaded documents into smaller text chunks.

  Args:
    extracted_data: Documents to split, as returned by ``load_pdf_file``.
    chunk_size: Value passed to ``RecursiveCharacterTextSplitter`` as
      ``chunk_size``. Defaults to 500, the value previously hard-coded.
    chunk_overlap: Value passed to ``RecursiveCharacterTextSplitter`` as
      ``chunk_overlap``. Defaults to 20, the value previously hard-coded.

  Returns:
    The chunks produced by ``split_documents`` for ``extracted_data``.
  """
  text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
  )
  return text_splitter.split_documents(extracted_data)

In [9]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
num_chunks = len(text_chunks)
print("Length of Text Chunks : ", num_chunks)

Length of Text Chunks :  40000


In [10]:
# text_chunks

Now I have to use an embedding model to compute vector embeddings for the text chunks.

In [11]:
from langchain.embeddings import HuggingFaceEmbeddings

In [12]:
# Download Embeddings from Hugging Face

def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
  """Create the HuggingFace embedding model used for the text chunks.

  Args:
    model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
      Defaults to the model previously hard-coded here. Note that this
      notebook creates its Pinecone index with dimension 384, so another
      model must produce vectors of that same size to be usable with it.

  Returns:
    The constructed ``HuggingFaceEmbeddings`` instance.
  """
  return HuggingFaceEmbeddings(model_name=model_name)

In [13]:
# Instantiate the embedding model. The captured output of this cell shows the
# model files being downloaded.
embeddings = download_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm
Downloading .gitattributes: 1.23kB [00:00, 1.05MB/s]
Downloading config.json: 100%|██████████| 190/190 [00:00<00:00, 562kB/s]
Downloading README.md: 10.5kB [00:00, 10.6MB/s]
Downloading config.json: 100%|██████████| 612/612 [00:00<00:00, 3.54MB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 264kB/s]
Downloading data_config.json: 39.3kB [00:00, 21.9MB/s]
Downloading model.safetensors: 100%|██████████| 90.9M/90.9M [00:14<00:00, 6.47MB/s]
Downloading model.onnx: 100%|██████████| 90.4M/90.4M [00:15<00:00, 5.70MB/s]
Downloading model_O1.onnx: 100%|██████████| 90.4M/90.4M [00:11<00:00, 7.85MB/s]
Downloading model_O2.onnx: 100%|██████████| 90.3M/90.3M [00:11<00:00, 8.20MB/s]
Downloading model_O3.onnx: 100%|██████████| 90.3M/90.3M [00:10<00:00, 8.79MB/s]
Downloading model_O4.onnx: 100%|██████████| 45.2M/45.2M [00:05<00:00, 8

In [14]:
# from langchain_community.embeddings import HuggingFaceEmbeddings

# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# vector = embeddings.embed_query("medical chatbot test")

# print("Embedding dimension:", len(vector))  # should print 384


In [15]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Sanity check: embed one short query and look at the length of the vector.
embeddings = HuggingFaceEmbeddings(
  model_name="sentence-transformers/all-MiniLM-L6-v2",
)
vector = embeddings.embed_query("medical chatbot test")
embedding_dim = len(vector)

print("Embedding dimension:", embedding_dim)  # 384 is expected for this model


Embedding dimension: 384


## Creating knowledge base from embeddings

In [29]:
from dotenv import load_dotenv
# Load the variables defined in a .env file into the process environment so the
# API keys read in the next cell are available. The value displayed as this
# cell's output is the return value of load_dotenv().
load_dotenv()

True

In [30]:
# Read the service credentials from the process environment; each name is
# bound to None when its variable is not set.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [18]:
from pinecone.grpc import PineconeGRPC as Pinecone 
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medicalbot"

# Create the index only when it does not exist yet. Pinecone rejects a
# create_index call for a name that is already taken, so an unconditional call
# makes this cell fail on every run after the first.
if index_name not in pc.list_indexes().names():
  pc.create_index(
    name=index_name,
    dimension=384,  # must match the embedding vector length (384, see the check above)
    metric="cosine",
    spec=ServerlessSpec(
      cloud="aws",
      region="us-east-1"
    )
  )

# Show the index description as the cell output, whether the index was just
# created or already existed.
pc.describe_index(index_name)

{
    "name": "medicalbot",
    "metric": "cosine",
    "host": "medicalbot-6y1c00f.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [31]:
import os

# os.environ only accepts string values: assigning None (which is what
# os.environ.get returned for a variable that is not set) raises TypeError.
# Export each key only when it is actually present. OPENAI_API_KEY in
# particular is optional here, because the LLM below is configured with a
# dummy key against a local endpoint.
if PINECONE_API_KEY is not None:
  os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
if OPENAI_API_KEY is not None:
  os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [20]:
# Build the vector store from the text chunks: each chunk is embedded and the
# resulting vectors are upserted into the Pinecone index.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
  text_chunks,
  embedding=embeddings,
  index_name=index_name,
)

In [21]:
# Attach to the Pinecone index that already exists (no documents are passed
# in this call) and rebind `docsearch` to it.

from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
  embedding=embeddings,
  index_name=index_name,
)


In [22]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x30c5b7e20>

In [23]:
# Expose the vector store as a similarity-search retriever with k=3.
retriever = docsearch.as_retriever(
  search_type="similarity",
  search_kwargs={"k": 3},
)

In [24]:
# Try the retriever on a sample question; the retrieved documents are kept in
# `retrieved_docs` for inspection in the next cell.
retrieved_docs = retriever.invoke("What is Acne?")

In [26]:
retrieved_docs

[Document(id='fc8a5e6b-6d80-4da4-9ada-8e4fd59266ea', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 55.0, 'page_label': '26', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': '/Users/kunal/VsCode/Python/medical-chatbot/Data/The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.ac_.id_.pdf', 'total_pages': 4505.0}, page_content='Researchers, Inc. Reproduced by permission.)\n26 GALE ENCYCLOPEDIA OF MEDICINE\nAcne'),
 Document(id='587ddd11-3795-443d-b40b-42bf47669395', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 55.0, 'page_label': '26', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': '/Users/kunal/VsCode/Python/medical-chatbot/Data/The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.ac_.id_.pdf', 'total_pages': 4505.0}, page_content='Sebaceous follicles— A structure found within the\ns

## Now initialize the LLM

In [38]:
# from langchain_openai import OpenAI
# llm = OpenAI(temperature=0.4,max_tokens=500)

In [39]:
from langchain_openai import ChatOpenAI

# Chat model served through Ollama's OpenAI-compatible endpoint on localhost.
llm = ChatOpenAI(
    openai_api_base="http://localhost:11434/v1",  # Ollama's OpenAI-like endpoint
    openai_api_key="NA",                          # placeholder; Ollama does not need a key
    model="mistral",                              # model previously pulled with Ollama
    max_tokens=500,
    temperature=0.4,
)


In [40]:
# from langchain.chains import create_retrieval_chain
# from langchain.chains.combine_documents import create_stuff_documents_chain
# from langchain_core.prompts import ChatPromptTemplate

# ### ROle of RAG Application . That I need to know

# system_prompt = (
#   "You are an assistant for question_answering tasks. "
#   "Use the following pieces of retrieved context to answer "
#   "the question. If you don't know the answer, say that you "
#   "don't know. Use three sentences maximumand keep the "
#   "answer concise. "
#   "\n\n"
#   "{context}"
# )

# prompt = ChatPromptTemplate.from_messages(
#   [
#     ("system", system_prompt),
#     ("human", "{input}"),
#   ]
# )

In [41]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the RAG assistant, followed by the {context}
# placeholder that is filled with the retrieved documents.
system_prompt = (
  " ".join([
    "You are an assistant for question_answering tasks.",
    "Use the following pieces of retrieved context to answer the question.",
    "If you don't know the answer, say that you don't know.",
    "Use three sentences maximum and keep the answer concise.",
  ])
  + " \n\n"
  + "{context}"
)

# Chat template: the system message carries the instructions and the
# {context} placeholder, the human message carries the user's {input}.
prompt = ChatPromptTemplate.from_messages([
  ("system", system_prompt),
  ("human", "{input}"),
])


In [None]:
# question_answer_chain = create_stuff_documents_chain(llm, prompt)
# rag_chain = create_retrieval_chain(retriever,question_answer_chain)

In [42]:
# First build the chain that feeds the retrieved documents and the prompt to
# the LLM, then put the retriever in front of it to form the full RAG chain.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(
  retriever=retriever,
  combine_docs_chain=question_answer_chain,
)

In [43]:
# response = rag_chain.invoke({"input" : "What is Acne?"})
# print(response["answer"])

In [46]:
# Run one question through the RAG chain and print only the generated answer.
user_question = "I used to smoke weed daily and now I want to detox myself .. what i need to do ?"
response = rag_chain.invoke({"input": user_question})
print(response["answer"])


 To detoxify from smoking weed, you should follow these steps:

1. Abstain from using weed completely for a period of time (usually several weeks) to allow the body to eliminate the remaining THC (the active ingredient in marijuana).
2. Eliminate exposure to all toxic substances, including cannabis smoke, chemicals, and other pollutants.
3. Maintain a healthy diet and exercise regularly to aid in the detoxification process and support overall health.
4. After detoxification, it's important to consider seeking professional help for addiction treatment and long-term recovery programs to prevent relapse.
